o
    eiU                     @   sT  d Z ddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
 eeZdZeG d	d
 d
Z	ddejdB dejeB eB dejdededB f
ddZ	ddejdB dejeB eB dejdededB f
ddZddejdejdedB fddZddejdejdedB fddZ		d dejeB eB dejdejdededB dejdB fddZdS )!a  
IMPORTANT NOTICE: Every class and function in this file is deprecated in favor of using the much more general
`masking_utils.py` primitives. New code should not rely on it, it is only kept for backward compatibility for now,
and will be removed in the future.
    )	dataclass)UnionN   )logging)is_torchdynamo_compiling
is_tracingzThe attention mask API under `transformers.modeling_attn_mask_utils` (`AttentionMaskConverter`) is deprecated and will be removed in Transformers v5.10. Please use the new API in `transformers.masking_utils`.c                   @   sD  e Zd ZU dZeed< eed< d'dededB fddZ	d(ded	ed
edej	de
ejdf dejdB fddZ	d'dejd	edej	d
edB dejf
ddZe		d)dejdej	dejdededB f
ddZed'dejdej	dedB fddZedejdefdd Ze		!d*d"ejdB d#ejdededB d$edefd%d&ZdS )+AttentionMaskConvertera9  
    A utility attention mask class that allows one to:
        - Create a causal 4d mask
        - Create a causal 4d mask with slided window
        - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length,
          key_value_length) that can be multiplied with attention scores

    Examples:

    ```python
    >>> import torch
    >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter

    >>> converter = AttentionMaskConverter(True)
    >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32)
    tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00,  0.0000e+00]]]])
    ```

    Parameters:
        is_causal (`bool`):
            Whether the attention mask should be a uni-directional (causal) or bi-directional mask.

        sliding_window (`int`, *optional*):
            Optionally, the sliding window masks can be created if `sliding_window` is defined to a positive integer.
    	is_causalsliding_windowNc                 C   sF   t tt || _|| _| jd ur| jdkr!td| j dd S d S )Nr   zaMake sure that when passing `sliding_window` that its value is a strictly positive integer, not ``)loggerwarning_onceDEPRECATION_MESSAGEFutureWarningr	   r
   
ValueError)selfr	   r
    r   c/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/modeling_attn_mask_utils.py__init__I   s   zAttentionMaskConverter.__init__cpu
batch_sizequery_lengthkey_value_lengthdtypedevicestrreturnc           	      C   s\   | j std| j d||f}|| }d}|d dks!| jdur,| j||||| jd}|S )z
        Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative
        bias to upper right hand triangular matrix (causal mask).
        z"Please use `to_causal_4d` only if z has `is_causal` set to True.Nr   r   past_key_values_lengthr
   )r	   r   	__class__r
   _make_causal_mask)	r   r   r   r   r   r   input_shaper   causal_4d_maskr   r   r   to_causal_4dT   s   z#AttentionMaskConverter.to_causal_4dattention_mask_2dc           
      C   s   |j d |f}d}|d dks| jdur0| jr0|du rtd|| }| j|||j|| jd}n	| jdur9td| j|||d d|j}|durW|	|
 t|j}|}	|	S )	a  
        Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
        key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
        causal, a causal mask will be added.
        r   Nr   r   zpThis attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask.r   z?Sliding window is currently only implemented for causal masking)tgt_len)shaper
   r	   r   r!   r   NotImplementedError_expand_masktomasked_fillbooltorchfinfomin)
r   r%   r   r   r   r"   r#   r   expanded_attn_maskexpanded_4d_maskr   r   r   to_4du   s0   
zAttentionMaskConverter.to_4dr   input_ids_shaper   c                 C   s  t tt | \}}tj||ft|j|d}tj|	d|d}|
||d |	ddk d ||}|dkrLtjtj||||d|gdd}|durt|| d }	tjtj|tjd|	d	}
t rj| }|
|
t|j |ddddddf |d||| S )
zJ
        Make causal mask used for bi-directional self-attention.
        )r   r   r   r   r   r   )dimNr   )diagonal)r   r   r   r   r-   fullr.   r/   arangesizemasked_fill_viewr*   catzerostril	ones_liker,   r   cloneexpand)r3   r   r   r   r
   bszr&   mask	mask_condr7   context_maskr   r   r   r!      s   "
 (z(AttentionMaskConverter._make_causal_maskrD   r&   c                 C   s   t tt |  \}}|dur|n|}| ddddddf |d|||}tjd|d| }|	|tj
t|jS )zg
        Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
        Nr         ?r6   )r   r   r   r   r:   rB   r*   r-   tensorr+   r,   r.   r/   )rD   r   r&   rC   src_lenexpanded_maskinverted_maskr   r   r   r)      s   *z#AttentionMaskConverter._expand_maskrJ   	min_dtypec                 C   s<   t tt | jtjkrtd| tj	| |kddd S )a  
        Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
        using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
        Details: https://github.com/pytorch/pytorch/issues/110213

        `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
        `attention_mask` is [bsz, src_seq_len].

        The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias.

        For example, if `expanded_mask` is (e.g. here left-padding case)
        ```
        [[[[0, 0, 0],
           [0, 0, 0],
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[0, 0, 0],
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        then the modified `expanded_mask` will be
        ```
        [[[[1, 1, 1],   <-- modified
           [1, 1, 1],   <-- modified
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[1, 1, 1],   <-- modified
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        z\AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor.r   T)r5   keepdim)
r   r   r   r   r   r-   r,   r   mulall)rJ   rL   r   r   r   _unmask_unattended   s   )z)AttentionMaskConverter._unmask_unattendedFattention_maskinputs_embedsis_trainingc           
      C   s   t tt |jd |jd }}|| }t|}d}	| du r7|s#|s5|dks+||kr5|du s3||k r5d}	|	S |du s?||k r[t| jdkrHdS |s[t| dkr[|dksY||kr[d}	|	S )a9  
        Detects whether the optional user-specified attention_mask & the automatically created causal mask can be
        ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument.

        In case no token is masked in the `attention_mask` argument, if `query_length == 1` or
        `key_value_length == query_length`, we rather rely on SDPA `is_causal` argument to use causal/non-causal masks,
        allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is
        passed).
        r   r   FNT   )	r   r   r   r   r'   r   lenr-   rO   )
rQ   rR   r   r
   rS   _r   r   is_tracing_ignore_causal_maskr   r   r   _ignore_causal_mask_sdpa  s*   z/AttentionMaskConverter._ignore_causal_mask_sdpaN)r   r   N)NF)__name__
__module____qualname____doc__r,   __annotations__intr   r-   r   r   r   Tensorr$   r2   staticmethodSizer!   r)   FloatTensorfloatrP   rY   r   r   r   r   r   &   s   
 
&
/#"2r   rQ   r"   rR   r   r
   c           	      C   s   t d|d}|d | }| dur%t| jdkr%|j| |d ||jd} | S | durdt| jdkrd|d d	|d	 |f}t| j|krOtd
t| j d| dd|  }||t	j
t	|jj} | S |j|d |d ||j|jd} | S )a  
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        attention_mask (`torch.Tensor` or `None`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        inputs_embeds (`torch.Tensor`):
            The embedded inputs as a torch Tensor.
        past_key_values_length (`int`):
            The length of the key value cache.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    Tr	   r
   r   N   )r   r   rT   r   r   z#Incorrect 4D attention_mask shape: z; expected: .rG   r4   )r   rU   r'   r2   r   tupler   r+   r*   r-   r,   r.   r/   r$   r   )	rQ   r"   rR   r   r
   attn_mask_converterr   expected_shaperK   r   r   r   !_prepare_4d_causal_attention_maskG  s,   rm   c           
      C   s   t d|d}|d | }t|}t j| |||d}|rd}	|	S | du r5|j|d |d ||j|jd}	|	S |  dkr>| }	n|j| |d |j|d	}	|s^|	jjd
v r^t j	|	t
|jjd}	|	S )a  
    Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`.

    In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases `query_length == 1` and
    `key_value_length == query_length`, and rely instead on SDPA `is_causal` argument to use causal/non-causal masks,
    allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed).
    Trg   r   )rQ   rR   r   r
   Nr   r4   rT   )r   r   )cudaxpu)rL   )r   r   rY   r$   r   r   r5   r2   typerP   r-   r.   r/   )
rQ   r"   rR   r   r
   rk   r   rW   rX   r1   r   r   r   *_prepare_4d_causal_attention_mask_for_sdpa|  s<   
rq   rD   r   r&   c                 C   s   t j| ||dS )  
    Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        tgt_len (`int`):
            The target length or query length the created mask shall have.
    rD   r   r&   )r   r)   rs   r   r   r   _prepare_4d_attention_mask  s   rt   c                 C   sP   t tt | j\}}|dur|n|}t| s t| dkr dS tj	| ||dS )rr   Nr   rs   )
r   r   r   r   r'   r   r-   rO   r   r)   )rD   r   r&   rV   r   r   r   r   #_prepare_4d_attention_mask_for_sdpa  s   
ru   r   r   c                 C   s8   t d|d}|| d  }|j| d | d |||d}|S )a/  
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)`

    Args:
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        device (`int`):
            The torch device the created mask shall have.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    Trg   r   r   r4   )r   r$   )r"   r   r   r   r
   rk   r   rQ   r   r   r    _create_4d_causal_attention_mask  s   rv   rZ   r[   )r_   dataclassesr   typingr   r-   utilsr   utils.import_utilsr   r   
get_loggerr\   r   r   r   rb   rd   rj   listra   rm   rq   r   rt   ru   r   rv   r   r   r   r   <module>   sl   
  '
:
 : 