o
    wiF                     @   s   d dl mZ d dlZejfdejdejdededejdejfd	d
ZdejdefddZ	dedejdej
dejdejdeejejf fddZddejdedejfddZdS )    )TupleNmasks
num_chunkstotal_length
max_chunksdevicereturnc                 C   s   d}t | }tdd | D }tj||||f|||d}	tt| |D ]@\}
\}}tt||D ]2\}\}}t |dkrbt|d ||d< |d dkrO||d< |	|
|d |d |d	|f d
 q0q#|	S )a  
    Pads the provided masks to a uniform shape for batching.

    Args:
        masks (torch.Tensor): List of tensors containing attention masks for each batch.
        num_chunks (torch.Tensor): Tensor containing the number of chunks for each mask.
        total_length (int): Total sequence length for padding.
        max_chunks (int): Maximum number of chunks to pad each mask to.
        device (torch.device): Device to place the output tensor on.
        dtype (torch.dtype): Data type for the output tensor. Default is `torch.bfloat16`.

    Returns:
        torch.Tensor: A padded tensor of shape [B, total_length, max_num_media, max_chunks]
        where `B` is the batch size.
          ?c                 S   s   g | ]}t |qS  )len).0mr
   r
   d/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/mllama/model/utils.py
<listcomp>.   s    z(_pad_attention_masks.<locals>.<listcomp>)dtyper         r   Ng        )r   maxtorchfull	enumeratezipminfill_)r   r   r   r   r   r   
mask_value
batch_sizemax_num_mediapadded_masksidx
mask_groupchunks	media_idxmaskchunk_countr
   r
   r   _pad_attention_masks   s&   
&r%   attention_biasr   c                 C   s   | |kj dd| d S )a  
    Determines whether each row in the attention bias tensor contains masked values.

    Args:
        attention_bias (torch.Tensor): A 4D tensor of shape [B, H, S1, S2], where:
            - B: Batch size.
            - H: Number of attention heads.
            - S1: Length of the first sequence.
            - S2: Length of the second sequence.
        mask_value (float): The value used to represent masked positions in `attention_bias`.

    Returns:
        torch.Tensor: A 4D tensor of shape [B, H, S1, 1], containing boolean values (as a tensor)
        indicating if each row in the last dimension is fully masked (0 if fully masked, 1 otherwise).
    r   dim).N)anytype_as)r&   r   r
   r
   r   _get_full_row_masked_out_maskB   s   r+   text_token_counttext_device
text_dtypevision_tokenscross_attention_masksc                 C   s   |dusJ d|j d }|j d |j d ks#J d|j  d|j  |j d |j d ks9J d|j  d|j  | |j d ksMJ d	|  d
|j d  |j \}}}}	||| dd}t|dd}
|j|dd}||
9 }|j||d|
j||dfS )a  
    Generates a cross-attention mask for aligning text and vision tokens.

    Args:
        text_token_count (int): Number of tokens in the text sequence.
        text_device (torch.device): Device to place the output tensor on.
        text_dtype (torch.dtype): Data type for the output tensor.
        vision_tokens (torch.Tensor): Vision tokens tensor of shape [B, I, T, D] where:
            - B: Batch size.
            - I: Number of images.
            - T: Number of image tokens per image.
            - D: Dimension of each image token.
        cross_attention_masks (torch.Tensor): Cross attention masks of shape [B, N, I, C], where:
            - B: Batch size.
            - N: Number of text tokens.
            - I: Number of images.
            - C: Number of chunks.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
            - The adjusted cross-attention masks of shape [B, 1, N, I * T].
            - The full row mask status tensor of shape [B, 1, N, 1].
    NzVision tokens must be provided   r   r   zAMismatch in number of images given and number of masks provided: z vs z:Mismatch between vision tokens and cross-attention masks: zText sequence length z, does not match cross-attention mask length r   r	   )r   r'   )r   r   )shapeview	unsqueezer+   repeat_interleaveto)r,   r-   r.   r/   r0   vision_token_lengthr   _
num_imagesr   full_row_mask_statusr
   r
   r   _generate_cross_attention_maskX   s&   
r;     tokensvision_token_idc                 C   s   | |kj dd }| dkrtjddtjdS g }| dkr-|| t| g n)t	t|d D ]}|||  ||d   g q5||d  t| g |d d }t
|D ]}|d |d d krp||d< |d }q`tj|tjdS )a-  
    Create a vision mask from a tensor of tokens and a vision token ID.

    Args:
        tokens (torch.Tensor): A 1D tensor of token IDs.
        vision_token_id (int): The ID of the vision token.

    Returns:
        torch.Tensor: A tensor containing vision masks in the format [start, end].
    F)as_tupler   r   r   )r   r   )nonzerosqueezenumelr   emptylongappenditemr   rangereversedtensor)r=   r>   vision_token_locationsvision_masksilast_mask_endvision_maskr
   r
   r   create_vision_mask_tensor   s   $
rO   )r<   )typingr   r   bfloat16Tensorintr   r%   floatr+   r   r;   rO   r
   r
   r
   r   <module>   sF   	
-

 7