o
    wiP                     @   sl  d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dlm
Z
mZ d dlmZ d dlmZ z
d dlmZ eZW n eyV   d d	lmZ dZejjZed
 Y nw d dlmZ d dlmZ d dlmZmZ d dlm Z  d dl!m"Z" d dlm#Z# eG dd de#Z$G dd dZ%G dd dej&Z'dej(de)dej(fddZ*G dd de	Z+G dd deZ,dS )     N)	dataclass)CallableOptionalUnion)MegatronModule)einsumnn)ColumnParallelConv2dPatch)MultimodalProjectorConfig)TENorm)loggingzFailed to import Transformer Engine dependencies. `from megatron.core.extensions.transformer_engine import *`If using NeMo Run, this is expected. Otherwise, please verify the Transformer Engine installation.)VisionModule)	ModelType)
ModuleSpecbuild_module)TransformerBlock)TransformerConfig)CLIPViTConfigc                   @   sv  e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< ejjjZeed< dZeed< dZ eed < dZ!eed!< dZ"eed"< dZ#eed#< d$Z$eed%< d&Z%eed'< dZ&eed(< d)Z'eed*< dZ(eed+< d,Z)e*e+ ed-< d2d0d1Z,d,S )3Llama4VisionConfigz<Configuration class for the Llama4 Vision Transformer model.llama4vision_model_type   	patch_dimP  img_himg_w"   
num_layers   num_attention_headsnum_query_groupsX   kv_channelsTadd_bias_linearadd_qkv_biasi  hidden_sizeg        hidden_dropoutattention_dropouti   ffn_hidden_size   
output_dimFgated_linear_unitactivation_funclayernorm_zero_centered_gammaapply_query_key_layer_scalingbias_activation_fusionbias_dropout_fusionattention_softmax_in_fp32	LayerNormnormalizationgư>layernorm_epsilonapply_rope_fusiong      ?pixel_shuffle_ratiorotary_interleavedNtransformer_layer_specreturnLlama4ViTModelc                 C   sR   | j }t|tsddlm} |dd}t| || j| j| j| j	| j
| j| j| jd
S )z9Configures and returns an instance of the Llama4ViTModel.r   )get_layer_spec_teT)is_vit)ln_pre_implln_post_impladd_class_tokenclass_token_lenr   r   r   model_subtype)r8   
isinstancer    nemo.collections.vlm.layer_specsr;   r:   r=   r>   r?   r@   r   r   r   r   )selfr8   r;    rE   e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/llama4/model/vision.pyconfigure_modelT   s    

z"Llama4VisionConfig.configure_model)r9   r:   )-__name__
__module____qualname____doc__r   str__annotations__r   intr   r   r   r   r    r"   r#   boolr$   r%   r&   floatr'   r(   r*   r+   torchr   
functionalgelur,   r   r-   r.   r/   r0   r1   r3   r4   r5   r6   r7   r8   r   r   rG   rE   rE   rE   rF   r   3   s>   
 r   c                   @   s<   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdS )PackingIndexz=Defines constant indices for accessing packed token metadata.r                           N)rH   rI   rJ   rK   ZYXTIMEHEIGHTWIDTHIDX	BATCH_IDXNUM_METADATAID_CLS_TOKENID_PAD_TOKENrE   rE   rE   rF   rT   k   s    rT   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	PixelShufflea8  Performs pixel shuffle operation on encoded patches.

    Rearranges elements in a tensor of shape [B, N, C] representing encoded image patches
    by moving spatial dimensions from the channel dimension.

    Args:
        ps_ratio (float): The ratio for pixel shuffling (e.g., sqrt of downscaling factor).
    ps_ratioc                    s   t    || _dS )zyInitializes the PixelShuffle module.

        Args:
            ps_ratio (float): The ratio for pixel shuffling.
        N)super__init__rk   )rD   rk   	__class__rE   rF   rm      s   

zPixelShuffle.__init__xr9   c                 C   s   | j dus	J d| dksJ dtt|jd  }}||jd ||d}t|| j d}||jd d|jd }|S )	aE  Applies the pixel shuffle operation.

        Args:
            x (torch.Tensor): Input tensor of shape [B, N, C], where N is the number of patches.

        Returns:
            torch.Tensor: Tensor after pixel shuffle, shape [B, N', C'],
                          where N' = N * ps_ratio^2 and C' = C / ps_ratio^2.
        Nz&ps_ratio is required for pixel shufflerW   z0pixel shuffle requires encoded patches [B, N, C]rU   r   r^   )rk   )rk   dimrN   mathsqrtshapereshapepixel_shuffle_op)rD   rp   hhwwpixel_shuffle_patchesrE   rE   rF   forward   s   zPixelShuffle.forward)
rH   rI   rJ   rK   rP   rm   rQ   Tensorrz   __classcell__rE   rE   rn   rF   rj      s    		rj   input_xrk   r9   c              	   C   s   |   \}}}}| ||t|| t|| } | dddd } | |t|| t|| t|||  } | dddd } | S )a  Helper function to perform the core pixel shuffle logic.

    Args:
        input_x (torch.Tensor): Input tensor of shape [n, w, h, c].
        ps_ratio (float): Pixel shuffle ratio.

    Returns:
        torch.Tensor: Output tensor after pixel shuffle.
    r   rV   rU   rW   )sizeviewrN   permute
contiguous)r}   rk   nwhcrE   rE   rF   rv      s   
 

rv   c                       sR   e Zd ZdZ		ddededededef
 fd	d
Zde	j
de	j
fddZ  ZS )PixelShuffleMLPa  Applies pixel shuffle followed by an MLP projection.

    Takes encoded patches, performs pixel shuffling, and then projects
    them using a configurable MLP.

    Args:
        config (TransformerConfig): Megatron core transformer configuration.
        ps_ratio (float): Ratio for the pixel shuffle operation.
        input_dim (int): Input dimension before pixel shuffle. The dimension after
                         pixel shuffle (input to MLP) will be input_dim / (ps_ratio**2).
        output_dim (int): Output dimension of the MLP projection. Defaults to 4096.
        add_fc (bool): Whether to add an additional fully connected layer (Not Implemented).
                       Defaults to False.
    r)   Fconfigrk   	input_dimr*   add_fcc                    sP   t  | t|| _tdt||d  ||ddd| _| j | _|r&t	dS )z'Initializes the PixelShuffleMLP module.	mcore_mlprV   F)projector_type
input_sizer%   r(   biasr/   N)
rl   rm   rj   pixel_shuffler
   rN   
mlp_configrG   mlpNotImplementedError)rD   r   rk   r   r*   r   rn   rE   rF   rm      s   	
zPixelShuffleMLP.__init__encoded_patchesr9   c                 C   s"   |  |}| jjj}|| |S )zForward pass through pixel shuffle and MLP.

        Args:
            encoded_patches (torch.Tensor): Input encoded patches of shape [B, N, C_in].

        Returns:
            torch.Tensor: Output tensor after projection, shape [B, N', C_out].
        )r   r   encoderr,   )rD   r   r,   rE   rE   rF   rz      s   
	
zPixelShuffleMLP.forward)r)   F)rH   rI   rJ   rK   r   rP   rN   rO   rm   rQ   r{   rz   r|   rE   rE   rn   rF   r      s     r   c                       s  e Zd ZdZeeddddddfdeded	eeef d
eeef de	de
de
de
de
deddf fddZdejfddZd-de
de
dejfddZejjddddejd ejdejfd!d"Zd#ejddfd$d%Zd.d&ejd'eej dejfd(d)Zd*ejdejfd+d,Z  ZS )/r:   aD  LLama vision model.

    Args:
        transformer_config (TransformerConfig): Transformer config.
        transformer_layer_spec (ModuleSpec): Specifies module to use for transformer layers.
        ln_pre_impl (ModuleSpec or type): Specifies the layer norm type to use for ln_pre.
        add_class_token (bool, optional): Include a class token. Defaults to True.
        class_token_len (int): Class token length. Defaults to 1 but 8 may be faster.
        patch_dim (int): Image patch size.
        img_h (int): Input image height.
        img_w (int): Input image width.
    TrU   r   r   r   transformer_configr8   r=   r>   r?   r@   r   r   r   rA   r9   Nc                    s  t  j|d || _|j| _|| _|| _|	| _| j| j dks!J | j| j dks+J | j| j | _| j| j | _	| j| j	 | _
|| _|| _| j
| jrN| jnd | _t||| j|jd| _t||| j|jd| _t|d| j| j| jdd| _t| jdd | _tj| j| j| _|| _| jr| jd	 }tj|td| j| j | _tj| _ t!||d
ddd| _"t#||j$|j|j%d| _&|j%| _%| ' | _(dS )a  Initializes the Llama4 Vision Transformer model.

        Args:
            transformer_config (TransformerConfig): Transformer configuration object.
            transformer_layer_spec (ModuleSpec): Specification for the transformer layer module.
            ln_pre_impl (Union[ModuleSpec, type]): Implementation for pre-layer normalization.
                                                    Defaults to NORM_IMPL.
            ln_post_impl (Union[ModuleSpec, type]): Implementation for post-layer normalization.
                                                     Defaults to NORM_IMPL.
            add_class_token (bool): Whether to prepend a class token to the patch sequence.
                                    Defaults to True.
            class_token_len (int): Length of the class token sequence. Defaults to 1.
            patch_dim (int): Dimension of the square patches the image is divided into.
                             Defaults to 14.
            img_h (int): Height of the input images. Defaults to 336.
            img_w (int): Width of the input images. Defaults to 336.
            model_subtype (str): Subtype identifier for the model. Defaults to "llama4".
        )r   r   )r   r%   epsrW   F)r   in_channelsout_channelskernel_sizestrider   rU   r^   g      T)r   specpre_processpost_processvp_stage)r   rk   r   r*   N))rl   rm   r@   r%   visual_hidden_sizer   r   r   num_patches_per_dim_hnum_patches_per_dim_wnum_patchesr?   
seq_lengthr   r4   ln_preln_postr	   conv1rQ   arangeexpandcudaposition_idsr   	Embeddingposition_embeddings	Parameterrandnclass_tokenr   encoder_or_decoder
model_typer   decoderr   r6   r*   adapterget_rope_embrotary_pos_emb)rD   r   r8   r=   r>   r?   r@   r   r   r   rA   scalern   rE   rF   rm     st   	
zLlama4ViTModel.__init__c                 C   sD  | j  }}| j| | j| }}tj| j| j ||  tjd}||| d}tj||dd gdd}tj	|d< tj
|jd |jd tjd tjd}|| |ddddtjf< || |ddddtjf< |ddddtjf | |ddddtjf | ||ddddtjf< |ddtjd }| | jj| jj d }| ||ddddtjf d }| ||ddddtjf d }	tj||	gdd  d	dddf }
|
|ddddtjdf dk d}
|
d}
tj|
dd|
ddfdd|
jd d}|ddddddf }| S )
a  Computes the Rotary Position Embedding (RoPE) based on image patch coordinates.

        Generates 2D RoPE embeddings using the packed image index metadata. The embeddings
        are computed separately for X and Y coordinates and concatenated. RoPE is disabled
        for padding and CLS tokens.

        Returns:
            torch.Tensor: The computed RoPE tensor of shape [seq_length, 1, 1, dim],
                          ready to be applied in the attention mechanism.
        )dtyperU   Nr   rq   )r^   r^   r^   rV   .)r   r   r   rQ   r   int32ru   catrT   rh   emptyrt   rg   r`   ra   rc   fill_rd   re   get_rope_freqsr   r%   r   compute_rope_freqsrP   r   masked_fillsqueezestackr   r   )rD   patch_hpatch_widx_hidx_wimg_idxpacked_img_idx	rope_freqfreqs_xfreqs_yfreqsr   rE   rE   rF   r   m  s6   
 
$$(&
0zLlama4ViTModel.get_rope_emb'  rq   thetac                 C   s.   d|t d|dd|d   |   }|S )ab  Calculates the base frequencies for RoPE.

        Args:
            dim (int): The dimension of the embeddings for which RoPE is calculated (usually head_dim // 2).
            theta (int): The base period for the sinusoidal embeddings. Defaults to 10000.

        Returns:
            torch.Tensor: A 1D tensor containing the RoPE frequencies.
        g      ?r   rV   N)rQ   r   rP   )rD   rq   r   r   rE   rE   rF   r     s   *
zLlama4ViTModel.get_rope_freqsr   F)enabledr   tc                 C   s&   t d||j|}|jddd}|S )a  Computes RoPE frequencies for given positions `t`.

        Applies the base frequencies to the position indices and interleaves them.

        Args:
            freqs (torch.Tensor): Base RoPE frequencies (output of get_rope_freqs).
            t (torch.Tensor): Tensor containing position indices (e.g., X or Y coordinates).

        Returns:
            torch.Tensor: RoPE frequencies corresponding to the input positions `t`.
        z..., f -> ... frV   r^   r   )r   typer   repeat_interleave)rD   r   r   rE   rE   rF   r     s   z!Llama4ViTModel.compute_rope_freqsinput_tensorc                 C   s   | j | dS )zSets input tensor to the model's decoder block.

        Args:
            input_tensor (torch.Tensor): Input tensor to set.
        N)r   set_input_tensor)rD   r   rE   rE   rF   r     s   zLlama4ViTModel.set_input_tensorrp   attention_maskc                 C   s   |  |}| jr| j|jd dd}tj||gdd}|jd | jks1J |jd  d| j || | j	 }| j
rA| 
|}|ddd}| }| j||| jd}|ddd}| }| jrh| |}|S )a  Forward function of the ViT Model. This function passes the input tensors
        through the embedding layer and then the transformer.

        Args:
            x (torch.Tensor): input data of shape [batch, img_h, img_w]
            attention_mask (torch.Tensor with dtype=bool): Attention mask to use.

        Returns:
            x (torch.Tensor): output after final transformer block of shape [b, s, h].
        r   r^   rU   r   z != rV   )r   )r   r?   r   r   rt   rQ   r   r   r   r   r   r   r   r   r   r   )rD   rp   r   r   rE   rE   rF   _encode  s(   
*

zLlama4ViTModel._encodeimagesc                 C   s2   |  |}|ddddddf }| |}|S )a  Processes input images through the ViT encoder and adapter.

        Args:
            images (torch.Tensor): Input image tensor of shape [batch, channels, height, width].

        Returns:
            torch.Tensor: Projected embeddings after passing through the encoder and adapter,
                          typically of shape [batch, num_output_patches, output_dim].
                          The CLS token output (if used) is removed before the adapter.
        Nr^   )r   r   )rD   r   	embeddingprojected_embeddingrE   rE   rF   rz     s   

zLlama4ViTModel.forward)r   )N)rH   rI   rJ   rK   	NORM_IMPLr   r   r   r   rO   rN   rL   rm   rQ   r{   r   r   ampautocastr   r   r   r   rz   r|   rE   rE   rn   rF   r:      sX    

	
k."&r:   )-rr   dataclassesr   typingr   r   r   rQ    megatron.core.transformer.moduler   r   r   (nemo.collections.vlm.mllama.model.visionr	    nemo.collections.vlm.vision.baser
   +megatron.core.extensions.transformer_enginer   r   ImportError
nemo.utilsr   r2   warning7megatron.core.models.common.vision_module.vision_moduler   megatron.core.transformer.enumsr   $megatron.core.transformer.spec_utilsr   r   +megatron.core.transformer.transformer_blockr   ,megatron.core.transformer.transformer_configr   r   r   rT   Modulerj   r{   rP   rv   r   r:   rE   rE   rE   rF   <module>   s@   7'5