import collections
import copy
import types
from contextlib import nullcontext
from typing import TYPE_CHECKING, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from megatron.core import InferenceParams, parallel_state, tensor_parallel
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules

try:
    from megatron.core.transformer.custom_layers.transformer_engine import (
        TEColumnParallelLinear,
        TEDotProductAttention,
        TERowParallelLinear,
    )
except ImportError:
    from nemo.utils import logging

    # Defaults so the module can still be imported without Transformer Engine.
    TEColumnParallelLinear = None
    TENorm = None
    TERowParallelLinear = None
    logging.warning(
        "Failed to import Transformer Engine dependencies. "
        "`from megatron.core.transformer.custom_layers.transformer_engine import *`"
        "If using NeMo Run, this is expected. Otherwise, please verify the Transformer Engine installation."
    )

from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_block import TransformerBlock
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
from megatron.core.utils import make_viewless_tensor
from torch import Tensor, nn

if TYPE_CHECKING:
    from nemo.collections.vlm import CrossAttentionVisionConfig

try:
    from megatron.core.transformer.custom_layers.transformer_engine import TEDelayedScaling, TENorm

    HAVE_TE = True
    LayerNormImpl = TENorm
except ImportError:
    from megatron.core.transformer.torch_layer_norm import WrappedTorchLayerNorm

    HAVE_TE = False
    LayerNormImpl = WrappedTorchLayerNorm


def to_2tuple(x):
    """
    Convert an input to a 2-tuple.
    """
    if isinstance(x, collections.abc.Iterable):
        return x
    return (x, x)


def build_encoder_attention_mask(
    x: torch.Tensor, ar_ids: torch.Tensor, ntok: int, num_chunks: int, supported_aspect_ratios: List[List[int]]
):
    """
    Build attention masks for a vision encoder to handle padding and token alignment.

    Args:
        x (torch.Tensor): Input tensor of shape (batch_size, sequence_length).
        ar_ids (torch.Tensor): Aspect ratio IDs for masking.
        ntok (int): Number of tokens.
        num_chunks (int): Number of chunks in the data.
        supported_aspect_ratios (List[List[int]]): List of supported aspect ratios.

    Returns:
        torch.Tensor: Tensor containing the attention mask.
    """
    masks = []
    dtype = x.dtype
    for ar_id in ar_ids:
        arx = supported_aspect_ratios[ar_id - 1]
        # One row per chunk, one column per (padded) token of that chunk; 1 marks padding.
        mask_i = torch.ones((num_chunks, x.shape[1] // num_chunks), device=x.device)
        mask_i[: arx[0] * arx[1], :ntok] = 0
        mask_i = mask_i.view(x.shape[1], -1)
        mask_i = mask_i @ mask_i.T
        mask_i = mask_i.unsqueeze(0)
        masks.append(mask_i)
    masks = torch.stack(masks).to(dtype) * torch.finfo(dtype).min
    return masks
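

# Illustrative sketch (not part of the original module): the shapes involved when building the
# additive mask for four image chunks. 1601 tokens (a 40x40 patch grid plus the class token)
# padded to 1608, batch of 2, hidden size 1280 -- all of these numbers are assumptions here.
def _example_build_encoder_attention_mask():
    ntok, num_chunks = 1601, 4
    x = torch.zeros(2, num_chunks * 1608, 1280)  # (batch, padded sequence, hidden)
    ar_ids = torch.tensor([1, 2])  # 1-based aspect-ratio ids
    supported_aspect_ratios = [[1, 1], [1, 2], [2, 1], [2, 2]]
    mask = build_encoder_attention_mask(x, ar_ids, ntok, num_chunks, supported_aspect_ratios)
    return mask.shape  # (2, 1, num_chunks * 1608, num_chunks * 1608)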


def get_image_transformer_layer_spec() -> ModuleSpec:
    """
    Create a specification for an image transformer layer.
    """
    image_transformer_submodules = TransformerLayerSubmodules(
        input_layernorm=TENorm,
        self_attention=ModuleSpec(
            module=SelfAttentionNoBias,
            params={"attn_mask_type": AttnMaskType.no_mask},
            submodules=SelfAttentionSubmodules(
                linear_qkv=TEColumnParallelLinear,
                core_attention=TEDotProductAttention,
                linear_proj=TERowParallelLinear,
                q_layernorm=IdentityOp,
                k_layernorm=IdentityOp,
            ),
        ),
        self_attn_bda=get_bias_dropout_add,
        pre_mlp_layernorm=TENorm,
        mlp=ModuleSpec(
            module=MLP,
            submodules=MLPSubmodules(
                linear_fc1=TEColumnParallelLinear,
                linear_fc2=TERowParallelLinear,
            ),
        ),
        mlp_bda=get_bias_dropout_add,
    )
    return ModuleSpec(module=ImageTransformerLayer, submodules=image_transformer_submodules)
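

# Illustrative sketch (not part of the original module): what the spec above resolves to.
# Actually instantiating layers from it additionally needs an initialized Megatron parallel
# state and a config (in practice a CrossAttentionVisionConfig) that defines `gated`.
def _example_inspect_image_transformer_layer_spec():
    layer_spec = get_image_transformer_layer_spec()
    assert layer_spec.module is ImageTransformerLayer
    assert layer_spec.submodules.self_attention.module is SelfAttentionNoBias
    assert layer_spec.submodules.self_attention.params == {"attn_mask_type": AttnMaskType.no_mask}
    return layer_spec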


def forward_with_return_intermediate(
    self,
    hidden_states: Tensor,
    attention_mask: Tensor,
    context: Tensor = None,
    context_mask: Tensor = None,
    rotary_pos_emb: Tensor = None,
    attention_bias: Tensor = None,
    inference_params: InferenceParams = None,
    packed_seq_params: PackedSeqParams = None,
    return_intermediate: List[int] = None,
):
    """
    Perform a forward pass through the transformer layers with optional intermediate outputs.
    Override regular MCore transformer layer forward pass.
    """
    if not self.pre_process:
        # See set_input_tensor().
        hidden_states = self.input_tensor

    hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True)

    if self.config.sequence_parallel:
        rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
    else:
        rng_context = nullcontext()

    if self.config.fp8:
        import transformer_engine  # Keep the TE dependency out when not training in FP8.

        if self.config.fp8 == "e4m3":
            fp8_format = transformer_engine.common.recipe.Format.E4M3
        elif self.config.fp8 == "hybrid":
            fp8_format = transformer_engine.common.recipe.Format.HYBRID
        else:
            raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.")

        fp8_recipe = TEDelayedScaling(
            config=self.config,
            fp8_format=fp8_format,
            override_linear_precision=(False, False, not self.config.fp8_wgrad),
        )
        fp8_group = None
        if parallel_state.model_parallel_is_initialized():
            fp8_group = parallel_state.get_amax_reduction_group(with_context_parallel=True)
        fp8_context = transformer_engine.pytorch.fp8_autocast(
            enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group
        )
    else:
        fp8_context = nullcontext()

    with rng_context, fp8_context:
        if self.config.recompute_granularity == 'full' and self.training:
            assert return_intermediate is None, (
                "Config `return_intermediate` cannot be used with `recompute_granularity='full'`. "
            )
            hidden_states = self._checkpointed_forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                context=context,
                context_mask=context_mask,
                rotary_pos_emb=rotary_pos_emb,
                attention_bias=attention_bias,
                packed_seq_params=packed_seq_params,
            )
        else:
            intermediate_hidden_states = []
            for l_no, layer in enumerate(self.layers):
                if return_intermediate is not None and l_no in return_intermediate:
                    intermediate_hidden_states.append(hidden_states)

                with self.offload_context:
                    # (CUDA-graph replay handling from the stock MCore block is omitted in
                    # this reconstruction; the eager path below is the recoverable logic.)
                    hidden_states, context = layer(
                        hidden_states=hidden_states,
                        attention_mask=attention_mask,
                        context=context,
                        context_mask=context_mask,
                        rotary_pos_emb=rotary_pos_emb,
                        attention_bias=attention_bias,
                        inference_params=inference_params,
                        packed_seq_params=packed_seq_params,
                    )

                if (
                    torch.is_grad_enabled()
                    and self.config.cpu_offloading
                    and self.group_prefetch_offload_commit_async is not None
                ):
                    hidden_states = self.group_prefetch_offload_commit_async(hidden_states)

        # Final layer norm.
        if self.final_layernorm is not None:
            hidden_states = self.final_layernorm(hidden_states)
            # Ensure the output is not a view held inside this block.
            hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True)

        if return_intermediate is not None:
            return hidden_states, torch.stack(intermediate_hidden_states, dim=-1)
        return hidden_states
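

# Usage note (not part of the original module): VisionEncoder below rebinds this function onto
# its local TransformerBlock, e.g.
#     block.forward = types.MethodType(forward_with_return_intermediate, block)
# so that a call such as block(..., return_intermediate=[3, 7, 15, 23, 30]) returns
# (final_hidden_states, stacked_intermediate_hidden_states) instead of a single tensor.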


class ColumnParallelConv2dPatch(MegatronModule):
    """
    Conv2D Patching layer with model parallelism. Applies convolution in a column-parallel fashion.

    Args:
        config (TransformerConfig): Configuration object for the layer.
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (Union[int, Tuple[int, int]]): Size of the convolution kernel.
        stride (Union[int, Tuple[int, int]]): Stride of the convolution.
        bias (Optional[bool], default=False): Whether to include a bias term.

    Input:
        torch.Tensor: Input tensor of shape (batch_size, in_channels, width, height).

    Output:
        torch.Tensor: Output tensor of shape (batch_size, num_tokens, out_channels).
    """

    def __init__(
        self,
        config: TransformerConfig,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        stride: Union[int, Tuple[int, int]],
        bias: Optional[bool] = False,
    ) -> None:
        super().__init__(config=config)
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        self._unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=stride)
        self._linear = TEColumnParallelLinear(
            in_channels * kernel_size[0] * kernel_size[1],
            out_channels,
            config=self.config,
            init_method=self.config.init_method,
            gather_output=False,
            bias=bias,
            skip_bias_add=False,
            is_expert=False,
            tp_comm_buffer_name='conv1',
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward."""
        x = self._unfold(x)
        x = x.permute(0, 2, 1)
        x = F.linear(x, self._linear.weight)
        x = tensor_parallel.gather_from_tensor_model_parallel_region(x)
        return x
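

# Illustrative sketch (not part of the original module): for a 560x560 image with a 14x14
# patch and stride, unfolding yields (560 / 14) ** 2 = 1600 patches of 3 * 14 * 14 = 588
# values each, which the column-parallel linear then projects to `out_channels`.
def _example_patch_unfold_shapes():
    images = torch.zeros(2, 3, 560, 560)
    patches = torch.nn.Unfold(kernel_size=(14, 14), stride=(14, 14))(images)
    return patches.permute(0, 2, 1).shape  # (2, 1600, 588)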


class PrecomputedTilePositionEmbedding(torch.nn.Module):
    """
    Module to compute positional embeddings for tiles with optional gating.

    Args:
        config (TransformerConfig): Configuration object.
        gated (bool, default=False): Whether to apply gating to the embeddings.
    """

    def __init__(
        self,
        config: TransformerConfig,
        gated: bool = False,
    ):
        super().__init__()
        self.max_num_tiles = config.max_num_tiles
        self.hidden_size = config.hidden_size
        self.max_aspect_ratio_id = config.max_aspect_ratio_id

        self.embedding = nn.Embedding(self.max_aspect_ratio_id + 1, self.max_num_tiles * self.hidden_size)
        self.gated = gated
        if gated:
            self.gate = nn.Parameter(torch.zeros(1))

    def forward(self, hidden_states: torch.Tensor, aspect_ratio_ids: torch.Tensor) -> torch.Tensor:
        """Forward."""
        embeddings = self.embedding(aspect_ratio_ids)
        embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, self.hidden_size)

        if self.gated:
            embeddings = embeddings * self.gate.tanh()

        hidden_states = hidden_states + embeddings
        return hidden_states
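

# Illustrative sketch (not part of the original module): the embedding is looked up per
# aspect-ratio id and broadcast over the tokens of every tile. The config stand-in and all
# sizes below are placeholders, not the real model configuration.
def _example_tile_position_embedding():
    class _Cfg:
        max_num_tiles = 4
        hidden_size = 32
        max_aspect_ratio_id = 8

    module = PrecomputedTilePositionEmbedding(config=_Cfg(), gated=True)
    hidden_states = torch.zeros(2, 4, 10, 32)  # (batch, tiles, tokens per tile, hidden)
    aspect_ratio_ids = torch.tensor([1, 3])
    return module(hidden_states, aspect_ratio_ids).shape  # (2, 4, 10, 32)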


class SelfAttentionNoBias(SelfAttention):
    """
    Self-attention layer implementation without bias.

    Args:
        config (TransformerConfig): Configuration for the transformer.
        submodules (SelfAttentionSubmodules): Submodules required for self-attention.
        layer_number (int): The layer number in the transformer stack.
        attn_mask_type (AttnMaskType): Type of attention mask to apply.
    """

    def __init__(
        self,
        config: TransformerConfig,
        submodules: SelfAttentionSubmodules,
        layer_number: int,
        attn_mask_type=AttnMaskType.padding,
        **kwargs,
    ):
        super().__init__(
            config=config,
            submodules=submodules,
            layer_number=layer_number,
            attn_mask_type=attn_mask_type,
            **kwargs,
        )

        # Rebuild the QKV and output projections with bias disabled.
        self.linear_qkv = build_module(
            submodules.linear_qkv,
            self.config.hidden_size,
            self.query_projection_size + 2 * self.kv_projection_size,
            config=self.config,
            init_method=self.config.init_method,
            gather_output=False,
            bias=False,
            skip_bias_add=False,
            is_expert=False,
            tp_comm_buffer_name='qkv',
        )
        self.linear_proj = build_module(
            submodules.linear_proj,
            self.query_projection_size,
            self.config.hidden_size,
            config=self.config,
            init_method=self.config.output_layer_init_method,
            bias=False,
            input_is_parallel=True,
            skip_bias_add=True,
            is_expert=False,
            tp_comm_buffer_name='proj',
        )


class ImageTransformerLayer(TransformerLayer):
    """
    Transformer layer adapted for processing image data with optional gating.

    Args:
        config (TransformerConfig): Transformer configuration object.
        submodules (TransformerLayerSubmodules): Submodules to use in the layer.
        layer_number (int, default=1): Layer number in the transformer.
        hidden_dropout (float, optional): Dropout rate for hidden layers.
    """

    def __init__(
        self,
        config: TransformerConfig,
        submodules: TransformerLayerSubmodules,
        layer_number: int = 1,
        hidden_dropout: float = None,
    ):
        super().__init__(
            config=config,
            submodules=submodules,
            layer_number=layer_number,
            hidden_dropout=hidden_dropout,
        )
        self.gated = self.config.gated
        if self.gated:
            self.gate_attn = nn.Parameter(torch.zeros(1, dtype=self.config.params_dtype))
            self.gate_ffn = nn.Parameter(torch.zeros(1, dtype=self.config.params_dtype))

    def forward(
        self,
        hidden_states,
        attention_mask,
        context=None,
        context_mask=None,
        rotary_pos_emb=None,
        attention_bias=None,
        inference_params=None,
        packed_seq_params=None,
    ):
        """Forward."""
        # Residual connection around self-attention, with an optional learned (tanh) gate.
        residual = hidden_states
        input_layernorm_output = self.input_layernorm(hidden_states)

        attention_output_with_bias = self.self_attention(
            input_layernorm_output,
            attention_mask=attention_mask,
            inference_params=inference_params,
            rotary_pos_emb=rotary_pos_emb,
            attention_bias=attention_bias,
            packed_seq_params=packed_seq_params,
        )

        _gate_attn = 1 if not self.gated else self.gate_attn.tanh()
        assert isinstance(
            attention_output_with_bias, tuple
        ), "`attention_output_with_bias` needs to be tuple for gating."
        attention_output_with_bias = tuple(
            _gate_attn * output if output is not None else None for output in attention_output_with_bias
        )

        with self.bias_dropout_add_exec_handler():
            hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)(
                attention_output_with_bias, residual, self.hidden_dropout
            )

        # Residual connection around the MLP, with its own optional gate.
        residual = hidden_states
        pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states)
        mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output)

        _gate_ffn = 1 if not self.gated else self.gate_ffn.tanh()
        assert isinstance(
            mlp_output_with_bias, tuple
        ), "`mlp_output_with_bias` needs to be tuple for gating."
        mlp_output_with_bias = tuple(
            _gate_ffn * output if output is not None else None for output in mlp_output_with_bias
        )

        with self.bias_dropout_add_exec_handler():
            hidden_states = self.mlp_bda(self.training, self.config.bias_dropout_fusion)(
                mlp_output_with_bias, residual, self.hidden_dropout
            )

        output = make_viewless_tensor(
            inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True
        )

        if self.config.external_cuda_graph and self.training:
            return output
        return output, context


class VisionEncoder(MegatronModule):
    """
    Vision encoder module for processing image inputs with patch-based embeddings.

    Args:
        config ('CrossAttentionVisionConfig'): Configuration object for the encoder.
        image_size (int, default=560): Input image size.
        patch_size (int, default=14): Size of patches extracted from the image.
        in_channels (int, default=3): Number of input channels.
        pre_process (bool, default=True): Whether to preprocess input.
        post_process (bool, default=True): Whether to postprocess output.
        return_intermediate (Optional[bool]): Whether to return intermediate layers.
    """

    def __init__(
        self,
        config: 'CrossAttentionVisionConfig',
        image_size: int = 560,
        patch_size: int = 14,
        in_channels: int = 3,
        pre_process: bool = True,
        post_process: bool = True,
        return_intermediate=None,
    ):
        super().__init__(config=config)
        self.return_intermediate = return_intermediate
        self.image_size = to_2tuple(image_size)
        self.patch_size = to_2tuple(patch_size)
        self.grid_size = (
            self.image_size[0] // self.patch_size[0],
            self.image_size[1] // self.patch_size[1],
        )
        self.pre_process = pre_process
        self.post_process = post_process

        self.max_aspect_ratio_id = self.config.max_aspect_ratio_id
        self.max_num_tiles = config.max_num_tiles
        width = config.hidden_size
        self.conv1 = ColumnParallelConv2dPatch(
            config=config,
            in_channels=in_channels,
            out_channels=width,
            kernel_size=patch_size,
            stride=patch_size,
            bias=False,
        )
        scale = width ** -0.5
        self.class_embedding = torch.nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = torch.nn.Parameter(
            scale * torch.randn(self.grid_size[0] * self.grid_size[1] + 1, width)
        )
        self.ln_post = LayerNormImpl(config=config, hidden_size=width)
        self.ln_pre = LayerNormImpl(config=config, hidden_size=width)

        self.transformer = TransformerBlock(
            config=self.config,
            spec=get_image_transformer_layer_spec(),
            post_layer_norm=False,
            pre_process=self.pre_process,
            post_process=self.post_process,
        )
        # Rebind the block's forward so selected intermediate layer outputs can be returned.
        self.transformer.forward = types.MethodType(forward_with_return_intermediate, self.transformer)

        global_config = copy.deepcopy(self.config)
        global_config.num_layers = self.config.num_global_layers
        global_config.gated = True
        self.global_transformer = TransformerBlock(
            config=global_config,
            spec=get_image_transformer_layer_spec(),
            post_layer_norm=False,
            pre_process=self.pre_process,
            post_process=self.post_process,
        )

        self.pre_tile_pos_embed = PrecomputedTilePositionEmbedding(config=config, gated=True)
        self.post_tile_pos_embed = PrecomputedTilePositionEmbedding(config=config, gated=True)
        self.gated_tile_positional_embedding = torch.nn.Embedding(
            self.max_aspect_ratio_id + 1,
            self.max_num_tiles * (self.grid_size[0] * self.grid_size[1] + 1) * width,
        )
        self.gated_positional_embedding_gate = torch.nn.Parameter(torch.zeros(1))

    def apply_positional_embedding(self, x, ar_ids):
        """Apply regular position embedding and tile positional embedding."""
        bsz, num_chunks, num_tokens, dim = x.shape
        x = x.view(bsz * num_chunks, num_tokens, dim)
        x = x + self.positional_embedding * (1 - self.gated_positional_embedding_gate.tanh())
        x = x.view(bsz, num_chunks, num_tokens, dim)

        tile_position_embedding = self.gated_tile_positional_embedding(ar_ids)
        tile_position_embedding = tile_position_embedding.reshape(bsz, num_chunks, num_tokens, dim)
        x = x + tile_position_embedding * self.gated_positional_embedding_gate.tanh()
        return x

    def apply_class_embedding(self, x):
        """Concat class embedding tokens."""
        x = torch.cat(
            [
                self.class_embedding.to(x.dtype)
                + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
                x,
            ],
            dim=1,
        )
        return x

    def forward(self, images: torch.Tensor, ar_ids: torch.Tensor) -> torch.Tensor:
        """Forward."""
        if images.ndim == 5:
            num_concurrent_media = 1
            bsz, num_chunks, nch, w, h = images.shape
        else:
            bsz, num_concurrent_media, num_chunks, nch, w, h = images.shape

        images = images.reshape(bsz * num_concurrent_media * num_chunks, nch, w, h)
        ar_ids = ar_ids.reshape(bsz * num_concurrent_media, -1)

        # Patch embedding per tile.
        x = self.conv1(images)
        _, ntok, dim = x.shape
        x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok, dim)

        # Pre-transformer tile embeddings, class token and positional embeddings.
        x = self.pre_tile_pos_embed(x, ar_ids)
        x = x.reshape(bsz * num_concurrent_media * num_chunks, ntok, dim)
        x = self.apply_class_embedding(x)
        ntok += 1

        x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok, dim)
        x = self.apply_positional_embedding(x, ar_ids)
        x = self.ln_pre(x)

        # Pad the token dimension to a multiple of 8 and flatten tiles into one sequence.
        npad = 8 - (x.shape[-2] % 8)
        pad = (0, 0, 0, npad)
        x = F.pad(x, pad, mode="constant", value=0)
        x = x.view(bsz * num_concurrent_media, -1, dim)
        attn_bias = build_encoder_attention_mask(x, ar_ids, ntok, num_chunks, self.config.supported_aspect_ratios)

        x = x.transpose(0, 1).contiguous()  # (sequence, batch, hidden) for the transformer blocks
        x, int_x = self.transformer(
            hidden_states=x,
            attention_mask=None,
            attention_bias=attn_bias,
            return_intermediate=self.return_intermediate,
        )
        x, int_x = x.transpose(0, 1), int_x.transpose(0, 1)
        x = self.ln_post(x)

        # Post-transformer tile embeddings and the gated global transformer.
        x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok + npad, dim)
        x = self.post_tile_pos_embed(x, ar_ids)
        x = x.reshape(bsz * num_concurrent_media, num_chunks * (ntok + npad), dim)
        x = x.transpose(0, 1).contiguous()
        x = self.global_transformer(hidden_states=x, attention_mask=None, attention_bias=attn_bias)
        x = x.transpose(0, 1)

        # Drop padding tokens and restore the (batch, media, chunks, tokens, dim) layout.
        x = x.reshape(bsz * num_concurrent_media, num_chunks, ntok + npad, dim)
        x = x[:, :, :ntok]
        x = x.reshape(bsz, num_concurrent_media, num_chunks, ntok, dim)

        int_x = int_x.reshape(bsz * num_concurrent_media, num_chunks, ntok + npad, -1)
        int_x = int_x[:, :, :ntok]
        int_x = int_x.reshape(bsz, num_concurrent_media, num_chunks, ntok, -1)
        x = torch.cat([x, int_x], dim=-1)
        return x