o
    }oiP                     @   s  d dl mZmZ d dlmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlZzd dlmZmZmZmZ W n eyP   d dlmZ dZeZdZdZed Y nw d d	lmZmZmZ d d
lmZmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 G dd dej8j9Z:dGde;de,fddZ<dd Z=dd Z>d d! Z?G d"d# d#e1Z@G d$d% d%e ZAG d&d' d'eZBdHde,fd)d*ZCeG d+d, d,e4ZDeG d-d. d.eDZEeG d/d0 d0eDZFG d1d2 d2ejGe6jHe6jIZJe6KeJd3G d4d5 d5e6jLd6eJf ZMe6jNd7d8d9d:e6jOfd;d<ZPd=d> ZQe6jNd?d@d9d:e6jOfdAdBZRe6jNdCdDd9d:e6jOfdEdFZSdS )I    )	dataclassfield)partial)Path)CallableOptionalN)TEColumnParallelLinearTEDotProductAttentionTENormTERowParallelLinear)loggingzFailed to import Transformer Engine dependencies. `from megatron.core.extensions.transformer_engine import *`If using NeMo Run, this is expected. Otherwise, please verify the Transformer Engine installation.)get_tensor_model_parallel_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)ColumnParallelLinearRowParallelLinear)SelfAttentionSelfAttentionSubmodules)DotProductAttention)AttnMaskType)
IdentityOp)MLPMLPSubmodules)
ModuleSpecbuild_module)TransformerConfig)TransformerLayerTransformerLayerSubmodules)CLIPViTConfig)ioteardownc                	       sR   e Zd ZdZ			ddedededef fdd	Zd
d Zdd Z	dd Z
  ZS )InternViTRMSNormzCustomized Version of RMSNormư>Fhidden_sizeepssequence_parallelcompute_varc                    sN   t    || _|| _tjt|| _|| _	|rJ dt
| jd| dS )a  Custom RMSNorm for InternViT.

        Args:
            config (TransformerConfig): Config.
            hidden_size (int): Input hidden size.
            eps (float): epsilon to use for the norm, default to 1e-6
            sequence_parallel (bool): Set to true if sequence parallelism is being used,
              this marks the weights as needing to be allreduced.
            compute_var (bool): Indicator to compute statistic manually.
        z5Sequence parallelism is not supported with InternViT.r%   N)super__init__configr$   torchnn	Parameteronesweight_compute_varsetattr)selfr)   r#   r$   r%   r&   	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/vision/intern_vit.pyr(   C   s   
zInternViTRMSNorm.__init__c                 C   s0   |du r| djddd}|t|| j  S )RMSNormN   Tkeepdim)powmeanr*   rsqrtr$   )r1   xvarr4   r4   r5   _norm_   s   zInternViTRMSNorm._normc                 C   s   d}| j rC| jj}|jd }||d|dd}|jd | }|dkr)d}n|dkr0d}ntd	| | 	d
|||| }| 
| ||}|| j }| j rd||d|dd|}|S )z7Run RMSNorm with an option to compute custom statistic.Nr8   r            @      zCannot infer number of heads.r7   )r/   r)   r#   shapereshapesize
ValueError_gather_varfloatr;   r@   type_asr.   )r1   r>   r?   unpadded_hidden_sizemax_dimtotal_headsvalid_headsoutputr4   r4   r5   forwardf   s"   

zInternViTRMSNorm.forwardc                    s   t  }t }|| }g }|}	t|D ]}
|	|kr|| n||	 |	|8 }	q| d }|||  }|dkrF|dd|f jddd n	|jdddd   fd	d
t|D } ||< tjj| t	 d tj
||d }|jdddS )z-Compute statistic across the non-dummy heads.rA   r   .Nr8   Tr9           c                    s   g | ]}t  qS r4   )r*   
empty_like).0_r?   r4   r5   
<listcomp>   s    z0InternViTRMSNorm._gather_var.<locals>.<listcomp>)group)dim)r   r   rangeappendrZ   sumr*   distributed
all_gatherr   cat
contiguous)r1   input_rN   rP   rO   
world_sizerankheads_per_rankvalid_heads_per_rankremaining_headsrV   last_dim	valid_dimtensor_listrQ   r4   rW   r5   rJ   ~   s(   

zInternViTRMSNorm._gather_var)r"   FF)__name__
__module____qualname____doc__intrK   boolr(   r@   rR   rJ   __classcell__r4   r4   r2   r5   r!   @   s"    r!   Tuse_tereturnc                 C   s$   t tt| rtnt| rtntddS )N)
linear_fc1
linear_fc2module
submodules)r   r   r   r   r   r   r   rr   r4   r4   r5   get_mlp_module_spec   s   

rz   c                 C   s|   |\}}|j |j kr|n||j }|dur,|| }tjjj|||d}|||   }|S tjjj|||d}|||   }|S )z!Handle InternViT's layer scaling.N)ptraining)dtypetor*   r+   
functionaldropout)lsx_with_biasresidualprobr|   r>   biasoutr4   r4   r5    _bias_dropout_add_func_internvit   s   r   c                    s    fdd}|S )EBias-dropout-add as in Megatron but with added LayerScaling handling.c                    s   t  | ||S N)r   )r   r   r   r   r|   r4   r5   _bias_dropout_add   s   z=bias_dropout_add_unfused_internvit.<locals>._bias_dropout_addr4   )r   r|   r   r4   r   r5   "bias_dropout_add_unfused_internvit   s   r   c                 C   s   |rJ dt | |S )r   z5Fused bias-dropout-add not implemented for InternViT.)r   )r   r|   fusedr4   r4   r5   get_bias_dropout_add_internvit   s   
r   c                           e Zd ZdZ fddZ  ZS )InternViTTransformerLayerz:Add InternViT specialties to our default TransformerLayer.c                    sf   t  j|i | tjt| jj| _tjt| jj| _	t
| j| j| _t
| j| j	| _d S r   )r'   r(   r*   r+   r,   r-   r)   r#   ls1ls2r   self_attn_bdamlp_bda)r1   argskwargsr2   r4   r5   r(      s
   z"InternViTTransformerLayer.__init__)rk   rl   rm   rn   r(   rq   r4   r4   r2   r5   r          r   c                       s*   e Zd ZdZdedef fddZ  ZS )InternViTSelfAttentionzaOverride a few things that are special in InternViT and not supported by the SelfAttention class.r)   rx   c                    s   t  j|||d| | jj}t|j| jj| jd| j  | j| jj	d|dddd
| _| j
| j }t|j|| j| jjdd| _t|j|| j| jjdd| _d S )N)r)   rx   r7   Fqkv)r)   init_methodgather_outputr   skip_bias_add	is_experttp_comm_buffer_nameT)r#   r)   r$   r&   )r'   r(   r)   add_qkv_biasr   
linear_qkvr#   query_projection_sizekv_projection_sizer   hidden_size_per_attention_head!num_attention_heads_per_partitionq_layernormlayernorm_epsilonk_layernorm)r1   r)   rx   r   r   qkv_biasqk_layernorm_hidden_sizer2   r4   r5   r(      s<   
zInternViTSelfAttention.__init__)rk   rl   rm   rn   r   r   r(   rq   r4   r4   r2   r5   r      s    r   c                       r   )InternViTTEDotProductAttentionz Adjusted Attention for InternViTc                    sx   t  j|i |}tj||j|jd}t }|jd }d}||kr.|d|df  d9  < n||kr6|d9 }||9 }|S )z?Regular TEDotProductAttention + zero-out dummy attention heads.)r}   devicer8      .NrS   )r'   rR   r*   	ones_liker}   r   r   rF   )r1   r   r   r   maskrd   rN   valid_ranksr2   r4   r5   rR     s   
z&InternViTTEDotProductAttention.forward)rk   rl   rm   rn   rR   rq   r4   r4   r2   r5   r     r   r   r6   c                 C   s   t td}t| }ttt|| ttdtjit	| rt
nt| rtnt| r$tnt|r+|| nt|r2|| ntddt|| |tddS )zGet InterViT's MCore layer spec)r6   	LayerNormattn_mask_type)r   core_attentionlinear_projr   r   )rw   paramsrx   )input_layernormself_attentionr   pre_mlp_layernormmlpr   rv   )r!   r
   rz   r   r   r   r   r   no_maskr   r   r   r	   r   r   r   r   r   )rr   add_qk_norm	norm_typeNORM2FNr   r4   r4   r5   get_internvit_layer_spec  s2   


r   c                   @   sN  e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< ejjjZeed< dZeed< dZeed< dZ eed< dZ!eed < dZ"eed!< d"Z#eed#< d$Z$eed%< dZ%eed&< e&d'd( d)Z'e(ed*< d+S ),InternViTConfigzIntern ViT Base Config	internvitvision_model_type   	patch_dimi  img_himg_w-   
num_layersrC   num_attention_headsnum_query_groupsrB   kv_channelsTadd_bias_linearFr   i  r#   rS   hidden_dropoutattention_dropouti 2  ffn_hidden_sizegated_linear_unitactivation_funclayernorm_zero_centered_gammaapply_query_key_layer_scalingbias_activation_fusionbias_dropout_fusionattention_softmax_in_fp32r6   normalizationr"   r   apply_rope_fusionc                   C   s
   t ddS )NTry   r   r4   r4   r4   r5   <lambda>T  s   
 zInternViTConfig.<lambda>default_factorytransformer_layer_specN))rk   rl   rm   rn   r   str__annotations__r   ro   r   r   r   r   r   r   r   rp   r   r#   r   rK   r   r   r   r*   r+   r   gelur   r   r   r   r   r   r   r   r   r   r   r   r   r4   r4   r4   r5   r   8  s6   
 r   c                   @   s   e Zd ZU dZdZeed< dS )InternViT_6B_448px_Configz Intern ViT 6B Config for >= v1.5r   r   N)rk   rl   rm   rn   r   r   r   r4   r4   r4   r5   r   W  s   
 r   c                   @   s   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< edd dZeed< dS )InternViT_300M_448px_Configz"Intern ViT 300M Config for >= v1.5r   r      r   rE   r   r   rD   r   Tr   r   i   r#   rS   r   r   i   r   r   r   c                   C   s   t ddddS )NTFr   rr   r   r   r   r4   r4   r4   r5   r   o  s
    z$InternViT_300M_448px_Config.<lambda>r   r   N)rk   rl   rm   rn   r   r   r   r   ro   r   r   r   r   rp   r   r#   r   rK   r   r   r   r   r   r   r4   r4   r4   r5   r   ^  s"   
 r   c                       s6   e Zd ZdZd	dee f fddZd
ddZ  ZS )InternViTModelzInternViT NeMo WrapperNr)   c                    s   t    || _d S r   )r'   r(   r)   )r1   r)   r2   r4   r5   r(   z  s   

zInternViTModel.__init__rs   c                 C   s   t | ds| j | _d S d S )Nrw   )hasattrr)   configure_modelrw   r1   r4   r4   r5   r     s   
zInternViTModel.configure_modelr   )rs   N)	rk   rl   rm   rn   r   r   r(   r   rq   r4   r4   r2   r5   r   w  s    r   hfc                   @   sJ   e Zd ZdZdefddZdedefddZdd	 Ze	de
fd
dZdS )HFInternViTImporterzHF InternViT Importerrs   c                 C   s
   t | jS r   )r   r)   r   r4   r4   r5   init  s   
zHFInternViTImporter.initoutput_pathc                 C   sv   ddl m} |jt| dd}|  }| |}| || td|  | || td|  t	|| ~~|S )Nr   )	AutoModelTtrust_remote_codez-Converted InternViT model to Nemo, saving to z#Converted InternViT model saved to )
transformersr   from_pretrainedr   r   
nemo_setupconvert_stateprint	nemo_saver    )r1   r   r   sourcetargettrainerr4   r4   r5   apply  s   

zHFInternViTImporter.applyc                 C   s   i ddddddddd	d
dddddddddddddddddddddd d!d"}t j|||tttgd#S )$Nzembeddings.class_embeddingclass_tokenz!embeddings.patch_embedding.weightzconv1.weightzembeddings.patch_embedding.biasz
conv1.biaszencoder.layers.*.ls1zdecoder.layers.*.ls1zencoder.layers.*.ls2zdecoder.layers.*.ls2z#encoder.layers.*.attn.q_norm.weightz2decoder.layers.*.self_attention.q_layernorm.weightz#encoder.layers.*.attn.k_norm.weightz2decoder.layers.*.self_attention.k_layernorm.weightz!encoder.layers.*.attn.proj.weightz2decoder.layers.*.self_attention.linear_proj.weightzencoder.layers.*.attn.proj.biasz0decoder.layers.*.self_attention.linear_proj.biaszencoder.layers.*.mlp.fc1.weightz&decoder.layers.*.mlp.linear_fc1.weightzencoder.layers.*.mlp.fc1.biasz$decoder.layers.*.mlp.linear_fc1.biaszencoder.layers.*.mlp.fc2.weightz&decoder.layers.*.mlp.linear_fc2.weightzencoder.layers.*.mlp.fc2.biasz$decoder.layers.*.mlp.linear_fc2.biaszencoder.layers.*.norm1.weightz'decoder.layers.*.input_layernorm.weightzencoder.layers.*.norm1.biasz%decoder.layers.*.input_layernorm.biaszencoder.layers.*.norm2.weightz)decoder.layers.*.pre_mlp_layernorm.weightzencoder.layers.*.norm2.biasz'decoder.layers.*.pre_mlp_layernorm.bias)mapping
transforms)r   apply_transforms_import_position_embedding_import_qkv_import_qkv_bias)r1   r   r   r   r4   r4   r5   r     sR   	
z!HFInternViTImporter.convert_statec                 C   s   ddl m} |jt| dd}t|dd}|dkrd}t|j|j|j|j|j	|j
|j|j|j|j |j|j|td|j|d	d
}|S )Nr   )
AutoConfigTr   r   r6   
layer_normr   r   )r   r   r   r#   r   r   r   r   r   r   r   r   r   )r   r   r   r   getattrr   
patch_size
image_sizer#   intermediate_sizelayer_norm_epsr   r   num_hidden_layersr   qk_normalization)r1   r   r   r   rQ   r4   r4   r5   r)     s2   
zHFInternViTImporter.configN)rk   rl   rm   rn   r   r   r   r   r   propertyr   r)   r4   r4   r4   r5   r     s    "r   InternVisionModelzembeddings.position_embeddingzposition_embeddings.weight)
source_key
target_keyctxc                 C   s
   | dS )Nr   )squeeze)r  pos_embr4   r4   r5   r     s   
r   c                 C   sd  |   }||f|dd   }	||f|dd   }
| j|	 } |j|
 }|j|
 }g }t|D ]<}|| || |d | d d d d f  ||||d d d d d f  ||||d d d d d f  q-t|}|jdksyJ |j|jd |d | ksJ |j|jd |ksJ |j|jd |d ksJ |j|||d|   |g}|S )NrA      r   r7   )	rH   viewr[   r\   r*   r`   ndimrF   rG   )qkvhead_numr   heads_per_groupr#   	head_sizeold_tensor_shapenew_q_tensor_shapenew_kv_tensor_shapeqkv_weights_liqkv_weightsr4   r4   r5   
import_qkv  s$   


,$&
 r  z encoder.layers.*.attn.qkv.weightz1decoder.layers.*.self_attention.linear_qkv.weightc              
   C   s@   | j j}|d\}}}t||||j|j|j|j |j|jdS )Nr  r  r   r  r#   r  )r   r)   chunkr  r   r   r#   r   )r  r   megatron_configr  r  r  r4   r4   r5   r     s   
r   zencoder.layers.*.attn.qkv.biasz/decoder.layers.*.self_attention.linear_qkv.biasc              
   C   sV   | j j}|d\}}}t|d|d|d|j|j|j|j d|jddS )Nr  r8   rA   r  )	r   r)   r   r  	unsqueezer   r   r   r  )r  r   r!  q_biask_biasv_biasr4   r4   r5   r     s   
	r   )T)Tr6   )Tdataclassesr   r   	functoolsr   pathlibr   typingr   r   lightning.pytorchpytorchLr*   +megatron.core.extensions.transformer_enginer   r	   r
   r   ImportError
nemo.utilsr   objectwarningmegatron.core.parallel_stater   r   r   $megatron.core.tensor_parallel.layersr   r   #megatron.core.transformer.attentionr   r   /megatron.core.transformer.dot_product_attentionr   megatron.core.transformer.enumsr   %megatron.core.transformer.identity_opr   megatron.core.transformer.mlpr   r   $megatron.core.transformer.spec_utilsr   r   ,megatron.core.transformer.transformer_configr   +megatron.core.transformer.transformer_layerr   r    nemo.collections.vlm.vision.baser   nemo.lightningr   r    r+   Moduler!   rp   rz   r   r   r   r   r   r   r   r   r   r   LightningModuleIOMixinConnectorMixinr   model_importerModelConnectorr   state_transformTransformCTXr   r  r   r   r4   r4   r4   r5   <module>   s   _
+ 
\