o
    }oi ?                     @   s   d Z ddlZddlmZ ddlZddlZddlm  mZ	 ddl
mZ ddlmZ ddlmZmZmZ ddlmZ G dd	 d	eZG d
d deZdd Zdd ZG dd deZdS )zVision Transformer(VIT) model.    N)partial)get_layer_norm)MegatronModule)ApexGuardDefaultsinit_method_normalscaled_init_method_normal)ParallelVisionTransformerc                       s*   e Zd ZdZd fdd	Zdd Z  ZS )		DropPatchz*
    https://arxiv.org/abs/2212.00794
       Tc                    s@   d|  krdk sJ  J t t|   || _|| _|| _d S )Nr   g      ?)superr	   __init__probclass_token_lengthexclude_cls_tokens)selfr   r   r   	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vision/modules/vit/vit_backbone.pyr   '   s
   
zDropPatch.__init__c                 C   s   | j dks| js
|S | j}| jr%|d d d |f |d d |d f }}|j\}}}|j}tj||d}|d }d| j  }	tdt	||	 }
tj
|||d}|j|
ddj}|||f }| jritj||fdd}|S )N        device).N   dim)r   trainingr   r   shaper   torcharangemaxintrandntopkindicescat)r   xr   
cls_tokensbatch
num_tokens_r   batch_indices	keep_probnum_patches_keeprandpatch_indices_keepr   r   r   __call__.   s"   *
zDropPatch.__call__)r
   T)__name__
__module____qualname____doc__r   r0   __classcell__r   r   r   r   r	   "   s    r	   c                       s(   e Zd ZdZ fddZdd Z  ZS )
VitMlpHeada4  Pooler layer.

    Pool hidden states of a specific token (for example start of the
    sequence) and add a linear transformation followed by a tanh.

    Arguments:
        hidden_size: hidden size
        init_method: weight initialization method for the linear layer.
            bias is set to zero.
    c                    sR   t t|   tj||| _tj | _tj||| _	tjj
| j	jd d S )Ni)r   r6   r   r   nnLineardense_inReLUrelu	dense_outinit	constant_bias)r   hidden_sizenum_classesr   r   r   r   V   s
   zVitMlpHead.__init__c                 C   s"   |  |}t|}| |}|S )N)r9   r   tanhr<   )r   hidden_statesdense_in_resulttanh_resultdense_out_resultr   r   r   forward]   s   


zVitMlpHead.forward)r1   r2   r3   r4   r   rG   r5   r   r   r   r   r6   J   s    r6   c                 C   s*   | dkrt | }t|t| | kS dS )Nr   F)mathsqrtr!   )r&   srr   r   r   isPerfectSquaref   s   
rK   c	                 C   s  | j | j }	| j| j }
|	|
 }| j}| dd}|d }||v s,J | d|  ||v r|| }|jd }t|sEt|| sEJ t| }|rP|| n|}|}|}|rm|d |d d f }||d d d f }ntj	|||j
d}|}|jd |ksJ ||krtt|}|	|
f}|dd }|dd||f}| }|d | |d | f}tj||d	d
}| }|d|f}|dd }|jd |ksJ |}|jd |kr|jd |ksJ |rtj||fdd}|||< d S d S )Nr   r
   weightz not in r   r   r   r   bilinearscale_factormoder   )img_h	patch_dimimg_wr@   getkeysr   rK   r   zerosr   r!   rH   rI   	transpose
contiguousreshapefloatFinterpolatehalfr%   )	model_cfgclass_token_present
state_dictprefixlocal_metadatastrictmissing_keysunexpected_keys
error_msgsnum_patches_per_dim_hnum_patches_per_dim_wnum_patchesr@   r   keyinput_paraminput_seq_leninput_has_class_tokennum_tok_inputnum_tok_outputoutput_has_class_tokeninput_param_tokinput_param_gridgs_inputgs_newrO   r   r   r   )twod_interpolate_position_embeddings_hookm   sN   

 ru   c                       sF   e Zd ZdZ						d fdd	Zdd Zd	d
 Zdd Z  ZS )VitBackbonezVision Transformer Model.NTFc	                    s  t t| jdd |j| _|j}	|j}
|d u rt|
}|d u r$t|
|	}|| _|| _	|| _
|j| _|j| _|j| _|j| _|| _|dd| _|dd| _|dd}| j| j dks^J | j| j dkshJ | j| j | _| j| j | _| j| j | _| j
r|dd	nd| _| j| j | _| j| j |j | _d | _d | _d | _| jrY| j
rtjt d
| j| j| _!tjj"#| j! t$| j%d
d& | _tjj'|j| j| j| jf| j| jfdd| _(|dd| _)| j)dkrtj*| j| j| _+t|j| j+j, | j
}| j+-t.t/|| n%| j)dkr-tjt0| j| j| _+t|j| j+ n	t1d| j) dtj2|j3| _4t5| j| j| j
d| _6|rYt7|j|j8|j9|j:d| _t;d3i d|d|d|d|jd|jd|j<d|j=d|j>d|j?d| jd| j	d|j@d |jAd!|jBd"|jCd#|jDd$|j8d%|j3d&|jEd|jd'|d'dd(|d(dd)|j9d*|jFd+|jGd,|jHd-|jId.|jJd/|d/d0d1|d1dd2|d2d| _Kd S )4NF)share_token_embeddingsdrop_patch_rater   drop_path_ratepreprocess_layernormr   r   r
   r   r   )in_channelsout_channelskernel_sizestrider?   position_embedding_typelearned_absolutelearned_parametersz'Unrecognized positional embedding type !)r   r   )sequence_parallelconfiginit_methodoutput_layer_init_method
num_layersr@   num_attention_headsapply_query_key_layer_scalingkv_channelsffn_hidden_sizepre_processpost_process	precisionfp32_residual_connectionactivations_checkpoint_method!activations_checkpoint_num_layersnormalizationlayernorm_epsilonhidden_dropoutattention_dropout
layerscalebias_activation_fusionpersist_layer_normopenai_gelu	onnx_safemasked_softmax_fusionmegatron_legacy"activations_checkpoint_granularity
activationgeluub_tp_comm_overlapuse_flash_attentionr   )Lr   rv   r   fp16_lm_cross_entropyr   init_method_stdr   r   r   r   class_tokenr@   rR   rQ   rS   single_token_outputrT   rx   ry   rg   rh   ri   r   
seq_lengthnum_channelsflatten_diminput_tensorposition_idsrz   r   r7   	Parameterr"   	cls_tokenr=   zeros_r   expandcudaConv2dconv1r   	Embeddingposition_embeddingsrL   "_register_load_state_dict_pre_hookr   ru   empty
ValueErrorDropoutr   embedding_dropoutr	   
drop_patchr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   transformer)r   r^   model_parallel_configr   scaled_init_methodr   r   r   r   r   r   rz   r_   r   r   r   r      s   


	
	 
zVitBackbone.__init__c                 C   s   | j | dS )z1See megatron.model.transformer.set_input_tensor()N)r   set_input_tensor)r   r   r   r   r   r   0  s   zVitBackbone.set_input_tensorc           
      C   s   |j d }t|| j sJ || j }| j}||kr| jS | jd | j }| j| jd  }tt|}| j| j	f}|
dd }|dd|d |d f}| }||d  ||d  f}	tj||	dd}|d|f}|
dd }tj||fddS )Nr   r   r   bicubicrN   r   )r   rK   r   ri   r   r!   rH   rI   rg   rh   rW   rX   rY   rZ   r[   r\   r   r%   )
r   r&   output_seq_lenro   rn   	embed_tok
embed_gridrt   rs   rO   r   r   r   interpolate_pos_encoding4  s$   

z$VitBackbone.interpolate_pos_encodingc                 C   s>  | j r| |}||jd |jd d}|ddd}|}| jr5| j|jd dd}tj	||fdd}| j
dkrN|| | jd d d |jd f  }n| j
dkr[|| | }n	td| j
 d	| |}| jd urs| |}|dd }| |}n|}| |d }| jr| jr|d }|S |dd }|S )
Nr   r   r      r   r   r   z&Unrecognized position embedding type: .)r   r   rY   r   permuter   r   r   r   r%   r   r   r   r   r   r   rz   rW   rX   r   r   r   r   )r   inputrearranged_inputencoder_outputconcatenated_tokensr'   token_embeddingsrC   r   r   r   rG   R  s8   





zVitBackbone.forward)NNTTTF)	r1   r2   r3   r4   r   r   r   rG   r5   r   r   r   r   rv      s    ~rv   )r4   rH   	functoolsr   einopsr   torch.nn.functionalr7   
functionalr[   =nemo.collections.nlp.modules.common.megatron.fused_layer_normr   3nemo.collections.nlp.modules.common.megatron.moduler   2nemo.collections.nlp.modules.common.megatron.utilsr   r   r   Bnemo.collections.vision.modules.common.megatron.vision_transformerr   r	   r6   rK   ru   rv   r   r   r   r   <module>   s   (B