o
    wi,                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZ d dlZ	d dl
m  mZ d dlmZ d dlmZ d dlmZ zd dlmZmZmZ W n ey]   d dlmZ dZdZdZed	 Y nw d d
lmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) dd Z*dd Z+G dd de	jj,Z-eG dd de$e)j.Z/eG dd de&e)j.Z0eG dd de$e)j.Z1G dd deZ2dS )    N)	dataclass)CallableOptionalUnion)CLIPViTModel)MultimodalProjector)ColumnParallelLinear)TEColumnParallelLinearTENormTERowParallelLinear)loggingzFailed to import Transformer Engine dependencies. `from megatron.core.transformer.custom_layers.transformer_engine import *`If using NeMo Run, this is expected. Otherwise, please verify the Transformer Engine installation.)MLPMLPSubmodules)
ModuleSpec)TransformerConfig)CLIPVisionConfigCLIPVisionModel)ioc                 C   s   dS )z#Sets input tensor func place holderN )selftensorr   r   ]/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/vision/base.pyset_input_tensor6   s   r   c                 C   s*   | | }|| }|| }||r| S d S )zHGet image sequence length given image size, patch size, and class token.r   r   )img_himg_w	patch_dimadd_class_tokenclass_token_lennum_patches_per_dim_hnum_patches_per_dim_wnum_patchesr   r   r   get_image_sequence_length;   s   r!   c                   @   s    e Zd ZdZdd Zdd ZdS )DownSampleBlockz.Downsample block following the ViLA-VLM paper.c                 C   sd   | dd}|}t|jd d  }}||jd ||d}| |}||jd d|jd }|S )zDownsample the input tensor.r      g      ?)	transposeintshapereshapeflat_square)r   x
vit_embedshwr   r   r   forwardJ   s   
zDownSampleBlock.forwardc                 C   s(  |  \}}}}|d dkr/tj|tj|d||f|jd|jgdd }|  \}}}}|d dkrVtj|tj||d|f|jd|jgdd }|  \}}}}| }|||t	|d t	|d }|
dddd }||t	|d t	|d t	|d }|
dddd }|S )z,Flatten the input tensor and make it square.   r#   dtype)dimr         )sizetorchconcatzerosr1   todevice
contiguousviewr&   permute)r   r*   nr-   r,   cr   r   r   r)   T   s   22 (zDownSampleBlock.flat_squareN)__name__
__module____qualname____doc__r.   r)   r   r   r   r   r"   C   s    
r"   c                   @   s   e Zd ZU dZdZeed< dZee	 ed< dZ
ee ed< dZeed< dZeed	< ejZeed
< dZeed< dZeed< dZeed< dZeed< dddZdS )MultimodalProjectorConfigzl
    For MLP, fc1 in shape of input_size, ffn_hidden_size, fc2 in shape of ffn_hidden_size, hidden_size
    
mlp2x_geluprojector_typeN
layer_spec   
input_sizehidden_sizeffn_hidden_sizeactivation_funcTbiasbias_activation_fusionr#   
num_layers   num_attention_headsreturnMCoreMultimodalProjectorc                 C   s  | j drL| jd u rL| j| _| j dkr(d| _ ttttt	dd| _| jj
| _n| j dkr8d| _ ttd d| _n	td| j  d	t| | j| j | jd
S | j dkrtjt tjj| jd | jdtjj| jd | jd| jdtj tjj| j| jd| jd}ddlm} |t||_|S td| j }|rt|d}tjj| j| jd| jdg}t d|D ]}|!tj  |!tjj| j| jd| jd qtjj| }ddlm} |t||_|S td| j  d	)Nmcore	mcore_mlpmlp)
linear_fc1
linear_fc2)module
submodulesmcore_affineaffinezNot supported projector type ``)rF   rI   vila_downsample_mlpr4   r0   T)rM   r1   r   )
MethodTypez^mlp(\d+)x_gelu$r#   )"rF   
startswithrG   rM   add_bias_linearr   r   r   r	   r   rZ   r   NotImplementedErrorrS   rI   r6   nn
Sequentialr"   	LayerNormparams_dtypeLinearrJ   GELUtypesr_   r   rematchr&   grouprK   rangeappend)r   modelr_   mlp_gelu_match	mlp_depthmodules_r   r   r   configure_modelv   s^   


z)MultimodalProjectorConfig.configure_model)rR   rS   )r@   rA   rB   rC   rF   str__annotations__rG   r   r   rI   r&   rJ   rK   FgelurL   r   rM   boolrN   rO   rQ   rt   r   r   r   r   rD   e   s   
 rD   c                   @   st   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
ee ed	< dZeeeejf  ed
< dddZdddZdS )HFCLIPVisionConfigzz
    https://github.com/huggingface/transformers/blob/v4.44.0/src/transformers/models/clip/configuration_clip.py#L261
    rH   rJ   Fr   r#   r   Nnum_image_embeddings_per_tilepretrained_model_name_or_pathrR   c                 O   s|   t j| g|R i |d| ji | jd ur-t | j}|  D ]
\}}t| || q"t| j	| j	| j
| j| jd| _d S )NrJ   r   r   r   r   r   )r   __init__rJ   r|   from_pretrainedto_dictitemssetattrr!   
image_size
patch_sizer   r   r{   )r   argskwargsconfigkeyvaluer   r   r   __post_init__   s   "
z HFCLIPVisionConfig.__post_init__r   c                 C   s0   t t_ | jd u rt| }nt| j}d|_|S )NT)r   r   r|   r   tensor_parallel_grad_reduce)r   ro   r   r   r   rt      s   

z"HFCLIPVisionConfig.configure_model)rR   N)rR   r   )r@   rA   rB   rC   rJ   r&   rv   r   ry   r   r{   r   r|   r   ru   osPathLiker   rt   r   r   r   r   rz      s   
 
rz   c                   @   s   e Zd ZU dZeZeeef e	d< eZ
eeef e	d< dZee	d< dZee	d< dZee	d	< d
Zee	d< d
Zee	d< dZee	d< dZee e	d< dZee	d< dZee	d< dZee	d< dd ZdddZdS )CLIPViTConfigzMCore CLIP ViT Configln_pre_implln_post_implTr   r#   r      r   iP  r   r   clipvision_model_typeNr{   transformer_layer_specrO   rP   rQ   c                 C   s8   | j dkrd| _d| _t| j| j| j| j| jd| _d S )NsiglipFr   r}   )r   r   r   r!   r   r   r   r{   )r   r   r   r   r      s   
zCLIPViTConfig.__post_init__rR   BaseCLIPViTModelc                 C   sR   | j }t|tsddlm} |dd}t| || j| j| j| j	| j
| j| j| jd
S )Nr   )get_layer_spec_teT)is_vit)r   r   r   r   r   r   r   model_subtype)r   
isinstancer    nemo.collections.vlm.layer_specsr   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   rt      s    

zCLIPViTConfig.configure_model)rR   r   )r@   rA   rB   rC   r
   r   r   r   typerv   r   r   ry   r   r&   r   r   r   r   ru   r{   r   r   rO   rQ   r   rt   r   r   r   r   r      s    
 r   c                	       s@   e Zd ZdZ	d
dejdeej dedejf fdd	Z  Z	S )r   zCLIP ViT vision model.Nr   r*   attention_masknum_unused_layersrR   c                    s^   |dkr(| j j| d  }| j jd |  | j _t ||}| j j| |S t ||S Nr   )decoderlayerssuperr.   extend)r   r*   r   r   unused_layers	__class__r   r   r.     s   zBaseCLIPViTModel.forwardr   )
r@   rA   rB   rC   r6   Tensorr   r&   r.   __classcell__r   r   r   r   r     s    r   )3r   rj   dataclassesr   typingr   r   r   torch.distributedr6   torch.nn.functionalrc   
functionalrw   *megatron.core.models.vision.clip_vit_modelr   MCoreCLIPViTModel0megatron.core.models.vision.multimodal_projectorr   rS   $megatron.core.tensor_parallel.layersr   :megatron.core.transformer.custom_layers.transformer_enginer	   r
   r   ImportError
nemo.utilsr   warningmegatron.core.transformer.mlpr   r   $megatron.core.transformer.spec_utilsr   ,megatron.core.transformer.transformer_configr   transformersr   r   nemo.lightningr   r   r!   Moduler"   IOMixinrD   rz   r   r   r   r   r   r   <module>   sF   "M*3