o
    }oi                     @   s   d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ ed	d
\ZZeG dd deZeG dd deejZG dd deZdS )    )	dataclass)ColumnParallelLinear)TransformerConfig)MegatronModule)nn)SigLIPViT400M_14_384_Config)io)safe_import_fromz+megatron.core.extensions.transformer_engineTENormc                   @   s6   e Zd ZU dZdZeed< dZeed< dZeed< dS )Gemma3VLVisionConfigz"Gemma3 VL vision model base config  img_himg_wi   image_token_idN)	__name__
__module____qualname____doc__r   int__annotations__r   r    r   r   ^/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/gemma3vl/model/vision.pyr      s
   
 r   c                   @   s   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZeed< dZeed< dZeed< dZeed< dddZdS )!Gemma3VLMultimodalProjectorConfigz%Gemma3 VL multimodal projector configi  
input_sizei 
  hidden_sizer   
image_size   	patch_dim   tokens_per_imageRMSNormnormalizationTlayernorm_zero_centered_gammagư>layernorm_epsilon   
num_layers   num_attention_headsreturnGemma3VLMultimodalProjectorc                 C   s   t | S )z
Get module)r)   )selfr   r   r   configure_model9   s   z1Gemma3VLMultimodalProjectorConfig.configure_modelN)r(   r)   )r   r   r   r   r   r   r   r   r   r   r   r!   strr"   boolr#   floatr%   r'   r+   r   r   r   r   r   &   s   
 r   c                       s.   e Zd ZdZdef fddZdd Z  ZS )r)   zGemma3 VL multimodal projectorconfigc                    s~   t  j|d |j|j | _t|jd }| j| }tj||d| _	t
||j|jd| _t|j|j||jddddd d	| _d S )N)r/   g      ?)kernel_sizestride)epsTF)	r   output_sizer/   init_methodgather_outputbiasskip_bias_add	is_experttp_comm_buffer_name)super__init__r   r   patches_per_sider   r   r   	AvgPool2davg_poolr
   r   r#   mm_soft_embed_normr   r   r4   proj)r*   r/   tokens_per_sider0   	__class__r   r   r;   A   s"   
z$Gemma3VLMultimodalProjector.__init__c                 C   sl   |j \}}}|dd}|||| j| j }| |}|d}|dd}| |}| |\}}|S )zDownsample, norm and projectionr$      )	shape	transposereshaper<   
contiguousr>   flattenr?   r@   )r*   x
batch_size_r   r   r   r   forwardZ   s   


z#Gemma3VLMultimodalProjector.forward)r   r   r   r   r   r;   rM   __classcell__r   r   rB   r   r)   >   s    r)   N)dataclassesr   $megatron.core.tensor_parallel.layersr   megatron.core.transformerr    megatron.core.transformer.moduler   torchr   &nemo.collections.vlm.vision.siglip_vitr   nemo.lightningr   nemo.utils.import_utilsr	   r
   rL   r   IOMixinr   r)   r   r   r   r   <module>   s   