o
    wi?g                     @   s  d dl mZ d dlmZmZmZmZmZ d dlZd dl	Zd dl
m  mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dlmZ d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: e:dd\Z;Z<dZ=zd dl>Z?W n e@y   dZ=Y nw deeAejBf fddZCdejBfd d!ZDd"d# ZEeG d$d% d%e$e6jFZGG d&d' d'e"ZHG d(d) d)e3ZIg d*ZJdS )+    )	dataclass)AnyCallableDictOptionalTupleN)dist_checkpointing)parallel_state)has_config_logger_enabledlog_config_to_disk)	ModelType)BaseInferenceContext)InferenceParams)PackedSeqParams)get_context_parallel_group)#scatter_to_sequence_parallel_region)MegatronModule)TransformerConfig)deprecate_inference_paramsget_batch_on_this_cp_rank)nn)TokenizerSpec)get_packed_seq_params)Gemma3Config)!Gemma3VLMultimodalProjectorConfigGemma3VLVisionConfig)MODEL_CONFIG_ATTR	NevaModelrestore_model_weights)io)OptimizerModule)safe_import_fromz+megatron.core.extensions.transformer_engineTENormTFreturnc                    sz   t | }t|trt|dkr|d }n|}t   d t r' d t r0 d  fdd|	 D }|S )zGemma3 VL model data setp   r   )	input_idsposition_idstokens)pixel_values)labels	loss_maskc                    s2   i | ]\}}|| v r|d ur|j ddnd qS )NT)non_blocking)cuda).0keyvalrequired_keys e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/gemma3vl/model/base.py
<dictcomp>M   s     z&gemma3vl_data_step.<locals>.<dictcomp>)
next
isinstancetuplelensetupdatepsis_pipeline_first_stageis_pipeline_last_stageitems)dataloader_iterbatch_batchr2   r0   r3   gemma3vl_data_step6   s    



rB   c                 C   s   | ddu r| ddu rtd| d| d| d| dd| dd| ddd	}d
|v r:t||d< | di |S )zGemma3 VL model forward stepr%   Nr'   z6Neither input_ids nor tokens is presented in the batchr&   r(   r*   r)   )r%   r&   r(   r*   r)   
cu_seqlenspacked_seq_paramsr2   )get
ValueErrorr   )modelr@   forward_argsr2   r2   r3   gemma3vl_forward_stepT   s   


rI   c                 C   s   dS )z@Placeholder for `set_input_tensor` method for PP implementation.Nr2   )selftensorr2   r2   r3   set_input_tensorf   s   rL   c                   @   s   e Zd ZU dZdZee ed< dZee	 ed< dZ
ee ed< dZeed< dZeed	< dZeed
< dZeed< dZee ed< dZee ed< dZee ed< dZeed< dZeed< dZeed< eZeed< eZeed< dd Zddee ddfddZdS )Gemma3VLConfigzGemma3 VL model base configNlanguage_transformer_configvision_transformer_configvision_projection_configr   $encoder_pipeline_model_parallel_size   "encoder_tensor_model_parallel_size
num_layers   num_attention_headslanguage_model_from_pretrainedvision_model_from_pretrained!vision_projection_from_pretrainedFfreeze_language_modelfreeze_vision_modelfreeze_vision_projectionforward_step_fndata_step_fnc                 C   s0   | j d urtD ]}t| |t| j | qd S d S N)rN   r   setattrgetattr)rJ   attrr2   r2   r3   __post_init__   s
   
zGemma3VLConfig.__post_init__vp_stager#   MCoreGemma3VLModelc                 C   s  d| j _d| j _| j| j _| j| j _| j| j _| j| j _| j| j_| j| j_| j	dkrR| j	dks4J d| j	| j_| j	| j_| j	| j _	| j
dkrR| j
| j_| j
| j_t| dddu r^|du sbJ dt| |t pot | j	kt t t pt | j	kd	}|S )
zConfigure Gemma3 VL modelTFr   rR   z&ViT can only live on 1 pipeline stage.$virtual_pipeline_model_parallel_sizeNzGVirtual pipeline model parallel size is not yet supported for Gemma3VL.)config	tokenizerpre_processpost_processadd_encoderadd_decoder)rN   is_vision_language#scatter_embedding_sequence_paralleltensor_model_parallel_sizesequence_parallelpipeline_model_parallel_sizecontext_parallel_sizerO   rP   rQ   rS   ra   re   r;   r<    get_pipeline_model_parallel_rankr=   )rJ   rh   rd   rG   r2   r2   r3   configure_model   s>   











zGemma3VLConfig.configure_modelr_   ) __name__
__module____qualname____doc__rN   r   r   __annotations__rO   r   rP   r   rQ   intrS   rT   rV   rW   strrX   rY   rZ   boolr[   r\   rI   r]   r   rB   r^   rc   rt   r2   r2   r2   r3   rM   l   s&   
 rM   c                       sn  e Zd ZdZ					d-dedee dededed	ed
df fddZdd Z	d.ddZ
dededefddZ						d/dddejdejdeej deej deej dee dee dee dee d
ejfd d!Z			"d0dejd#eej d$eej d%efd&d'Z				d1d(eej deej deej dee fd)d*Zdejd
eejejf fd+d,Z  ZS )2re   zGemma3 VL model base classNTrg   rh   ri   rj   rk   rl   r#   c           
         s  t  j|d t|rt|t t| jd |j}|j}|j	}	|| _
|| _|| _|| _d | _d | _d | _d | _|j| _|j| _|j| _|j| _|jdk| _| jdkret | _tj| j| jksdJ dnd | _|j| _ t!j"| _#|j$| _$|j%| _%d| _&| jr|' | _|	' | _t(| j|j) | jr|j'|||d| _| jj&| _&t(| j|j* n|j*d urt+j,t-i d|j*dd | j.|j/|j0|j1d	 d S )
N)rg   )prefixrR   z5CP Group size should match the Language Model CP sizeF)rh   ri   rj   )
state_dict)sharded_state_dictcheckpoint_dirvalidate_access_integrity)rZ   r[   r\   )2super__init__r
   r   localstyperu   rN   rO   rP   ri   rj   rk   rl   encoder_hidden_statevision_modelvision_projectionlanguage_modelrp   is_sequence_paralleltp_comm_overlaprr   ro   tensor_parallel_sizerq   is_pipeline_parallelr   cp_grouptorchdistributedget_world_size
seq_lengthmax_seq_lenr   encoder_or_decoder
model_typeimage_token_id
vocab_size#share_embeddings_and_output_weightsrt   r   rX   rW   r   loaddictfreezerZ   r[   r\   )
rJ   rg   rh   ri   rj   rk   rl   rN   rO   rP   	__class__r2   r3   r      sj   






zMCoreGemma3VLModel.__init__c                 C   s   | j r| j S dS )zThis is a convenience method to surface the language model's word embeddings, which is
        necessary for `finalize_model_grads._allreduce_word_embedding_grads`.N)rl   r   !shared_embedding_or_output_weight)rJ   r2   r2   r3   r     s   
z4MCoreGemma3VLModel.shared_embedding_or_output_weightc                 C   s   t |ts|g}t|dksJ | jr | jr | j|d  dS | jr-| j|d  dS | jr7|d | _dS | j	|d  dS )zSet model chunk input tensor.rR   r   N)
r6   listr8   rk   rl   r   rL   ri   r   r   )rJ   input_tensorr2   r2   r3   rL     s   
z#MCoreGemma3VLModel.set_input_tensorrZ   r[   r\   c                 C   st   g }|r| j dur|| j  |r| jdur|| j |r)| jdur)|| j |D ]}| D ]}d|_q1q+dS )ac  Freeze model modules.

        Make specific modules non-trainable by setting requires_grad to False.

        Args:
            freeze_language_model (bool): Freeze the language model module.
            freeze_vision_model (bool): Freeze the vision model module.
            freeze_vision_projection (bool): Freeze the vision projection module.
        NF)r   appendr   r   
parametersrequires_grad)rJ   rZ   r[   r\   modulesmoduleparamr2   r2   r3   r     s   
zMCoreGemma3VLModel.freeze)inference_paramsr%   r&   r(   r)   r*   inference_contextruntime_gather_outputrD   r   c	             
   C   s  t ||	}|duod|jv }
|jd }|du}|
rd}nH| jra|s$d}n@|d|jd |jd |jd  }| |}| |}||d|jd |jd }|dur`|jd |jd  |jd	< n| j	}| j
sk||fS |jd }|| jkr| jr|ddd| jf }|ddd| jf }| jr|ddd| jf }|ddd| jf }| jr|| jk r|du r| j| }| jrt|d|f}t|d|f}| jrt|d|f}t|d|f}| jr|}| j| jkr|| jk}| }d||< | jj||d
}|dd }|jd |jd ksJ nd}| j||||
d\}}| jdks,| jr9| ||||\}}}}n|durF|dd }| jdd||||||d}|du s]|du r_|S ||fS )zForward of the Gemma3VL modelNimage_tokens_countr      r$      rR   media_tokens_count)r%   r&   )r%   image_embeddinglanguage_embeddinguse_inference_kv_cache)r%   r&   decoder_inputattention_maskr)   r   r   rD   )r   key_value_memory_dictshaperk   reshape
contiguousr   r   viewr   rl   r   ri   rj   r   Fpadr   r   cloner   	embedding	transpose_preprocess_datarr   r   _process_sequence_parallel)rJ   r%   r&   r(   r)   r*   r   r   rD   r   r   
batch_size
has_imagesr   language_seq_lenpadded_seq_lensafe_input_idsimage_token_maskr   combined_embeddingr   outputr2   r2   r3   forward3  s   









zMCoreGemma3VLModel.forwardFr   r   r   c                 C   st   | j sdS | |}|r||fS |}|d ur6|| jk}|d|}||jd d|jd }|||}||fS )N)NNr   r   )ri   _compute_attention_maskr   	unsqueeze	expand_asr   r   masked_scatter)rJ   r%   r   r   r   r   final_embedding
image_maskr2   r2   r3   r     s   

z#MCoreGemma3VLModel._preprocess_datar   c                 C   s  | j s| js||||fS | j rU| jdkr | jr | j| j d }n| jdkr+| jd }n| j}|jd | dks9J | jrU|jd | j dksHJ | jrU|jd | jksUJ | jdkrt }| j rd||d< | jro||d< ||d< |d u sx|j	dkr}t
|}n'| j }| j }| D ]\}	}
t|j|
d||}|
d|||	< q| j r|d }| jr|d }|d }|d ur|dd }| jr| j rt|}||||fS )NrR   r   r   r   r)   r*   sbhd)ri   rj   rr   r   r   r   r   r   r   
qkv_formatr   r   sizerankr>   texthd_get_partitioned_indicescu_seqlens_q_paddedindex_selectr   r   r   )rJ   r   r)   r*   rD   shard_factorr@   cp_sizecp_rankr.   dataindexr2   r2   r3   r     sN   




z-MCoreGemma3VLModel._process_sequence_parallelc                 C   s   | j sd S |j\}}tt|d||f|j}|| jk}tj	|ddd}|d d dd f |d d d df k}tj
|dd}|| }	|	}
t|
d d d d d f |	dk|	ddk}t||d }|S )NrR   )rR   r   r   )valuer   )dim)ri   r   r   trilonestodevicer   r   r   cumsumlogical_andr   
logical_or)rJ   r%   r   seq_lencausal_maskr   padded_maskboundarynumbered_boundaryq_block_indiceskv_block_indicesbidirectional_maskr   r2   r2   r3   r     s    
 
(z*MCoreGemma3VLModel._compute_attention_mask)NTTTT)r#   N)NNNNNN)NNF)NNNN)ru   rv   rw   rx   rM   r   r   r|   r   r   rL   r   r   Tensor
LongTensorr   r   r   r   r   r   r   r   __classcell__r2   r2   r   r3   re      s    N
	
 
(
7re   c                       s   e Zd ZdZ			ddedee ded deeej	gej	f  f fdd	Z
					dd
ejdeej deej deej deej dee dejfddZ  ZS )Gemma3VLModelz$Lightning wrapper for Gemma3VL modelNrg   optimrh   r   model_transformc                    s   t  j||||d d S )N)rg   r   rh   r   )r   r   )rJ   rg   r   rh   r   r   r2   r3   r   6  s   
zGemma3VLModel.__init__r%   r&   r(   r*   r)   r   r#   c                 C   s   | j ||||||d}|S )N)r%   r&   r(   r*   r)   r   )r   )rJ   r%   r&   r(   r*   r)   r   output_tensorr2   r2   r3   r   E  s   
	zGemma3VLModel.forward)NNN)NNNNN)ru   rv   rw   rx   rM   r   r    r   r   Moduler   r   r   r   r   r   r   r2   r2   r   r3   r   3  sD    r   )r   rM   rB   rI   )Kdataclassesr   typingr   r   r   r   r   r   torch.distributedtorch.nn.functionalr   
functionalr   megatron.corer   r	   r;   megatron.core.config_loggerr
   r   megatron.core.enumsr    megatron.core.inference.contextsr   megatron.core.inference_paramsr   megatron.core.packed_seq_paramsr   megatron.core.parallel_stater   megatron.core.tensor_parallelr   megatron.core.transformerr   ,megatron.core.transformer.transformer_configr   megatron.core.utilsr   r   1nemo.collections.common.tokenizers.tokenizer_specr   #nemo.collections.llm.gpt.model.baser   %nemo.collections.llm.gpt.model.gemma3r   *nemo.collections.vlm.gemma3vl.model.visionr   r   $nemo.collections.vlm.neva.model.baser   r   r   nemo.lightningr   nemo.lightning.pytorch.optimr    nemo.utils.import_utilsr!   r"   _HAVE_TEXtransformer_engine_torchr   ImportErrorr{   r   rB   rI   rL   IOMixinrM   re   r   __all__r2   r2   r2   r3   <module>   sX   G   (