o
    }oim?                     @   s   d dl mZ d dlmZmZmZ d dlZd dlZd dlm	Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZ d dlmZ deeejf fddZdejfddZeG dd deZG dd deZ G dd deZ!dS )    )	dataclass)CallableDictOptionalNparallel_state)InferenceParams)PackedSeqParams)TransformerConfig)nn)TokenizerSpec)MODEL_CONFIG_ATTRMCoreNevaModel
NevaConfig	NevaModel)OptimizerModulereturnc                    s  ddl m} t| }t|trt|dkr|d }n|}t   d | r- d |	 r6 d |
dd} fd	d
| D }|durddD ]}t||d}|durct|||jdd qM||d< t dkrd}d|v r|d dur|d  }||d< |S )zLlama4 Omni Data Stepr   r      )tokensattention_maskmedianum_media_tiles)position_ids)labels	loss_maskpacked_seq_paramsNc                    s2   i | ]\}}|| v r|d ur|j ddnd qS )NTnon_blocking)cuda).0keyvalrequired_keys Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/llama4/model/base.py
<dictcomp>@   s     z$llama4_data_step.<locals>.<dictcomp>)cu_seqlens_qcu_seqlens_kvcu_seqlens_q_paddedcu_seqlens_kv_paddedTr      r   num_valid_tokens_in_ub)megatron.corer   next
isinstancetuplelensetupdateis_pipeline_first_stageis_pipeline_last_stagegetitemsgetattrsetattrr   psget_context_parallel_world_sizesum)dataloader_iterr   batch_batchr   attrvaluer,   r$   r"   r%   llama4_data_step   s@   


rB   c                 C   sb   |d |d |d | dd| dd| dd| dd| d	d| d
dd	}| di |S )zLlama4 Omni Forward Stepr   r   r   r   Nr   r   r   image_token_maskr   )	images	input_idsr   r   r   r   num_image_tilesrC   r   r$   )r6   )modelr>   forward_argsr$   r$   r%   llama4_forward_stepS   s   





rI   c                   @   s2  e Zd ZU dZdZee ed< dZee ed< dZ	ee ed< dZ
eed< dZeed	< d
Zeed< d
Zeed< dZeed< dZeed< dZee ed< dZee ed< dZee ed< dZeed< dZeed< dZeed< dZeed< ejZejed< eZe ed< e!Z"e ed< dd Z#d#dee dd fd!d"Z$dS )$Llama4OmniConfigzLlama4 Model Base ConfigNlanguage_transformer_configvision_transformer_configvision_projection_configTdrop_vision_class_tokenr   $encoder_pipeline_model_parallel_sizer+   "encoder_tensor_model_parallel_size
num_layers   num_attention_headsi    
seq_lengthlanguage_model_from_pretrainedvision_model_from_pretrained!vision_projection_from_pretrainedFfreeze_language_modelfreeze_vision_modelfreeze_vision_projectionbf16params_dtypeforward_step_fndata_step_fnc                 C   sD   | j d ur tD ]}t| |t| j | qt| j dt| d d S d S )Nuse_te_rng_tracker)rK   r   r9   r8   )selfr@   r$   r$   r%   __post_init__   s
   
zLlama4OmniConfig.__post_init__vp_stager   r   c              
   C   s   d| j _| j| j _| j| j _| j| j_| j| j_| j| j _| j| j _| j| j _| j	| j _	| j
dkrX| j
dks:J d| j
| j_| j
| j_| j
| j _
| jdkrX| j| j_| j| j_| j| j _| j| j _|ped}t| |tjd|dtjd|dtjd|dtjd|dpt | j
k| j|d}|S )NFr   r+   z&ViT can only live on 1 pipeline stage.ignore_virtualrb   )config	tokenizerpre_processpost_processadd_encoderadd_decoderrN   rb   )rK   #scatter_embedding_sequence_paralleltensor_model_parallel_sizesequence_parallelrL   rM   pipeline_model_parallel_sizecontext_parallel_sizeexpert_tensor_parallel_sizeexpert_model_parallel_sizerO   rP    moe_pad_expert_input_to_capacitymoe_expert_capacity_factorLlama4OmniBaseModelr:   r4   r5    get_pipeline_model_parallel_rankrN   )r`   rf   rb   rG   r$   r$   r%   configure_model   sB   















z Llama4OmniConfig.configure_model)N)%__name__
__module____qualname____doc__rK   r   r
   __annotations__rL   rM   rN   boolrO   intrP   rQ   rS   rT   rU   strrV   rW   rX   rY   rZ   r[   torchbfloat16r\   dtyperI   r]   r   rB   r^   ra   rv   r$   r$   r$   r%   rJ   d   s.   
 rJ   c                   @   s|   e Zd ZdZ							ddejdejdeej deej deej deej d	ee d
ee dee	 dejfddZ
dS )rt   zHllama4 base model combining vision and text models with cross-attention.NrE   r   r   r   rD   r   inference_paramsruntime_gather_outputr   r   c
              
   K   sj  |duod|j v }|duot|dk}|rd}n@| jr4|s4t| j }tjg |j|j	d
ddd}n$| jrU|rU| |}| |}|durT|jd |jd  |j d< n| j}| js]|S d}| jr| }d||dk < | jj||d}|dd }|r|j}|d|d}|| jdkd}||j	}|d|d}|d	 
d}| }||dkrtd
| d|d d|dd|d}|||}||}n|}|	dur|	jdks|dd }n|}|||}}}| j dks| j!r| "||||	\}}}}	| jdd||||||	d}t#j$d| j%ds(|S |du r/|S || fS )a)  Forward function of the Llama4 model.

        Args:
            input_ids (torch.Tensor): Input text token IDs of shape [batch, text_seq_len].
            position_ids (torch.Tensor): Positional IDs for the input text tokens of shape [batch, text_seq_len].
            loss_mask (Optional[torch.Tensor]): Mask indicating which tokens should contribute to the loss,
                of shape [batch, text_seq_len].
            attention_mask (Optional[torch.Tensor]): Attention mask for the model of shape
                [batch, 1, combined_seq_len, combined_seq_len].
            images (Optional[torch.Tensor]): Input images represented as a list of image tile tensors
                per sample. Each tile tensor is of shape [C, H, W].
            labels (Optional[torch.Tensor]): Target labels for language modeling, of shape [batch, combined_seq_len].
            inference_params (Optional[InferenceParams]): Parameters for inference, such as KV cache.
            runtime_gather_output (Optional[bool]): Whether to gather outputs during runtime. If None, falls back to
                the `parallel_output` setting from the constructor.
            packed_seq_params (Optional[PackedSeqParams]): Parameters for handling packed sequences, including
                padding information (used for SP/CP).

        Returns:
            output (torch.Tensor): Loss of shape [b, s] if labels are provided,
                otherwise logits of shape [b, s, vocab_size].
            loss_mask (torch.Tensor): Loss mask expanded to combined sequence length. Shape [b, s].
        Nimage_tokens_countr   )r   devicer+   )rE   r   z	<|patch|>).r   zMismatch: final_mask wants z& embeddings, but image_embeddings has z embeddings.thd)rE   r   r   decoder_inputr   r   r   r   Frc   )&key_value_memory_dictr1   ri   r.   vision_model
parametersr   tensorr   r   reshapevision_projectionshapeencoder_hidden_staterj   rg   clonelanguage_model	embedding	transpose
contiguousviewsizerf   token_to_id	unsqueezetor<   
ValueErrorexpandmasked_scatter
qkv_formatcontext_parallel_lmsequence_parallel_lm!_process_embedding_token_parallelr:   r5   rb   )r`   rE   r   r   r   rD   r   r   r   r   kwargsuse_inference_kv_cache
has_imagesimage_embeddingsvision_paramlanguage_embeddingsinput_ids_textoriginal_inputs_embeds_shapeimage_embeddings_flattenedspecial_image_mask
final_maskcombined_embeddingsfinal_mask_1dnum_tokens_to_fillexpanded_maskfinal_labelsfinal_loss_maskfinal_attention_maskoutputr$   r$   r%   forward   s   '
 





zLlama4OmniBaseModel.forward)NNNNNNN)rw   rx   ry   rz   r   Tensorr   r   r|   r	   r   r$   r$   r$   r%   rt      s<    	
rt   c                       sT   e Zd ZdZ			d
dedee ded deeej	gej	f  f fdd	Z
  ZS )Llama4OmniModelz&Lightning Module for the Llama4 model.Nre   optimrf   r   model_transformc                    s   t  j||||d d S )N)re   r   rf   r   )super__init__)r`   re   r   rf   r   	__class__r$   r%   r   U  s   
zLlama4OmniModel.__init__)NNN)rw   rx   ry   rz   rJ   r   r   r   r   Moduler   __classcell__r$   r$   r   r%   r   R  s    r   )"dataclassesr   typingr   r   r   r   torch.distributedr-   r   r:   megatron.core.inference_paramsr   megatron.core.packed_seq_paramsr	   ,megatron.core.transformer.transformer_configr
   r   1nemo.collections.common.tokenizers.tokenizer_specr   $nemo.collections.vlm.neva.model.baser   r   r   r   nemo.lightning.pytorch.optimr   r~   r   rB   rI   rJ   rt   r   r$   r$   r$   r%   <module>   s&   4R 