o
    }oi,                     @   s  d dl mZ d dlmZmZmZmZ d dlmZ	 d dl
Z
d dlZ
d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dl
mZ d dlmZ d dl m!Z! d dl"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ g dZ0d%ddZ1dee2e
j3f fddZ4de
j3fddZ5eG dd dee&j6Z7G d d! d!eZ8G d"d# d#e	j9e&j6e&j:e!j;Z<g d$Z=dS )&    )	dataclass)CallableDictListOptionalN)InferenceParamsdist_checkpointingparallel_state)tensor_parallel)	ModelType)
LLaVAModel)OptimizerConfig)PackedSeqParams)TransformerConfig)nn)TokenizerSpec)fn)IGNORE_INDEXIMAGE_TOKEN_INDEX)io)ckpt_to_weights_subdir)$MaskedTokenLossReductionWithLossMask)MegatronOptimizerModuleOptimizerModule)logging)
num_layershidden_sizenum_attention_headsnum_query_groupsffn_hidden_sizekv_channelshidden_dropoutattention_dropoutfp32_residual_connection(apply_residual_connection_post_layernorminit_method_stdlayernorm_epsilonlayernorm_zero_centered_gammaadd_bias_linearadd_qkv_biasgated_linear_unitactivation_funcactivation_func_fp8_input_storenum_moe_expertsrotary_interleavedwindow_sizenormalizationqk_layernormposition_embedding_typerotary_base	test_modecalculate_per_token_loss
seq_length#share_embeddings_and_output_weightsmoe_token_dispatcher_typemoe_router_load_balancing_typeFc                 C   sr   |dur7t | jddd}tjd|t|dddd|sdd	ini }d
d |d  D }| j||d dS dS )z
    Restores model weights from a checkpoint.

    Args:
        model: The model to restore weights for.
        checkpoint_path: Path to the checkpoint.
        strict: Whether to restore weights even if they are not the same.
    Nmodule.)prefix
state_dictF)	is_savingsharded_state_dictcheckpoint_dirvalidate_access_integritystrictlog_allc                 S   s   i | ]
\}}| d |qS )r;   )removeprefix).0kv rJ   X/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/neva/model/base.py
<dictcomp>]   s    z)restore_model_weights.<locals>.<dictcomp>r>   )rD   rJ   )dictrA   r   loadr   itemsload_state_dict)modelcheckpoint_pathrD   rA   loaded_state_dictrJ   rJ   rK   restore_model_weightsL   s   	
rT   returnc                    s  ddl m} t| }t|trt|dkr|d }n|}t   d | r- d |	 r6 d |
dd} fd	d
| D }|durddD ]}t||d}|durct|||jdd qM||d< t dkrd}d|v r|d dur|d  }||d< |S )zNeva Data Stepr   r	      )tokensattention_maskmedianum_media_tiles)position_ids)labels	loss_maskpacked_seq_paramsNc                    s@   i | ]\}}|| v r|d urt |tr|n|jddnd qS )NTnon_blocking)
isinstancelistcuda)rG   keyvalrequired_keysrJ   rK   rL      s    z"neva_data_step.<locals>.<dictcomp>)cu_seqlens_qcu_seqlens_kvcu_seqlens_q_paddedcu_seqlens_kv_paddedTr_      r]   num_valid_tokens_in_ub)megatron.corer
   nextra   tuplelensetupdateis_pipeline_first_stageis_pipeline_last_stagegetrO   getattrsetattrrc   psget_context_parallel_world_sizesum)dataloader_iterr
   batch_batchr^   attrvaluerm   rJ   rf   rK   neva_data_stepa   s@   


r   c                 C   sb   |d |d |d | dd| dd| dd| dd| d	d| d
dd	}| di |S )zNeva Forward SteprY   rW   r[   rX   Nr]   r\   rZ   image_token_maskr^   )	images	input_idsr[   rX   r]   r\   num_image_tilesr   r^   rJ   )rv   )rQ   r}   forward_argsrJ   rJ   rK   neva_forward_step   s   





r   c                   @   s"  e Zd ZU dZdZee ed< dZee ed< dZ	ee ed< dZ
eed< dZeed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZee ed< dZeed< dZeed< dZeed< eZeed< eZeed< dd Zd#dee dd fd!d"Z dS )$
NevaConfigzNeva Model Base ConfigNlanguage_transformer_configvision_transformer_configvision_projection_configTdrop_vision_class_tokenvision_feature_layerr   $encoder_pipeline_model_parallel_sizerl   "encoder_tensor_model_parallel_sizer      r   i   r7   language_model_from_pretrainedvision_model_from_pretrained!vision_projection_from_pretrainedFfreeze_language_modelfreeze_vision_modelfreeze_vision_projectionforward_step_fndata_step_fnc                 C   s0   | j d urtD ]}t| |t| j | qd S d S N)r   MODEL_CONFIG_ATTRrx   rw   )selfr   rJ   rJ   rK   __post_init__   s
   
zNevaConfig.__post_init__vp_stagerU   MCoreNevaModelc              
   C   s   d| j _| j| j _| j| j _| j| j_| j| j_| j| j _| j| j _	 | jdkrO| jdks1J d| j| j_| j| j_| j| j _| j	dkrO| j	| j_| j	| j_t
| |tjd|dtjd|dtjd|dtjd|dpqt | jk| j|d}|S )NFzPNEVA `encoder_pipeline_model_parallel_size` has bug for now. Fix will come soon.r   rl   z&ViT can only live on 1 pipeline stage.)ignore_virtualr   )config	tokenizerpre_processpost_processadd_encoderadd_decoderr   r   )r   #scatter_embedding_sequence_paralleltensor_model_parallel_sizesequence_parallelr   r   pipeline_model_parallel_sizecontext_parallel_sizer   r   r   ry   rt   ru    get_pipeline_model_parallel_rankr   )r   r   r   rQ   rJ   rJ   rK   configure_model   s:   











zNevaConfig.configure_modelr   )!__name__
__module____qualname____doc__r   r   r   __annotations__r   r   r   boolr   intr   r   r   r   r7   r   strr   r   r   r   r   r   r   r   r   r   r   r   rJ   rJ   rJ   rK   r      s,   
 r   c                       s  e Zd ZdZ							d%dededededed	ed
edee ddf fddZdddddde	dddf
de
jde
jdee
j dee
j dee
j dee
j dee deee  dee dee dee
j dee de
jfddZd&ddZdd  Zd!d" Zd#d$ Z  ZS )'r   zNeva Model Base Model ClassNTr   r   r   r   r   r   r   r   rU   c	                    s  t t| j|d |j}	|j}
|j}|| _|| _|| _|| _	|| _
|| _d | _d | _d | _d | _|	j| _|	j| _|	j| _|	j| _d| _| j	rq|	j||||d| _| jj| _| jj| _|	jdk| _t| j|j t !d|j  n|jd urt"j#t$i d|jdd | jr|
 | _| | _|| _%t| j|j& t !d|j&  | j'|j(|j)|j*d	 t+j,| _-t.|
d
| _/|
j0| _1|r|
j2r|  j1|
j38  _1d S d S d S )Nr   F)r   r   r   r   rl   z%Restored language model weights from r=   r@   z#Restored vision model weights from )r   r   r   
image_size)4superMCoreLLaVAModel__init__r   r   r   r   r   r   r   r   r   encoder_hidden_statevision_modelvision_projectionlanguage_modelr   sequence_parallel_lmtp_comm_overlaptp_comm_overlap_lmr   context_parallel_lmr   tensor_model_parallel_size_lmr8   r   max_sequence_length_language_max_sequence_lengthr   _language_is_pipeline_parallelrT   r   r   infor   rN   rM   _drop_vision_class_tokenr   freezer   r   r   r   encoder_or_decoder
model_typehasattrvision_model_from_hfnum_image_embeddings_per_tile_img_seq_lenadd_class_tokenclass_token_len)r   r   r   r   r   r   r   r   r   r   r   r   	__class__rJ   rK   r      sf   





zMCoreNevaModel.__init__r   r[   r]   rX   r   r\   inference_paramsr   image_token_indexruntime_gather_outputr   r^   c                 C   s  |duod|j v }|duot|dk}|rd}n| jr4|s4t| j }tjg |j|j	d
ddd}no| jr|r|t| j j}| jr]| j | _| j|dd}|d | jj }n| j|| jj d d	}| jrt| jd
d}|dd|dddf }|ddd }| |}|dur|jd |jd  |j d< n| j}| js|S d}| jr| }d||dk < | jj||d}|dd }|du rtj|jd tj|j	d}nt|t rtj|tj|j	d}| !|||||||	|||
\}}}}| j"dks| j#r)| j"dkr|dur|j$dkr|dd }| %||||\}}}}| jdd|||||
|d}|du s@|du rB|S || fS )a  Forward function of the LLaVA model.

        Args:
            images (torch.Tensor): input image of shape [num_tiles, img_h, img_w]. num_tiles means the number of
            image tiles in this batch.
            input_ids (torch.Tensor): input text ids [batch, text_seq_len].
            position_ids (torch.Tensor): input text position ids [batch, text_seq_len].
            attention_mask (torch.Tensor): Attention mask for the language model [batch, 1, combined_seq_len,
            combined_seq_len].
            labels (torch.Tensor): Optional target text labels [batch, combined_seq_len].
            loss_mask (torch.Tensor): Text loss mask [batch, text_seq_len].
            inference_params (InferenceParams): Inference-time parameters including KV cache.
            num_image_tiles (list of int): Number of tiles per image. Default 1 tile per image.
            image_token_index (int): ID for input images. Default None means `image_token_index`
                arg in the constructor will be used.
            runtime_gather_output (bool): Gather output at runtime. Default None means
                `parallel_output` arg in the constructor will be used.
            image_token_mask (torch.Tensor): Tensor indicating the location of
                image token index in input_ids.
            packed_seq_params (PackedSeqParams): Dict with padded token information.
                Required for using SP/CP with padding mask type.

        Returns:
            output (torch.Tensor): Loss of shape [b, s] if labels are provided,
                otherwise logits of shape [b, s, vocab_size].
            loss_mask (torch.Tensor): Loss mask expanded to combined sequence length. Shape [b, s].
        Nimage_tokens_countr   dtypedeviceT)output_hidden_statesrl   )num_unused_layersr      )r   r[   thd)r   r[   rX   decoder_inputr\   r   r   r^   )&key_value_memory_dictrq   r   ro   r   
parameterstorchtensorr   r   reshapetor   evalr   r   r   rw   permute
contiguousr   shaper   r   r   cloner   	embedding	transposeonesr   ra   rb   _preprocess_datar   r   
qkv_format!_process_embedding_token_parallel)r   r   r[   r]   rX   r   r\   r   r   r   r   r   r^   use_inference_kv_cache
has_imagesimage_embeddingsvision_paramr   language_embeddingsinput_ids_textcombined_embeddingsfinal_labelsfinal_loss_maskfinal_attention_maskoutputrJ   rJ   rK   forwardC  s   -
 




zMCoreNevaModel.forwardc                 C   s   t |ts|g}t|dksJ d| jr"| jr"| j|d  dS | jr/| j|d  dS | jr9|d | _dS | j	|d  dS )zSet model chunk input tensor.rl   z.input_tensor should only be length 1 for llavar   N)
ra   rb   rq   r   r   r   set_input_tensorr   r   r   )r   input_tensorrJ   rJ   rK   r     s   
zMCoreNevaModel.set_input_tensorc           8      C   s  | j sJ d| js| js||||	fS |r||||	fS | j}|j\}}|du}|r;|j|jks;J d|j d|j |
duoC|
jdk}t # ||k}tj|dd}|j	|
 dd}tjd	d
 |D |jd}|| | | }| }| |
\}}|d | | | }| jr|| jk r| j}| jr| jr|| jk r| j}nt }t||d  | | }|}|r|
jd |kr|
jd |
jd  }||
jd  }||ksJ d||
jd< ||
jd< t||
j|
_t||
j|
_t||k\}}|  }|| d ||< tj|d ddd }|||f }|r5|d } | dk}!| |! } ||! }"|d }#|#dk}$|#|$ }#tj||fdtj|jd}%d|%||f< |dddf d }&d|%tj ||&jd!|d|&"dk< W d   n	1 spw   Y  d}'| jr|jd }(tj#|||(|j$|jd}'|||f |'||f< |%ddd&d|(' |'|%< d\})}*|rtj||ft(|j$|jd})tj||fd|j$|jd}*||"|#f |)|"| f< |||f |*||f< |d }+t t)|+},||,df |)|,|+f< d|*|%< t|\}-}.|.d }/|/dk}0|-|0 }1|/|0 }2||1|2f }2d|*|1|2f< |'dur9|r9|'jdd |)j  kr4|*jks9J d J d|oC|)jd | jk}3|3r]|)ddd| jf })|*ddd| jf }*|'durz|'*dd' }'|'jd | jkrz|'d| j }'|r|
jd | jkr|
jd | j }4|
jd |
jd  }5|
jd |
jd  }6|5|6 }7|4|78 }4| j|
jd< | j|
jd< |4dkr|
jd  |48  < |
j+d  |48  < |
jd |
jd ksJ d|'|)|*|	fS )a  Preprocess input data before input to language model.

        This function is adopted from
        https://github.com/huggingface/transformers/blob/85817d98fb60977c97e3014196a462b732d2ed1a/src/transformers/models/llava_next/modeling_llava_next.py#L409
        for our input data conventions.

        image_token_index = -200 indicates the image position in the input_ids = [0, 1, -200, 2, 3]
        and labels = [1, -200, 2, 3, 4], for example.
        We want to replace the image position (-200) with image_embeddings and return the following:
        - final_embeddings = [0, 1, image_embeddings, 2, 3],
        - final_labels = [1, -100, 2, 3, 4]
        - final_loss_mask = [1, 0, 0, 1, 1]

        This function handles samples without images (text-only sample). It also handles samples
        with images that are split into multiples tiles.

        If pipeline parallelism is not used, then self.pre_process and self.post_process
        are both True and we update both input embeddings, labels and loss masks (if available).

        If pipeline parallelism is used, then we do the following
        - the first language model chunk has self.pre_process = True and
          self.post_process = False. We update input embeddings.
        - the middle language model chunk(s) has self.pre_process = False and
          self.post_process = False. We don't need to update anything.
        - the last language model chunk has self.pre_process = False and
          self.post_process = True. We update labels and loss mask.

        TODO: This function should adjust the attention mask too.
        Currently, we assume the language model uses a causal mask.

        Returns:
            final_embedding (torch.Tensor): image and text embeddings [combined_seq_len, b, h].
            final_labels (torch.Tensor): labels for image and text positions [b, combined_seq_len].
            final_loss_mask (torch.Tensor): loss mask [b, combined_seq_len].
        z>input text preprocessing is only needed for the language modelNzmismatching labels shape z and loss mask shape r   r   )dimr   c                 S   s   g | ]}|  qS rJ   )r{   )rG   xrJ   rJ   rK   
<listcomp>=  s    z3MCoreNevaModel._preprocess_data.<locals>.<listcomp>)r   rl   r   zW`language_max_sequence_length` needs to increase for sequence packing to work properly.Tr   Fr   )NNz*unexpected shapes after data preprocessingzLwith packed sequence, the truncation can only truncate on the last sequence.),r   r   r   r   r   r   r   no_gradr{   splittolistr   r   max_get_shard_factorr   r   r   r   ry   $get_tensor_model_parallel_world_sizer   rh   rj   rk   max_seqlen_qmax_seqlen_kvwherer   cumsumfullr   arangerepeat	unsqueezezerosr   r   r   r   r   rq   r   ri   )8r   r   r   r   r]   r\   r   r   r   rX   r^   img_seq_len
batch_sizetext_seq_len
has_labelspacked_sequencer   num_images_per_samplenum_image_tiles_batchseq_lensmax_seq_lenshard_factor_padded_seq_lentp_world_sizelast_seqlenlast_seqlen_paddedbatch_indicesnon_image_indicesimage_token_mask_lensnew_position_idstext_position_idslabel_text_position_idsvalid_label_text_position_idslabel_batch_indiceslabel_non_image_indicesvalid_label_non_image_indicesimages_maskfirst_padding_idxfinal_embedding	embed_dimr   r   label_extra_text_position_idsbatch_rangebatch_image_indicesimage_indicesbefore_image_indicesvalidvalid_batch_image_indicesvalid_before_image_indicestruncate_labelstruncate_lenfinal_seq_len_paddedfinal_seq_len_unpaddedfinal_paddingrJ   rJ   rK   r     s   0




R
	&

zMCoreNevaModel._preprocess_datac                 C   s   |dur|j dkrd}nd}| jdkr"| jr"| j| j d }||fS | jdkr0| jd }||fS | jr:| j}||fS d}d}||fS )z&Get shard factor of sequence dimensionNr   rl   r   r   )r   r   r   r   )r   r^   seq_dimr  rJ   rJ   rK   r    s   	

z MCoreNevaModel._get_shard_factorc              
   C   s  | j s| js||||fS | j r?| |\}}|j| | dks-J d| d|j d| | jr?| jr?|j| | jks?J d| jdkrt }| j rQ|	d|i | jr\|	||d |d	u se|j
d
kr}ddlm} | j rx|d dd|d< ||}n?zdd	l}	W n ty }
 ztd |
d	}
~
ww t }t }| D ]\}}|	|j|d||}|d|||< q| j r|d }|dd }| jr|d }|d }| jr| j r|d	ur|j
dkr|dd }t|}||||fS )z7Processes the input data for model parallelism support.r   z'Sequence length should be divisible by z2 for                 Sequence/Context parallelism z
 with dim zhTP Comm overlap either requires Vision+Text token length                 == language_max_sequence_lengthrl   r   )
new_labelsnew_loss_maskNsbhd)get_batch_on_this_cp_rankztPlease update Transformer Engine to >= 1.10 to use                             Context Parallel with THD format datar=  r>  r   )r   r   r  r   r   r   r   r   rM   rs   r   megatron.core.utilsr@  r   transformer_engine_torchModuleNotFoundErrorr   errorry   rz   get_context_parallel_rankrO   thd_get_partitioned_indicesrj   sizeindex_selectr   r   #scatter_to_sequence_parallel_region)r   r   r=  r>  r^   r  r<  r}   r@  texecp_sizecp_rankrd   dataindexrJ   rJ   rK   r     s|   

z0MCoreNevaModel._process_embedding_token_parallel)NTTTTTN)rU   N)r   r   r   r   r   r   r   r   r   r   r   Tensorr   r   r   r   r   r   r  r   __classcell__rJ   rJ   r   rK   r      s    	
R
	

 
 wr   c                       sl  e Zd ZdZ			d(dedee ded deeej	gej	f  f fdd	Z
d)d
ee ddfddZddddddedddf
dejdejdeej deej deej deej dee deee  dee dee deej dee dejfddZdeeejf fddZdejfddZd)dejfd d!Zd)dejfd"d#Zedefd$d%Zedefd&d'Z  ZS )*	NevaModelz Lightning Wrapper for Neva ModelNr   optimr   r   model_transformc                    sP   t    || _|| _|pttdddd| _| j|  || _d | _	d | _
d S )Ng-C6?T)lruse_distributed_optimizerr   )r   r   r   r   r   r   rS  connectrT  _training_loss_reduction_validation_loss_reduction)r   r   rS  r   rT  r   rJ   rK   r   H  s   

zNevaModel.__init__r   rU   c                 C   s&   t | ds| jj| j|d| _d S d S )Nmodule)r   )r   r   r   r   rZ  )r   r   rJ   rJ   rK   r   X  s   
zNevaModel.configure_modelr   r[   r]   rX   r   r\   r   r   r   r   r   r^   c                 C   s&   | j |||||||||	|
||d}|S )N)r   r   r[   r]   rX   r\   r   r   r   r   r   r^   )rZ  )r   r   r[   r]   rX   r   r\   r   r   r   r   r   r^   output_tensorrJ   rJ   rK   r   ]  s   zNevaModel.forwardc                 C   s   | j |S r   )r   r   )r   r|   rJ   rJ   rK   	data_step~  s   zNevaModel.data_stepc                 C   s   | j | |S r   )r   r   )r   r}   rJ   rJ   rK   forward_step  s   zNevaModel.forward_stepc                 C   
   |  |S r   r]  r   r}   	batch_idxrJ   rJ   rK   training_step  s   
zNevaModel.training_stepc                 C   r^  r   r_  r`  rJ   rJ   rK   validation_step  s   
zNevaModel.validation_stepc                 C   s   | j st | _ | j S r   )rX  r   r   rJ   rJ   rK   training_loss_reduction  s   z!NevaModel.training_loss_reductionc                 C   s   | j s	tdd| _ | j S )NT)rc  )rY  r   rd  rJ   rJ   rK   validation_loss_reduction  s   z#NevaModel.validation_loss_reduction)NNNr   ) r   r   r   r   r   r   r   r   r   Moduler   r   r   r   r   rP  r   r   r   r   r   r   r   r\  r]  rb  rc  propertyr   re  rf  rQ  rJ   rJ   r   rK   rR  E  sx    	
	

!rR  )rR  r   r   r   )F)>dataclassesr   typingr   r   r   r   lightning.pytorchpytorchLr   torch.distributedrn   r   r   r
   ry   r   megatron.core.enumsr   +megatron.core.models.multimodal.llava_modelr   r   megatron.core.optimizerr   megatron.core.packed_seq_paramsr   ,megatron.core.transformer.transformer_configr   r   1nemo.collections.common.tokenizers.tokenizer_specr   nemo.collections.llmr   0nemo.collections.vlm.neva.data.multimodal_tokensr   r   nemo.lightningr   nemo.lightning.io.plr    nemo.lightning.megatron_parallelr   nemo.lightning.pytorch.optimr   r   
nemo.utilsr   r   rT   r   rP  r   r   IOMixinr   r   LightningModuleConnectorMixinFNMixinrR  __all__rJ   rJ   rJ   rK   <module>   sF   
#8G    W]