import copy
import math
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Tuple

import lightning.pytorch as L
import torch
import torch.distributed
from einops import rearrange
from megatron.core.enums import ModelType
from megatron.core.inference_params import InferenceParams
from megatron.core.models.vision.multimodal_projector import MultimodalProjector
from megatron.core.optimizer import OptimizerConfig
from megatron.core.tensor_parallel.layers import ColumnParallelLinear
from megatron.core.transformer import MegatronModule
from megatron.core.transformer.mlp import MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import get_batch_on_this_cp_rank
from PIL import Image as PIL_Image
from torch import nn

from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
from nemo.collections.llm import fn
from nemo.collections.llm.gpt.model import local_layer_spec, transformer_engine_layer_spec
from nemo.collections.llm.gpt.model.base import get_packed_seq_params
from nemo.collections.llm.gpt.model.llama import Llama31Config, apply_rope_scaling
from nemo.collections.vlm.mllama.model.language import CrossAttentionTextModel
from nemo.collections.vlm.mllama.model.utils import _generate_cross_attention_mask, _pad_attention_masks
from nemo.collections.vlm.mllama.model.vision import VisionEncoder
from nemo.collections.vlm.neva.model.base import MODEL_CONFIG_ATTR
from nemo.lightning import get_vocab_size, io
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction
from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule
from nemo.utils import logging


def mllama_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
    """Mllama data step."""
    from megatron.core import parallel_state

    batch = next(dataloader_iter)

    _batch: dict
    if isinstance(batch, tuple) and len(batch) == 3:
        _batch = batch[0]
    else:
        _batch = batch

    # Only move to GPU the tensors this pipeline stage actually consumes.
    required_keys = set()
    required_keys.update(
        (
            "attention_mask",
            "tokens",
            "batch_masks",
            "position_ids",
            "num_chunks",
        )
    )
    if parallel_state.is_pipeline_first_stage():
        required_keys.update(("batch_images", "aspect_ratio_ids"))
    if parallel_state.is_pipeline_last_stage():
        required_keys.update(("labels", "loss_mask"))

    _batch = {
        key: val.cuda(non_blocking=True) if key in required_keys and isinstance(val, torch.Tensor) else val
        for key, val in _batch.items()
    }
    # Slice the batch along the sequence dimension for context parallelism.
    output = get_batch_on_this_cp_rank(_batch)

    return output

def mllama_forward_step(model, batch) -> torch.Tensor:
    """Mllama model forward step."""
    forward_config = {
        "batch_images": batch["batch_images"],
        "batch_masks": batch["batch_masks"],
        "tokens": batch["tokens"],
        "position_ids": batch["position_ids"],
        "aspect_ratio_ids": batch["aspect_ratio_ids"],
        "num_chunks": batch["num_chunks"],
        "labels": batch.get("labels", None),
    }

    if "cu_seqlens" in batch:
        forward_config["packed_seq_params"] = get_packed_seq_params(batch)

    return model(**forward_config)
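# Illustrative reference for the two step functions above (an assumption based on
# the keys this module consumes -- the authoritative batch contract lives in the
# data module; `train_dataloader` and `mllama_model` below are hypothetical):
#
#   tokens            (batch, seq_len)                              int64
#   position_ids      (batch, seq_len)                              int64
#   batch_images      (batch, max_num_images, max_chunks, C, H, W)  float
#   aspect_ratio_ids  (batch, max_num_images)                       int64
#   num_chunks        (batch, max_num_images)                       int64
#   labels/loss_mask  (batch, seq_len)                              last PP stage only
#
#   batch = mllama_data_step(iter(train_dataloader))
#   output = mllama_forward_step(mllama_model, batch)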

def set_input_tensor(self, tensor):
    """Placeholder for `set_input_tensor` method for PP implementation."""
    pass


@dataclass
class CrossAttentionVisionConfig(TransformerConfig, io.IOMixin):
    """Configuration for llama vision model."""

    # core params
    bias_activation_fusion: bool = True
    bias_dropout_add_fusion: bool = True

    # vision model params
    num_layers: int = 32
    hidden_size: int = 1280
    num_attention_heads: int = 16
    vision_chunk_size: int = -1  # image resolution for image models
    vision_max_num_chunks: int = 4
    num_global_layers: int = 8
    max_num_tiles: int = 4
    text_hidden_size: int = 4096
    hidden_dropout: float = 0.0
    attention_dropout: float = 0.0
    ffn_dropout: float = 0.0
    gated: bool = False
    supported_aspect_ratios: Tuple[Tuple[int, int], ...] = (
        (1, 1),
        (1, 2),
        (1, 3),
        (1, 4),
        (2, 1),
        (2, 2),
        (3, 1),
        (4, 1),
    )

    @property
    def max_aspect_ratio_id(self) -> int:
        """Largest valid aspect-ratio id; ids index into `supported_aspect_ratios`."""
        return len(self.supported_aspect_ratios)

    def configure_model(self) -> "CrossAttentionVisionModel":
        """Configure mllama vision model."""
        return CrossAttentionVisionModel(self)
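# Example instantiation (a sketch, not a preset shipped by this module; the
# 560-pixel chunk size matches the published Llama 3.2 vision checkpoints, but
# treat the concrete numbers here as assumptions):
#
#   vision_config = CrossAttentionVisionConfig(
#       vision_chunk_size=560,
#       vision_max_num_chunks=4,
#   )
#   vision_model = vision_config.configure_model()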

@dataclass
class CrossAttentionTextConfig(Llama31Config):
    """
    Configuration for llama model with cross-attention layers to take in multimodal features.
    """

    rotary_base: int = 500_000
    seq_length: int = 8192
    num_layers: int = 32
    hidden_size: int = 4096
    ffn_hidden_size: int = 14336
    num_attention_heads: int = 32
    num_cross_attention_layers: int = 8
    vocab_size: int = 128256
    apply_rope_fusion: bool = False
    def _init_fusion_schedule(self, num_layers: int) -> List[int]:
        """Initialize self-attention layer / cross-attention layer fusion schedule"""
        mllama_layers = list(range(self.num_layers))
        # Uniformly spread the cross-attention layers through the self-attention
        # stack; e.g. 32 layers with num_layers=8 yields [3, 7, 11, 15, 19, 23, 27, 31].
        k = math.ceil(len(mllama_layers) / num_layers)
        return mllama_layers[::-1][::k][:num_layers][::-1]

    def configure_model(self, tokenizer, pre_process=True, post_process=True, vp_stage: Optional[int] = None):
        """Configure mllama text model."""
        self.fusion_schedule = self._init_fusion_schedule(self.num_cross_attention_layers)

        vp_size = self.virtual_pipeline_model_parallel_size
        if vp_size:
            p_size = self.pipeline_model_parallel_size
            assert (
                self.num_layers // p_size
            ) % vp_size == 0, "Make sure the number of model chunks is the same across all pipeline stages."

        transformer_layer_spec = self.transformer_layer_spec
        if not isinstance(transformer_layer_spec, ModuleSpec):
            transformer_layer_spec = transformer_layer_spec(self)

        if hasattr(self, "vocab_size"):
            vocab_size = self.vocab_size
            logging.info(
                f"Use preset vocab_size: {vocab_size}, original vocab_size: {tokenizer.vocab_size}, dummy tokens:"
                f" {vocab_size - tokenizer.vocab_size}."
            )
        else:
            vocab_size = get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by)

        # Pass 0 during fake initialization so downstream vp_stage assertions hold.
        vp_stage = vp_stage or 0

        model = CrossAttentionTextModel(
            self,
            transformer_layer_spec=transformer_layer_spec,
            vocab_size=vocab_size,
            max_sequence_length=self.seq_length,
            fp16_lm_cross_entropy=self.fp16_lm_cross_entropy,
            parallel_output=self.parallel_output,
            share_embeddings_and_output_weights=self.share_embeddings_and_output_weights,
            position_embedding_type=self.position_embedding_type,
            rotary_percent=self.rotary_percent,
            rotary_base=self.rotary_base,
            seq_len_interpolation_factor=self.seq_len_interpolation_factor,
            pre_process=pre_process,
            post_process=post_process,
            vp_stage=vp_stage,
        )
        # Llama 3.1-style RoPE frequency rescaling for long-context support.
        model.rotary_pos_emb.inv_freq = apply_rope_scaling(
            model.rotary_pos_emb.inv_freq,
            factor=self.scale_factor,
            low_freq_factor=self.low_freq_factor,
            high_freq_factor=self.high_freq_factor,
            old_context_len=self.old_context_len,
        )
        return model


@dataclass
class MLlamaModelConfig(TransformerConfig, io.IOMixin):
    """Combined configuration for multimodal vision-language model."""

    language_model_config: Optional[CrossAttentionTextConfig] = None
    vision_model_config: Optional[CrossAttentionVisionConfig] = None

    encoder_pipeline_model_parallel_size: int = 0
    encoder_tensor_model_parallel_size: int = 1
    vision_num_cross_attention_layers: int = -1
    num_layers: int = 1  # placeholder, not used
    num_attention_heads: int = 8  # placeholder, not used

    language_model_from_pretrained: Optional[str] = None
    vision_model_from_pretrained: Optional[str] = None

    forward_step_fn: Callable = mllama_forward_step
    data_step_fn: Callable = mllama_data_step

    def __post_init__(self):
        # Mirror the language model's core attributes onto this combined config.
        if self.language_model_config is not None:
            for attr in MODEL_CONFIG_ATTR:
                setattr(self, attr, getattr(self.language_model_config, attr))

    def configure_model(self, tokenizer, vp_stage: Optional[int] = None) -> "MLlamaBaseModel":
        """Configure mllama model."""
        from megatron.core import parallel_state as ps

        self.language_model_config.tensor_model_parallel_size = self.tensor_model_parallel_size
        self.vision_model_config.tensor_model_parallel_size = self.tensor_model_parallel_size
        self.language_model_config.pipeline_model_parallel_size = self.pipeline_model_parallel_size

        if self.encoder_pipeline_model_parallel_size > 0:
            assert self.encoder_pipeline_model_parallel_size == 1, "ViT can only live on 1 pipeline stage."
            self.vision_model_config.pipeline_model_parallel_size = self.encoder_pipeline_model_parallel_size
            self.language_model_config.encoder_pipeline_model_parallel_size = (
                self.encoder_pipeline_model_parallel_size
            )
            if self.encoder_tensor_model_parallel_size > 0:
                self.vision_model_config.tensor_model_parallel_size = self.encoder_tensor_model_parallel_size

        vp_stage = vp_stage or 0

        model = MLlamaBaseModel(
            config=self,
            tokenizer=tokenizer,
            pre_process=ps.is_pipeline_first_stage(ignore_virtual=False, vp_stage=vp_stage)
            or ps.get_pipeline_model_parallel_rank() == self.encoder_pipeline_model_parallel_size,
            post_process=ps.is_pipeline_last_stage(ignore_virtual=False, vp_stage=vp_stage),
            add_encoder=ps.is_pipeline_first_stage(ignore_virtual=False, vp_stage=vp_stage),
            add_decoder=ps.is_pipeline_last_stage(ignore_virtual=False, vp_stage=vp_stage)
            or ps.get_pipeline_model_parallel_rank() >= self.encoder_pipeline_model_parallel_size,
            vp_stage=vp_stage,
        )

        return model


class CrossAttentionVisionModel(MegatronModule):
    """Mllama vision model."""

    def __init__(self, config) -> None:
        super().__init__(config=config)
        return_intermediate = "3,7,15,23,30"  # intermediate encoder layers whose outputs are concatenated
        self.vision_input_dim = 1280
        self.image_res = config.vision_chunk_size
        self.max_num_chunks = config.vision_max_num_chunks
        if return_intermediate is not None:
            return_intermediate = [int(l_no) for l_no in return_intermediate.split(",")]
            self.vision_input_dim = (len(return_intermediate) + 1) * self.vision_input_dim
        self.patch_size = 14
        self.vision_encoder = VisionEncoder(
            config,
            config.vision_chunk_size,
            self.patch_size,
            return_intermediate=return_intermediate,
        ).to(config.params_dtype)

        projection_config = copy.deepcopy(config)
        projection_config.hidden_size = config.text_hidden_size
        affine_layer_spec = MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=None)
        self.vision_projection = MultimodalProjector(
            config=projection_config,
            submodules=affine_layer_spec,
            projector_type="affine",
            input_size=self.vision_input_dim,
        )
        self.vision_projection.encoder.skip_bias_add = False

    def forward(self, images: torch.Tensor, aspect_ratio_ids: torch.Tensor) -> torch.Tensor:
        """Forward."""
        vision_tokens = self.vision_encoder(images.to(dtype=torch.bfloat16), aspect_ratio_ids)
        vision_shape = vision_tokens.shape
        # Flatten the (image, chunk) axes for the projector, then restore them.
        vision_tokens = self.vision_projection(vision_tokens.reshape(-1, *vision_shape[-2:]))
        vision_tokens = vision_tokens.reshape(*vision_shape[:-1], -1)
        return vision_tokens

    def set_input_tensor(self, tensor):
        """No-op; the vision encoder consumes its inputs directly."""
        pass
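# Rough shape walkthrough for CrossAttentionVisionModel.forward, assumed from the
# reshapes above (the exact token count depends on the encoder's patch grid; with
# 560-pixel chunks and 14-pixel patches it would be (560/14)^2 + 1 = 1601):
#
#   images         (bsz, nimg, nchk, C, H, W)
#   vision_tokens  (bsz, nimg, nchk, ntok, vision_input_dim)   encoder output
#   projected      (bsz, nimg, nchk, ntok, text_hidden_size)   after the affine projector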

class MLlamaBaseModel(MegatronModule):
    """Mllama base model combining vision and text models with cross-attention."""

    def __init__(
        self,
        config: MLlamaModelConfig,
        tokenizer: Optional[TokenizerSpec] = None,
        pre_process: bool = True,
        post_process: bool = True,
        add_encoder: bool = True,
        add_decoder: bool = True,
        vp_stage: Optional[int] = None,
    ) -> None:
        super().__init__(config=config)

        language_model_config = config.language_model_config
        vision_model_config = config.vision_model_config
        self.pre_process = pre_process
        self.post_process = post_process

        self.encoder_hidden_state = None
        self.vision_model: Optional[CrossAttentionVisionModel] = None
        self.language_model: Optional[CrossAttentionTextModel] = None

        self.share_embeddings_and_output_weights = False
        self.add_decoder = (language_model_config is not None) and add_decoder
        self.add_encoder = (vision_model_config is not None) and add_encoder
        self.vp_stage = vp_stage

        if self.add_decoder:
            self.language_model = language_model_config.configure_model(
                tokenizer=tokenizer, pre_process=pre_process, post_process=post_process, vp_stage=vp_stage
            )
            self.share_embeddings_and_output_weights = self.language_model.share_embeddings_and_output_weights

        if self.add_encoder:
            self.vision_model = vision_model_config.configure_model()

        self.model_type = ModelType.encoder_or_decoder
        self.xattn_needed = True

        self.patch_size = 14
        self.image_res = vision_model_config.vision_chunk_size
        self.max_num_chunks = vision_model_config.vision_max_num_chunks
    def compute_xattn_caches_masks(
        self,
        vision_tokens: torch.Tensor,
        vision_orig_shape: Tuple[int, int, int, int, int],
        batch_masks: torch.Tensor,
        num_chunks: torch.Tensor,
        total_len: int,
    ) -> Tuple[List, torch.Tensor, torch.Tensor]:
        """Compute xattn caches masks used in text model."""
        bsz, nimg, nchunk, ntok, image_token_dim = vision_orig_shape

        xattn_caches = [
            layer.compute_xattn_kv_cache(vision_tokens)
            for layer in self.language_model.decoder.xattn_layers
        ]

        padded_masks = _pad_attention_masks(
            batch_masks,
            num_chunks,
            total_len,
            self.max_num_chunks,
            vision_tokens.device,
        )
        vision_tokens = rearrange(
            vision_tokens, "(nimg nchk ntok) b dim -> b nimg nchk ntok dim", nimg=nimg, nchk=nchunk, ntok=ntok
        )
        cross_attention_masks, full_text_row_masked_out_mask = _generate_cross_attention_mask(
            text_token_count=total_len,
            text_device=vision_tokens.device,
            text_dtype=next(self.language_model.parameters()).dtype,
            vision_tokens=vision_tokens,
            cross_attention_masks=padded_masks,
        )

        return xattn_caches, cross_attention_masks, full_text_row_masked_out_mask

    def forward(
        self,
        position_ids: torch.Tensor,
        tokens: torch.Tensor,
        labels: Optional[torch.Tensor] = None,
        batch_images: Optional[torch.Tensor] = None,
        batch_masks: Optional[torch.Tensor] = None,
        num_chunks: Optional[torch.Tensor] = None,
        aspect_ratio_ids: Optional[torch.Tensor] = None,
        cross_attention_masks: Optional[torch.Tensor] = None,
        full_text_row_masked_out_mask: Optional[torch.Tensor] = None,
        xattn_caches: Optional[List] = None,
        inference_params: Optional[InferenceParams] = None,
    ) -> torch.Tensor:
        """Forward."""
        if xattn_caches is None:
            bsz, max_num_images = batch_images.size(0), batch_images.size(1)
            vision_orig_shape = (
                bsz,
                max_num_images,
                self.max_num_chunks,
                int((self.image_res / self.patch_size) ** 2 + 1),
                self.config.hidden_size,
            )

            skip_vision_encoder = False
            if max_num_images == 0:
                skip_vision_encoder = True
            else:
                num_chunks[num_chunks > 0] = self.max_num_chunks

            if self.encoder_hidden_state is not None:
                vision_tokens = self.encoder_hidden_state
            else:
                if skip_vision_encoder:
                    vision_tokens = torch.zeros(vision_orig_shape, dtype=torch.bfloat16)
                else:
                    vision_tokens = self.vision_model(batch_images, aspect_ratio_ids)
                vision_tokens = rearrange(
                    vision_tokens, "b nimg nchk ntok dim -> (nimg nchk ntok) b dim"
                ).contiguous()

            if not self.add_decoder:
                return vision_tokens

            xattn_caches, cross_attention_masks, full_text_row_masked_out_mask = self.compute_xattn_caches_masks(
                vision_tokens=vision_tokens,
                vision_orig_shape=vision_orig_shape,
                batch_masks=batch_masks,
                num_chunks=num_chunks,
                total_len=position_ids.shape[1],
            )

            xattn_mask_index = position_ids[0]
            if inference_params is not None:
                inference_params.xattn_caches = xattn_caches
                inference_params.cross_attention_masks = cross_attention_masks
                inference_params.full_text_row_masked_out_mask = full_text_row_masked_out_mask
        else:
            # Decoding with a warm cache: only the latest position needs mask rows.
            xattn_mask_index = [position_ids.shape[1] - 1]

        assert self.add_decoder, "Language model required for forward pass."

        language_embeddings = None
        if self.pre_process:
            language_embeddings = self.language_model.get_partially_trainable_embedding(tokens)
            language_embeddings = language_embeddings.transpose(1, 0).contiguous()  # [text_seq_len, b, h_language]

        full_text_row_masked_out_mask = (
            full_text_row_masked_out_mask[:, :, xattn_mask_index].permute(2, 0, 1, 3).squeeze(2)
            if full_text_row_masked_out_mask is not None
            else None
        )
        output = self.language_model(
            input_ids=tokens,
            position_ids=position_ids,
            labels=labels,
            decoder_input=language_embeddings,
            attention_mask=None,
            cross_attention_masks=(
                cross_attention_masks[:, :, xattn_mask_index] if cross_attention_masks is not None else None
            ),
            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
            xattn_caches=xattn_caches,
            inference_params=inference_params,
        )

        return output

    def set_input_tensor(self, input_tensor) -> None:
        """Set model chunk input tensor."""
        if not isinstance(input_tensor, list):
            input_tensor = [input_tensor]

        if self.add_encoder:
            self.vision_model.set_input_tensor(input_tensor[0])
        elif self.add_decoder and self.pre_process:
            self.encoder_hidden_state = input_tensor[0]
        else:
            assert len(input_tensor) == 2, "input_tensor should contain encoder output."
            self.language_model.set_input_tensor(input_tensor[0])
            self.encoder_hidden_state = input_tensor[1]


class MLlamaModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin):
    """Lightning Module for the MLlama model."""

    def __init__(
        self,
        config: MLlamaModelConfig,
        optim: Optional[OptimizerModule] = None,
        tokenizer: Optional["TokenizerSpec"] = None,
        model_transform: Optional[Callable[[nn.Module], nn.Module]] = None,
    ):
        super().__init__()
        self.config = config
        self.tokenizer = tokenizer
        self.optim = optim or MegatronOptimizerModule(config=OptimizerConfig(lr=1e-4, use_distributed_optimizer=True))
        self.optim.connect(self)  # binds the `configure_optimizers` method
        self.model_transform = model_transform
        self._training_loss_reduction = None
        self._validation_loss_reduction = None

    def configure_model(self, vp_stage: Optional[int] = None) -> None:
        """Configure mllama model"""
        if not hasattr(self, "module"):
            self.module = self.config.configure_model(self.tokenizer, vp_stage=vp_stage)

    def forward(
        self,
        batch_images: List[List[PIL_Image.Image]],
        tokens: torch.LongTensor,
        position_ids: torch.LongTensor,
        batch_masks: Optional[torch.Tensor] = None,
        num_chunks: Optional[torch.Tensor] = None,
        aspect_ratio_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        cross_attention_masks: Optional[torch.Tensor] = None,
        full_text_row_masked_out_mask: Optional[torch.Tensor] = None,
        xattn_caches: Optional[List] = None,
    ) -> torch.Tensor:
        """Forward."""
        output_tensor = self.module(
            position_ids=position_ids,
            tokens=tokens,
            batch_images=batch_images,
            batch_masks=batch_masks,
            num_chunks=num_chunks,
            aspect_ratio_ids=aspect_ratio_ids,
            labels=labels,
            cross_attention_masks=cross_attention_masks,
            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
            xattn_caches=xattn_caches,
        )
        return output_tensor

    def data_step(self, dataloader_iter) -> Dict[str, torch.Tensor]:
        """Delegate to the configured data step function."""
        return self.config.data_step_fn(dataloader_iter)

    def forward_step(self, batch) -> torch.Tensor:
        """Delegate to the configured forward step function."""
        return self.config.forward_step_fn(self, batch)

    def training_step(self, batch, batch_idx=None) -> torch.Tensor:
        # In mcore the loss function is part of the forward pass (when labels are provided).
        return self.forward_step(batch)

    def validation_step(self, batch, batch_idx=None) -> torch.Tensor:
        # In mcore the loss function is part of the forward pass (when labels are provided).
        return self.forward_step(batch)

    @property
    def training_loss_reduction(self) -> MaskedTokenLossReduction:
        if not self._training_loss_reduction:
            self._training_loss_reduction = MaskedTokenLossReduction()
        return self._training_loss_reduction

    @property
    def validation_loss_reduction(self) -> MaskedTokenLossReduction:
        if not self._validation_loss_reduction:
            self._validation_loss_reduction = MaskedTokenLossReduction(validation_step=True)
        return self._validation_loss_reduction


__all__ = [
    "MLlamaModel",
    "MLlamaModelConfig",
    "CrossAttentionTextConfig",
    "CrossAttentionVisionConfig",
    "mllama_data_step",
    "mllama_forward_step",
    "transformer_engine_layer_spec",
    "local_layer_spec",
]
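# End-to-end construction sketch (illustrative; `my_tokenizer` is a hypothetical
# stand-in for any TokenizerSpec implementation, and trainer wiring is omitted):
#
#   config = MLlamaModelConfig(
#       language_model_config=CrossAttentionTextConfig(),
#       vision_model_config=CrossAttentionVisionConfig(vision_chunk_size=560),
#   )
#   model = MLlamaModel(config, tokenizer=my_tokenizer)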