o
    }oiD                     @   s   d dl Z d dlmZmZ d dlmZmZmZmZ d dl	Z	d dl
Z	d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d	eee	jf fd
dZd	e	jfddZd dlmZmZ eG dd deZG dd deZdgZdS )    N)	dataclassfield)CallableDictListOptionalparallel_state)InferenceParams)PackedSeqParams)pack_image_features)IMAGE_TOKEN_INDEXreturnc                    s  ddl m} t| }t|trt|dkr|d }n|}t   d | r- d |	 r6 d |
dd} fd	d
| D }|durddD ]}t||d}|durct|||jdd qM||d< t dkrd}d|v r|d dur|d  }||d< |S )a  
    Processes a batch of data from the dataloader for the LLaVA Next model.

    Args:
        dataloader_iter (Iterator): An iterator that provides batches of data from the dataloader.

    Returns:
        Dict[str, torch.Tensor]: A dictionary containing the processed batch, ready for input into the model.

    Notes:
        - Filters and moves required keys to the appropriate device.
        - Slices the batch along the sequence dimension for context parallelism.
    r   r      )tokensattention_maskmedianum_media_tilesimage_sizes)position_idsr   )labels	loss_maskr   packed_seq_paramsNc                    s2   i | ]\}}|| v r|d ur|j ddnd qS )NTnon_blocking)cuda).0keyvalrequired_keys ^/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/llava_next/model/base.py
<dictcomp>G   s     z(llava_next_data_step.<locals>.<dictcomp>)cu_seqlens_qcu_seqlens_kvcu_seqlens_q_paddedcu_seqlens_kv_paddedTr      r   num_valid_tokens_in_ub)megatron.corer	   next
isinstancetuplelensetupdateis_pipeline_first_stageis_pipeline_last_stagegetitemsgetattrsetattrr   psget_context_parallel_world_sizesum)dataloader_iterr	   batch_batchr   attrvaluer)   r!   r   r"   llava_next_data_step   s<   
	


r?   c                 C   sb   |d |d |d | dd| dd| dd| dd| d	d| d
dd	}| di |S )a  
    Performs the forward step for the LLaVA Next model.

    Args:
        model (torch.nn.Module): The LLaVA Next model instance.
        batch (Dict[str, torch.Tensor]): A dictionary containing input tensors for the forward step.

    Returns:
        torch.Tensor: The output from the model's forward computation.

    Notes:
        - Constructs the forward arguments based on the provided batch.
        - Includes optional parameters like packed sequence parameters if available.
    r   r   r   r   Nr   r   r   r   r   )	r   	input_idsr   r   r   r   r   r   r   r!   )r3   )modelr;   forward_argsr!   r!   r"   llava_next_forward_step[   s   





rC   )MCoreNevaModel
NevaConfigc                   @   sB   e Zd ZU dZeedZeed< ee	dZ
eed< d
ddd	ZdS )LlavaNextConfigzx
    Configuration class for the LLaVA Next model.
    Overrides NevaConfig and modifies forward and data step fn.

    )defaultforward_step_fndata_step_fnNr   MCoreLlavaNextModelc              
   C   s  d| j _| j| j _| j| j _| j| j_| j| j_| j| j _| j| j _| jdkrN| jdks0J d| j| j_| j| j_| j| j _| j	dkrN| j	| j_| j	| j_|pQd}t
| |tjd|dpbt | jktjd|dtjd|dtjd|dp{t | jk| j|d}|S )z
        Configures the LLaVA Next model with the appropriate settings.

        Args:
            tokenizer: Tokenizer instance to be used with the model.

        Returns:
            MCoreLlavaNextModel: An instance of the LLaVA Next model.
        Fr   r(   z&ViT can only live on 1 pipeline stage.)ignore_virtualvp_stageconfig	tokenizerpre_processpost_processadd_encoderadd_decoderdrop_vision_class_tokenrL   )language_transformer_config#scatter_embedding_sequence_paralleltensor_model_parallel_sizesequence_parallelvision_transformer_configvision_projection_configpipeline_model_parallel_sizecontext_parallel_size$encoder_pipeline_model_parallel_size"encoder_tensor_model_parallel_sizerJ   r7   r1    get_pipeline_model_parallel_rankr2   rT   )selfrO   rL   rA   r!   r!   r"   configure_model   s<   











zLlavaNextConfig.configure_model)N)r   rJ   )__name__
__module____qualname____doc__r   rC   rH   r   __annotations__r?   rI   ra   r!   r!   r!   r"   rF   |   s
   
 rF   c                       s   e Zd ZdZ							ddedededed	ed
edee ddf fddZdddddde	ddf	de
jde
jde
jdee
j dee
j dee
j dee
j dee deee  dee dee dee de
jfddZ  ZS )rJ   z
    The LLaVA Next model class, extending MCoreNevaModel.

    Attributes:
        image_newline (torch.nn.Parameter): A learnable parameter for handling image newlines.
    NTFrN   rP   rQ   rR   rS   rT   rL   r   c	           
   
      sN   t  j||||||||d dt|jj }	tjt	|jj|	 | _
dS )a  
        Initializes the LLaVA Next model.
        Calls the super class init and initialize image_newline parameter

        Args:
            config (LlavaNextConfig): Model configuration instance.
            tokenizer: Optional tokenizer instance.
            pre_process (bool): Whether to enable preprocessing.
            post_process (bool): Whether to enable postprocessing.
            add_encoder (bool): Whether to add the encoder module.
            add_decoder (bool): Whether to add the decoder module.
            drop_vision_class_token (bool): Whether to drop the vision class token.
            vp_stage (Optional[int]): Virtual pipeline stage.
        rM   r(   N)super__init__mathsqrtrZ   hidden_sizetorchnn	Parameterrandnimage_newline)
r`   rN   rO   rP   rQ   rR   rS   rT   rL   	embed_std	__class__r!   r"   rh      s    zMCoreLlavaNextModel.__init__r@   r   r   r   r   r   r   inference_paramsr   media_token_indexruntime_gather_outputr   c              
   C   s  |duod|j v }|jd dk}|rd}n| jr*|s*tjg |j|jdddd}nj| jr|r| jrH| j	
 | _	| j	|dd}|d | jj }n|t| j	 j}| j	|| jj d d	}| jrvt| j	d
d}|dd|dddf }| }| |}|dur|jd |jd  |j d< n| j}| js|S d}| jr| }d||dk < | jj||d}|dd }|	du rtj|jd tj|jd}	nt|	trtj|	tj|jd}	tj||	  dd}t!||d| j"d\}}||
k# $ }|jd }||kr
t%d| d| ||
k&d}|'||j}||j|j}|(||}|}|}|)ddd}| }| j*dksA| j+rY| j*dkrM|dd}| ,||||\}}}}| jddd|||||d}|du sp|du rr|S || fS )at  Forward function of the LLaVA Next model.

        Args:
            images (torch.Tensor): input image of shape [num_tiles, img_h, img_w].
                                    num_tiles means the number of image tiles in this batch.
            input_ids (torch.Tensor): input text ids [batch, text_seq_len].
            position_ids (torch.Tensor): input text position ids [batch, text_seq_len].
            image_sizes (torch.Tensor): Raw image sizes  before tiling (N,2).
            attention_mask (torch.Tensor): Attention mask for the language model [batch, text seq length].
            labels (torch.Tensor): Optional target text labels [batch, combined_seq_len].
            loss_mask (torch.Tensor): Text loss mask [batch, text_seq_len].
            inference_params (InferenceParams): Inference-time parameters including KV cache.
            num_media_tiles (list of int): Number of tiles per image. Default None assumes 1 tile per image.
            image_token_index (int): ID for input images.
            packed_seq_params (PackedSeqParams): Dict with padded token information.
                Required for using SP/CP with padding mask type.
        Returns:
            output (torch.Tensor): Loss ([b, s]) if labels are provided; logits ([b, s, vocab_size]) otherwise.
            loss_mask (torch.Tensor): Loss mask expanded to combined sequence length. Shape [b, s].
        Nimage_tokens_countr   )dtypedeviceT)output_hidden_statesr(   )num_unused_layersclass_token_lenmedia_tokens_count)r@   r   )dimrG   )vision_feature_select_strategyrp   z6Image features and image tokens do not match: tokens: z, features    )r@   r   r   decoder_inputr   rt   rv   r   )-key_value_memory_dictshaperR   rl   tensorrx   ry   reshapevision_model_from_hfvision_modelevalrN   vision_feature_layertor+   
parameters_drop_vision_class_tokenr5   
contiguousvision_projectionencoder_hidden_staterS   rP   clonelanguage_model	embedding	transposeonesintr,   listsplittolistr   rp   r9   item
ValueError	unsqueeze	expand_asmasked_scatterpermutecontext_parallel_lmsequence_parallel_lm!_process_embedding_token_parallel)r`   r@   r   r   r   r   r   r   rt   r   ru   rv   r   use_inference_kv_cache
has_imagesmedia_embeddingsr}   language_embeddingsinput_ids_textfeature_lensn_image_tokensn_image_featuresspecial_image_maskcombined_embeddingsfinal_labelsfinal_loss_maskoutputr!   r!   r"   forward   s   $
 






zMCoreLlavaNextModel.forward)NTTTTFN)rb   rc   rd   re   rF   boolr   r   rh   r   rl   Tensorr
   r   r   r   __classcell__r!   r!   rr   r"   rJ      s|    
	
,	

rJ   ) ri   dataclassesr   r   typingr   r   r   r   rl   torch.distributedr*   r	   r7   megatron.core.inference_paramsr
   megatron.core.packed_seq_paramsr   +nemo.collections.vlm.llava_next.model.utilsr   0nemo.collections.vlm.neva.data.multimodal_tokensr   strr   r?   rC   $nemo.collections.vlm.neva.model.baserD   rE   rF   rJ   __all__r!   r!   r!   r"   <module>   s&   >; I