o
    }oiE                     @   s  d dl mZ d dlmZ d dlmZmZmZmZm	Z	m
Z
mZ d dlZd dlmZ d dlZd dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d dlm  m   m!  m"  m#Z$ d d
l%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= erd dl>m?Z@ d dlAmBZB d dlCmDZD e=d\ZEZFdddefddZGdddefddZHdd ZIdeeJejf fd d!ZKd"ejLd#eeJejf dejfd$d%ZMeG d&d' d'e,ZNeG d(d) d)e-ZOd*ed+efd,d-ZPG d.d/ d/e/ZQe4ReQd0G d1d2 d2e+ZSe4TeQd0G d3d4 d4e4jUeQd5f ZVd'd/gZWdS )6    )	dataclass)Path)TYPE_CHECKING	AnnotatedCallableDictLiteralOptionalUnionN)parallel_state)AttnMaskType)
ModuleSpec)get_batch_on_this_cp_rank)Tensornn),BERTInBatchExclusiveHardNegativesRankingLossHardNegativeRankingLoss)	GPTConfig)HFLlamaImporterLlama32Config1BLlama32Config3BLlamaConfig
LlamaModel)Config)OptimizerModuleioteardown)TransformFns)dtype_from_hf)logging)safe_import)GPTModel)TokenizerSpecLlamaBidirectionalModeltransformer_engineconfigr   returnc                 C      t | }tj|jjjd< |S Nattn_mask_type)GPTBaselocal_layer_specr   padding
submodulesself_attentionparamsr&   gpt_layer_spec r3   b/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/llama_embedding.py_local_layer_spec6      
r5   c                 C   r(   r)   )r+   transformer_engine_layer_specr   r-   r.   r/   r0   r1   r3   r3   r4   _transformer_engine_layer_spec<   r6   r8   c                 C   s   t rt| S t| S )zzCustomized Layer Spec for NV Embedding Llama Model.
    Bidirectional attention is enabled instead of causal masking.
    )HAVE_TEr8   r5   )r&   r3   r3   r4   get_nv_embedding_layer_specB   s   r:   c                    sz   t | }t|trt|dkr|d }n|}t   d t r, d  d  fdd| D }t	|}|S )z/Setup NVEmbedding Llama Model dataloader batch.   r   attention_mask	input_idsposition_idsc                    s*   i | ]\}}|| v r|j d dndqS )T)non_blockingN)cuda).0keyvalrequired_keysr3   r4   
<dictcomp>]   s   * z*nv_embedding_data_step.<locals>.<dictcomp>)
next
isinstancetuplelensetaddr   is_pipeline_first_stageitemsr   )dataloder_iterbatch_batchoutputr3   rD   r4   nv_embedding_data_stepL   s   



rS   modelrP   c                 C   s,   |d |d |d d}| j di |}|S )a  
    This subsets the batch keys to the ones actually used by forward pass of the model,
    and then calls the model's forward pass. if "cu_seqsens" are defined in the batch,
    then the packed sequence parameters are also passed to the model for forward pass efficiency.
    r=   r<   r>   )r=   r<   r>   Nr3   )encode)rT   rP   forward_argsembr3   r3   r4   nv_embedding_forward_stepd   s   rX   c                          e Zd ZU dZeZeeedgef f e	d< e
Zee	d< eZee	d< dZed e	d< d	Zee	d
< dZee	d< dZee	d< dZee	d< dZed e	d< dZee	d< dZee	d< dd fddZ  ZS )Llama32EmbeddingConfig1BzLlama3.2 Embedding 1B Configr   transformer_layer_specforward_step_fndata_step_fnrightleftr^   truncation_method   num_hard_negatives2   ce_loss_scale        label_smoothingFin_batch_negativesfirstrandomri   negative_sample_strategyTadd_bosadd_eosNr'   MCoreGPTModelc                       t  ||||}d|_|S )z,Configure the NV Embedding Llama3.2 1B ModelFsuperconfigure_modelpost_processself	tokenizerpre_processrt   vp_stagerT   	__class__r3   r4   rs         z(Llama32EmbeddingConfig1B.configure_modelNNNr'   ro   __name__
__module____qualname____doc__r:   r[   r
   r   r   __annotations__rX   r\   rS   r]   ra   r   rc   intre   floatrg   rh   boolrl   rm   rn   rs   __classcell__r3   r3   rz   r4   rZ   s      
 rZ   c                       rY   )Llama32EmbeddingConfig3BzLlama3.2 Embedding 3B Configr   r[   r\   r]   r^   r_   ra   rb   rc   rd   re   rf   rg   Frh   ri   rj   rl   Trm   rn   Nr'   ro   c                    rp   )z,Configure the NV Embedding Llama3.2 3B ModelFrq   ru   rz   r3   r4   rs      r|   z(Llama32EmbeddingConfig3B.configure_modelr}   r~   r   r3   r3   rz   r4   r      r   r   last_hidden_statesr<   c                 C   s>   t | d} | |d   d}|jdd|jddd  S )z4Average the hidden states on the non-masking tokens.zs b h -> b s h).Nrf      )dim)einops	rearrangemasked_fillr   sum)r   r<   last_hiddenr3   r3   r4   _average_pool   s   r   c                       s   e Zd ZdZ				ddeee ee f dee ded dee	e
jge
jf  f fdd	Zed
d Z	ddejdejdejdeej fddZedefddZedefddZ  ZS )LlamaEmbeddingModelzNV Embedding Llama ModelNr&   optimrw   r"   model_transformc                    s   t  j|pt |||d d S )N)r   rw   r   )rr   __init__r   )rv   r&   r   rw   r   rz   r3   r4   r      s   zLlamaEmbeddingModel.__init__c                 C   s   | j j| j j| j j| j jdS )z+Getter for dataset_kwargs from model config)rc   rl   rm   rn   )r&   rc   rl   rm   rn   rv   r3   r3   r4   dataset_kwargs   s
   z"LlamaEmbeddingModel.dataset_kwargsr=   r>   r<   decoder_inputc                 C   s   |j dkr|dddk }n$|j dkr0|jd dkr#|jd dks'J d|}| dk }ntd| j||||d}t||}tj|ddd}|S )	zGenerate the embedding for the inputs.
        It runs the forward and apply average pooling on the last hidden states of the model.
           r   g      ?rb   zAttention mask shape incorrectzAttention_mask shape incorrect)r=   r>   r<   r   )pr   )	ndim	unsqueezeshapesqueeze
ValueErrorforwardr   F	normalize)rv   r=   r>   r<   r   extended_maskrR   
embeddingsr3   r3   r4   rU      s    


$
zLlamaEmbeddingModel.encoder'   c                 C   <   | j s| jjr
t}nt}|d| jj| jj| jjd| _ | j S )NFvalidation_steprc   scalerg   )_training_loss_reductionr&   rh   r   r   rc   re   rg   rv   	loss_funcr3   r3   r4   training_loss_reduction      z+LlamaEmbeddingModel.training_loss_reductionc                 C   r   )NTr   )_validation_loss_reductionr&   rh   r   r   rc   re   rg   r   r3   r3   r4   validation_loss_reduction   r   z-LlamaEmbeddingModel.validation_loss_reduction)NNNNN)r   r   r   r   r   r	   r   r   r   r   r   Moduler   propertyr   torch
LongTensorr   rU   r   r   r   r   r3   r3   rz   r4   r      s>    	

!r   hfc                   @   sB   e Zd ZdZdefddZedefddZde	de	fdd	Z
d
S )LlamaEmbeddingImporterz%HF Importer for Llama Embedding Modelr'   c                 C   s   t | j| jdS )N)rw   )r   r&   rw   r   r3   r3   r4   init  s   zLlamaEmbeddingImporter.initc                 C   s   ddl m} |t| }dd }t|j|j|j|j|j	|j
|j|jd||jt|ddt|tjkt|tjkt|d}|S )	Nr   )r   c                 S   s(   d}| | dkr|d }| | dks|S )N   r   r   r3   )
vocab_sizebaser3   r3   r4   make_vocab_size_divisible_by  s
   zCLlamaEmbeddingImporter.config.<locals>.make_vocab_size_divisible_byTtie_word_embeddingsF)
num_layershidden_sizeffn_hidden_sizenum_attention_headsinit_method_stdlayernorm_epsilonnum_query_groupsrotary_basegated_linear_unitr   #share_embeddings_and_output_weightsfp16bf16params_dtype)transformersr   from_pretrainedstrrZ   num_hidden_layersr   intermediate_sizer   initializer_rangerms_norm_epsnum_key_value_heads
rope_thetar   getattrr   r   float16bfloat16)rv   HFLlamaConfigsourcer   rR   r3   r3   r4   r&     s(   
zLlamaEmbeddingImporter.configoutput_pathc                 C   s   ddl m}m} z|jt| ddd}W n   |jt| ddd}G dd dtj}|||j}Y |  }| 	|}| 
|| | || td| d	 t|| ~~|S )
zApply the conversion from HF to NeMo format.
        Args:
            output_path: Path where the converted model will be saved
        Returns:
            Path: Path to the saved NeMo model
        r   )	AutoModelAutoModelForCausalLMautoT)torch_dtypetrust_remote_codec                       s    e Zd ZdZ fddZ  ZS )z2LlamaEmbeddingImporter.apply.<locals>.ModelWrapperzXWrap the source in a model so that the key mapping is consistent with LlamaModelImporterc                    s   t    || _|| _d S r   )rr   r   rT   r&   )rv   rT   r&   rz   r3   r4   r   D  s   

z;LlamaEmbeddingImporter.apply.<locals>.ModelWrapper.__init__)r   r   r   r   r   r   r3   r3   rz   r4   ModelWrapperA  s    r   z7Converted LlamaEmbedding model to Nemo, model saved to .)r   r   r   r   r   r   r   r&   r   
nemo_setupconvert_state	nemo_saveprintr   )rv   r   r   r   r   r   targettrainerr3   r3   r4   apply2  s   

zLlamaEmbeddingImporter.applyN)r   r   r   r   r   r   r   r   r&   r   r   r3   r3   r3   r4   r     s    r   c                   @   sT   e Zd ZdZejfdddZdedefddZe	d	d
 Z
dd Ze	dddZdS )LlamaEmbeddingExporterzHF Exporter for NV Embedding Llama Model.
    Note that NV Embedding LLama uses customized LlamaBidirectionalConfig config.
    r'   r$   c                 C   s\   ddl m} ddlm} |d |  |j| j|dW  d    S 1 s'w   Y  d S )Nr   )no_init_weightsr#   r   )r   )transformers.modeling_utilsr   1nemo.collections.llm.gpt.model.hf_llama_embeddingr$   register_for_auto_class_from_configr&   )rv   dtyper   r$   r3   r3   r4   r   _  s   
$zLlamaEmbeddingExporter.initr   c                 C   s   |  t| \}}|jjjjj}| |}| ||}|	 }|
| z| jj}|jd u r7|j|_|jj|_|
| W |S  tyM   td Y |S w )NzFailed to save tokenizer)	nemo_loadr   module	embeddingword_embeddingsweightr   r   r   cpusave_pretrainedrw   	pad_token	eos_tokenr&   ra   padding_side	Exceptionr   warning)rv   r   r   _source_dtyper   rw   r3   r3   r4   r   h  s"   



zLlamaEmbeddingExporter.applyc                 C   s^   t jt| dd}ddlm} |d ||j|j|j|j	|j
|j|j|j|j| jj|jdS )z!Get HF NV Embedding Llama Config.zmodel.configsubpathr   )LlamaBidirectionalConfig
AutoConfig)r   r   r   r   max_position_embeddingsr   r   r   r   r   r   )r   load_contextr   r   r   r   r   r   r   r   
seq_lengthr   r   r   r   rw   r   r   )rv   r   r   r3   r3   r4   r&   |  s    
zLlamaEmbeddingExporter.configc                 C   sV   dddddd}t jddtjd	t jd
dtjd	t jddtjd	g}t j||||dS )zConvert NeMo State dict to HF.z layers.*.self_attn.o_proj.weightzlayers.*.mlp.down_proj.weightzlayers.*.input_layernorm.weightz(layers.*.post_attention_layernorm.weightznorm.weight)z2decoder.layers.*.self_attention.linear_proj.weightz&decoder.layers.*.mlp.linear_fc2.weightz<decoder.layers.*.self_attention.linear_qkv.layer_norm_weightz1decoder.layers.*.mlp.linear_fc1.layer_norm_weightzdecoder.final_layernorm.weightz1decoder.layers.*.self_attention.linear_qkv.weight)z layers.*.self_attn.q_proj.weightz layers.*.self_attn.k_proj.weightz layers.*.self_attn.v_proj.weight)
source_key
target_keyfnz&decoder.layers.*.mlp.linear_fc1.weight)zlayers.*.mlp.gate_proj.weightzlayers.*.mlp.up_proj.weightz embedding.word_embeddings.weightzembed_tokens.weight)mapping
transforms)r   state_transformr   	split_qkv	split_fc1prune_paddingapply_transforms)rv   r   r   r  r	  r3   r3   r4   r     s8   	z$LlamaEmbeddingExporter.convert_stater"   c                 C   s   t jt| ddjS )zGet NeMo TokenizerrT   r   )r   r  r   rw   r   r3   r3   r4   rw     s   z LlamaEmbeddingExporter.tokenizerN)r'   r$   )r'   r"   )r   r   r   r   r   r   r   r   r   r   r&   r   rw   r3   r3   r3   r4   r   Y  s    	
&r   r$   )Xdataclassesr   pathlibr   typingr   r   r   r   r   r	   r
   r   lightning.pytorchpytorchLr   torch.nn.functionalr   
functionalr   megatron.corer   megatron.core.transformer.enumsr   $megatron.core.transformer.spec_utilsr   megatron.core.utilsr   r   #nemo.collections.llm.gpt.model.basecollectionsllmgptrT   r   r+   nemo.collections.llm.bert.lossr   r   nemo.collections.llm.gpt.modelr   $nemo.collections.llm.gpt.model.llamar   r   r   r   r   nemo.collections.llm.utilsr   nemo.lightningr   r   r   nemo.lightning.io.stater   nemo.lightning.pytorch.utilsr   
nemo.utilsr   nemo.utils.import_utilsr    "megatron.core.models.gpt.gpt_modelr!   ro   1nemo.collections.common.tokenizers.tokenizer_specr"   r   r$   r   r9   r5   r8   r:   r   rS   LightningModulerX   rZ   r   r   r   model_importerr   model_exporterModelConnectorr   __all__r3   r3   r3   r4   <module>   sZ   $$
$
X
Me