o
    }oic                     @   s\  d dl mZ d dlmZ d dlmZmZmZmZm	Z	m
Z
mZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl$m&Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 erd dl6m Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= d dl!m>Z> dee?ej@f fddZAdejBdee?ej@f dej@fdd ZCeG d!d" d"ZDeG d#d$ d$e#eDZEG d%d& d&eEZFG d'd( d(e ZGe,HeGd)G d*d+ d+e,jId,eGf ZJe,KeGd)G d-d. d.e,jIeGd,f ZLG d/d0 d0e1ZMd1d2 ZNdS )3    )	dataclass)Path)TYPE_CHECKING	AnnotatedCallableDictLiteralOptionalTupleUnionNparallel_state)ColumnParallelLinear)Float16Modulefloat16_to_fp32fp32_to_float16)
ModuleSpec)get_batch_on_this_cp_rank)nn)	GPTConfigGPTModel)HFLlamaImporterLlama32Config1B)LlamaEmbeddingExporter)get_nv_embedding_layer_spec)Config)OptimizerModuleio)TransformFns)DDPMegatronLossReduction)dtype_from_hf)logging)r   "AutoModelForSequenceClassificationAutoTokenizer)TokenizerSpec)LlamaConfigreturnc                    sz   t | }t|trt|dkr|d }n|}t   d t r, d  d  fdd| D }t	|}|S )z Setup Reranker dataloader batch.   r   attention_mask	input_idsposition_idsc                    s*   i | ]\}}|| v r|j d dndqS )T)non_blockingN)cuda).0keyvalrequired_keys [/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/reranker.py
<dictcomp>C   s   * z&reranker_data_step.<locals>.<dictcomp>)
next
isinstancetuplelensetaddr   is_pipeline_first_stageitemsr   )dataloder_iterbatch_batchoutputr5   r3   r6   reranker_data_step2   s   



rD   modelrA   c                 C   s,   |d |d |d d}| j di |}|S )a  
    This subsets the batch keys to the ones actually used by forward pass of the model,
    and then calls the model's forward pass. if "cu_seqsens" are defined in the batch,
    then the packed sequence parameters are also passed to the model for forward pass efficiency.
    r,   r+   r-   )r,   r+   r-   Nr5   )forward)rE   rA   forward_argsscorer5   r5   r6   reranker_forward_stepJ   s   rI   c                   @   s   e Zd ZU dZdZed ed< dZeed< dZ	e
ed< d	Ze
ed
< dZeed< dZed ed< dZeed< dZeed< dZeed  ed< dZe
ed< dS )ReRankerBaseConfigz:
    Base config for Reranker Models Training configs
    right)leftrK   truncation_method   num_hard_negatives2   ce_loss_scale        label_smoothingFin_batch_negativesfirst)randomrU   negative_sample_strategyTadd_bosadd_eosavgclsrZ   lastweighted_avg	pool_typeg      ?temperatureN)__name__
__module____qualname____doc__rM   r   __annotations__rO   intrQ   floatrS   rT   boolrW   rX   rY   r_   r	   r`   r5   r5   r5   r6   rJ   Y   s   
 rJ   c                       sx   e Zd ZU dZeZeeedgef f e	d< e
Zee	d< eZee	d< eZeje	d< eZeje	d< dd fddZ  ZS )Llama32Reranker1BConfigz"Config for Llama32Reranker1B modelr   transformer_layer_specforward_step_fndata_step_fnimporter_clsexporter_clsNr)   MCoreGPTModelc                    s   t  ||||}d|_|S )zConfigure the Reranker ModelF)superconfigure_modelpost_process)self	tokenizerpre_processrr   vp_stagerE   	__class__r5   r6   rq   v   s   z'Llama32Reranker1BConfig.configure_model)NNN)r)   ro   )ra   rb   rc   rd   "bidirectional_attention_layer_specrj   r   r   r   re   rI   rk   rD   rl   r   rm   r   ModelConnectorr   rn   rq   __classcell__r5   r5   rw   r6   ri   l   s   
 ri   c                   @   s   e Zd ZU dZdZeed< dS )Llama32Reranker500MConfigz$Config for Llama32Reranker500M model   
num_layersN)ra   rb   rc   rd   r~   rf   re   r5   r5   r5   r6   r|      s   
 r|   c                       s   e Zd ZdZ				d deee ee f dee ded dee	e
jge
jf  f fdd	Zed
d Zd!dee ddf fddZdd Z	d!dejdejdejdeej f fddZedd Zdd Zedd Zedd Z  ZS )"ReRankerModelzUBase model for Reranking that extends GPTModel with reranking-specific functionality.Nconfigoptimrt   r'   model_transformc                    s   t  j|pt |||d d S )N)r   rt   r   )rp   __init__ri   )rs   r   r   rt   r   rw   r5   r6   r      s   
zReRankerModel.__init__c                 C   s   | j j| j j| j j| j jdS )z,Getter for dataset_kwargs from model config.)rO   rW   rX   rY   )r   rO   rW   rX   rY   rs   r5   r5   r6   dataset_kwargs   s
   zReRankerModel.dataset_kwargsrv   r)   c              	      s`   | j jdv s| j jdu sJ d| j j dt | t| j jd| j | j jdddd| j_dS )	zConfigure the underlying model if not already configured.

        This method ensures the model is instantiated from the configuration.
        r[   NInvalid pool type: z4 should be in [cls, avg, last, weighted_avg] or None   FT)r   init_methodbiasskip_bias_addgather_output)	r   r_   rp   rq   r   hidden_sizer   modulerH   )rs   rv   rw   r5   r6   rq      s   zReRankerModel.configure_modelc           	      C   s  t |d}||d   d}| jj}|dkr)|jdd|jddd  }|S |dkr5|jdd}|S |dkrC|d	d	d
f }|S |dkr}|d	d	df  |jd
 k}|rb|d	d	df }|S |jddd }|jd
 }|tj	||j
d|f }|S td| )a  Pool the hidden states based on the configured pooling strategy.

        Args:
            last_hidden_states: The hidden states from the transformer
            attention_mask: The attention mask for the input

        Returns:
            The pooled embeddings
        zs b h -> b s h).NrR   rZ   r   dimr^   r\   Nr   r]   devicer   )einops	rearrangemasked_fillrh   r   r_   sumshapetorcharanger   
ValueError)	rs   last_hidden_statesr+   last_hiddenr_   embleft_paddingsequence_lengths
batch_sizer5   r5   r6   pool   s,   
zReRankerModel.poolr,   r-   r+   decoder_inputc                    s   |j dkr|dddk }n$|j dkr0|jd dkr#|jd dks'J d|}| dk }ntdt j||||d}| ||}|  rU| j	j	}t
||j}d}	nd	}	| |d
 }
|
| jj }
|	rjt|
}
|
S )a2  Forward pass of the reranker model.

        Args:
            input_ids: Input token IDs
            position_ids: Position IDs for the input
            attention_mask: Attention mask for the input
            decoder_input: Optional decoder input

        Returns:
            The pooled logits
           r   g      ?rN   zAttention mask shape incorrectzAttention_mask shape incorrect)r,   r-   r+   r   TFr   )ndim	unsqueezer   squeezer   rp   rF   r   has_float16_module_wrapperr   r   float16_convertorrH   r   r`   r   )rs   r,   r-   r+   r   extended_maskrC   pooled_hidden_statesfloat16_moduleneed_to_convert_backpooled_logitsrw   r5   r6   rF      s0   

$zReRankerModel.forwardc                 C   sP   t | jdr
| jjS t | jjdr| jjjS t | jjjds"J d| jjjjS )z$Get the score module from the model.rH   zScore module not found)hasattrr   rH   r   r5   r5   r6   rH     s   
zReRankerModel.scorec                 C   s"   t | jtrt | jjtrdS dS )z0Check if the model has a float16 module wrapper.TF)r9   r   r   r   r   r5   r5   r6   r     s   z(ReRankerModel.has_float16_module_wrapperc                 C   $   | j std| jj| jjd| _ | j S )z'Get the training loss reduction module.Fvalidation_steprO   rS   )_training_loss_reductionReRankerLossr   rO   rS   r   r5   r5   r6   training_loss_reduction     z%ReRankerModel.training_loss_reductionc                 C   r   )z)Get the validation loss reduction module.Tr   )_validation_loss_reductionr   r   rO   rS   r   r5   r5   r6   validation_loss_reduction(  r   z'ReRankerModel.validation_loss_reduction)NNNN)N)ra   rb   rc   rd   r   r	   r   r   r   r   r   Moduler   propertyr   rf   rq   r   r   
LongTensorTensorrF   rH   r   r   r   r{   r5   r5   rw   r6   r      sH    
	%:
	

r   hfc                   @   sf   e Zd ZdZdefddZdedefddZede	fdd	Z
edddZdddeddfddZdS )ReRankerImporterzHF Importer for Reranker Modelr)   c                 C   s   t | j| jdS )N)rt   )r   r   rt   r   r5   r5   r6   init8  s   zReRankerImporter.initoutput_pathc                 C   sN   ddl m} |  }| |}|jt| ddd}| || | || |S )zApply the conversion from HF to NeMo format.

        Args:
            output_path: Path where the converted model will be saved

        Returns:
            Path: Path to the saved NeMo model
        r   r#   autoT)torch_dtypetrust_remote_code)transformersr$   r   
nemo_setupfrom_pretrainedstrconvert_state	nemo_save)rs   r   r$   targettrainersourcer5   r5   r6   apply;  s   	

zReRankerImporter.applyc                 C   sH   ddl m} |jt| dd}tt|tjkt|tjkt||j	dS )z:Create a NeMo ReRankerBaseConfig from the HF model config.r   )
AutoConfigT)r   )fp16bf16params_dtyper~   )
r   r   r   r   ri   r!   r   float16bfloat16num_hidden_layers)rs   r   r   r5   r5   r6   r   P  s   zReRankerImporter.configr&   c                 C   s   ddl m} || t| S )zGet the tokenizer for the HF model.

        Returns:
            AutoTokenizer: Tokenizer instance initialized from the HF model's tokenizer
        r   r%   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr&   save_hf_tokenizer_assetsr   )rs   r&   r5   r5   r6   rt   ]  s   zReRankerImporter.tokenizerr   r$   r   Nc              	   C   s   |j  }|||}|jjjj|jjjks&J d|jjj d|jjj t : z|jjj	|jj W n t
yS   td td |j |jjj Y n	w W d   |S W d   |S 1 sgw   Y  |S )z:Convert the state of the source model to the target model.zScore weight dtype mismatch: z != zoFailed to copy score weight. This is expected if you are trying to convert model without score weights to NeMo.zinit the score weight...N)r   rm   r   r   rH   weightdtyper   no_gradcopy_	Exceptionr"   warninginfor   )rs   r   r   target_connectorr5   r5   r6   r   h  s.   




zReRankerImporter.convert_state)r)   r&   )ra   rb   rc   rd   r   r   r   r   r   rJ   r   rt   r   r5   r5   r5   r6   r   4  s    
r   r$   c                   @   sT   e Zd ZdZejfdddZdedefddZe	d	d
 Z
dd Ze	dddZdS )ReRankerExporterzExporter for converting NeMo Llama models to Hugging Face format.

    This class handles the conversion of NeMo's ReRankerModel to Hugging Face's
    AutoModelForSequenceClassification format, including weight mapping and configuration translation.
    r)   r$   c                 C   sR   ddl m} ddlm} |  |j| j|dW  d   S 1 s"w   Y  dS )a  Initialize a HF AutoModelForSequenceClassification instance.

        Args:
            dtype: Data type for model parameters

        Returns:
            LlamaBidirectionalForSequenceClassification: Initialized HF Llama Bidirection reranker model
        r   )no_init_weights)+LlamaBidirectionalForSequenceClassification)r   N)transformers.modeling_utilsr   1nemo.collections.llm.gpt.model.hf_llama_embeddingr   _from_configr   )rs   r   r   r   r5   r5   r6   r     s
   	$zReRankerExporter.initr   c                 C   s   |  t| \}}|jjjjj}| |}| ||}|	 }|
| z| jj}|jdu r7|j|_|jj|_|
| W |S  tyM   td Y |S w )z,Apply the conversion from NeMo to HF format.NzFailed to save tokenizer)	nemo_loadr   r   	embeddingword_embeddingsr   r   r   r   cpusave_pretrainedrt   	pad_token	eos_tokenr   rM   padding_sider   r"   r   )rs   r   r   _source_dtyper   rt   r5   r5   r6   r     s"   



zReRankerExporter.applyc              
   C   s   t jt| dd}ddlm}m} |d |d |di d|jd|jd	|j	d
|j
d|jd|jd|jd|jd|jd| jjd|jddd| jjd| jjd| jjd|jd|j|j|j|jddS )z@Create a NeMo LlamaBidirectionalConfig from the HF model config.zmodel.configsubpathr   )LlamaBidirectionalConfigr   r   r$   r   r   intermediate_sizenum_attention_headsmax_position_embeddingsinitializer_rangerms_norm_epsnum_key_value_heads
rope_theta
vocab_sizetie_word_embeddings
num_labelsr   bos_token_ideos_token_idpad_token_idr`   rope_scalingllama3)factorhigh_freq_factorlow_freq_factor original_max_position_embeddings	rope_typeNr5   )r   load_contextr   r   r   r   register_for_auto_classr~   r   ffn_hidden_sizer   
seq_lengthinit_method_stdlayernorm_epsilonnum_query_groupsrotary_basert   r   #share_embeddings_and_output_weightsbos_ideos_idr`   scale_factorr  r  old_context_len)rs   r   r   r   r5   r5   r6   r     sX   

	
zReRankerExporter.configc                 C   sX   ddddddd}t jdd	tjd
t jddtjd
t jddtjd
g}t j||||dS )zConvert NeMo State dict to HF.z&model.layers.*.self_attn.o_proj.weightz#model.layers.*.mlp.down_proj.weightz%model.layers.*.input_layernorm.weightz.model.layers.*.post_attention_layernorm.weightzmodel.norm.weightscore.weight)z2decoder.layers.*.self_attention.linear_proj.weightz&decoder.layers.*.mlp.linear_fc2.weightz<decoder.layers.*.self_attention.linear_qkv.layer_norm_weightz1decoder.layers.*.mlp.linear_fc1.layer_norm_weightzdecoder.final_layernorm.weightr  z1decoder.layers.*.self_attention.linear_qkv.weight)z&model.layers.*.self_attn.q_proj.weightz&model.layers.*.self_attn.k_proj.weightz&model.layers.*.self_attn.v_proj.weight)
source_key
target_keyfnz&decoder.layers.*.mlp.linear_fc1.weight)z#model.layers.*.mlp.gate_proj.weightz!model.layers.*.mlp.up_proj.weightz embedding.word_embeddings.weightzmodel.embed_tokens.weight)mapping
transforms)r   state_transformr   	split_qkv	split_fc1prune_paddingapply_transforms)rs   r   r   r  r  r5   r5   r6   r     s:   		zReRankerExporter.convert_stater'   c                 C   s   t jt| ddjS )zGet NeMo TokenizerrE   r   )r   r  r   rt   r   r5   r5   r6   rt     s   zReRankerExporter.tokenizerN)r)   r$   )r)   r'   )ra   rb   rc   rd   r   r   r   r   r   r   r   r   rt   r5   r5   r5   r6   r   }  s    
%(r   c                       s   e Zd ZdZ				ddededed	ed
df
 fddZdee	e
jf de
jd
ee
jee	e
jf f fddZd
e
jfddZ  ZS )r   a~  Loss function for reranking models that learns to score passages by relevance.

    This loss function implements a cross-entropy based approach for learning to rank passages.
    For each example, it takes a positive passage and multiple hard negative passages,
    and learns to assign higher scores to the positive passage compared to the negatives.

    The loss is computed by treating the scoring task as a classification problem where
    the positive passage should be ranked first among all passages (positive + negatives).
    Cross-entropy loss is used to learn this ranking behavior.

    Args:
        validation_step (bool, optional): Whether this is being used in validation. Defaults to False.
        val_drop_last (bool, optional): Whether to drop the last batch in validation. Defaults to True.
        num_hard_negatives (int, optional): Number of hard negative passages per positive passage. Defaults to 1.
        label_smoothing (float, optional): Label smoothing factor for cross-entropy loss. Defaults to 0.0.

    Note:
        - The input logits should be organized such that for each example, the first score
          corresponds to the positive passage, followed by scores for hard negative passages.
        - The loss assumes all examples have the same number of passages (1 positive + num_hard_negatives).
        - Context parallelism (CP) is not currently supported.
    FTr   rR   r   val_drop_lastrO   rS   r)   Nc                    s.   t    || _|| _|| _tj|d| _d S )N)rS   )rp   r   r   r  rO   r   CrossEntropyLosscross_entropy_loss)rs   r   r  rO   rS   rw   r5   r6   r     s
   
zReRankerLoss.__init__rA   forward_outc                 C   s   ddl m} | }|dkrtd| j dd| j }|jd | }|d|}tj	|tj
|jd}| ||}	t|	g}
|	d|
ifS )	Nr   r   r   zCP is not supported for z yet.r   )r   r   rZ   )megatron.corer   get_context_parallel_world_sizeNotImplementedErrorrx   rO   r   viewr   zeroslongr   r  )average_losses_across_data_parallel_group)rs   rA   r   r   cp_sizenum_tensors_per_exampler   logitslabelsce_lossreduced_lossr5   r5   r6   rF   #  s   

zReRankerLoss.forwardc                 C   s   |rVd|d v rdd |D }t | }|S ddlm} dd |D }t|dkr4t |jddnt jddgt j	
 d	}t jj||jd
dd |d |d  }|S t jdt j	
 d	S )zTaken from: https://github.com/NVIDIA/NeMo/blob/main
        /nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py#L535-L552 .rZ   r   c                 S   s   g | ]}|d  qS )rZ   r5   r0   xr5   r5   r6   
<listcomp>>  s    z'ReRankerLoss.reduce.<locals>.<listcomp>r   c                 S   s$   g | ]}|d  d dkr|d  qS )loss_sum_and_ub_sizer   r   r5   r.  r5   r5   r6   r0  D  s    r   rR   r   T)with_context_parallelgroupr   )r   catmeanr!  r   r;   vstackr   tensorr/   current_devicedistributed
all_reduceget_data_parallel_group)rs   losses_reduced_per_micro_batchrZ   lossr   r1  r5   r5   r6   reduce8  s(   
zReRankerLoss.reduce)FTr   rR   )ra   rb   rc   rd   rh   rf   rg   r   r   r   r   r   r
   rF   r?  r{   r5   r5   rw   r6   r     s2    
r   c                 C   sN   ddl m} tdd | D }tjj|| d |tjj| d }|S )z*Reduce a tensor of losses across all GPUs.r   r   c                 S   s   g | ]}|   d qS )r   )clonedetachr$  )r0   r>  r5   r5   r6   r0  [  s    z=average_losses_across_data_parallel_group.<locals>.<listcomp>r3  )r!  r   r   r5  r:  r;  r<  get_world_size)lossesr   averaged_lossesr5   r5   r6   r'  W  s   r'  )Odataclassesr   pathlibr   typingr   r   r   r   r   r	   r
   r   r   pytorch_lightningLr   r!  r   $megatron.core.tensor_parallel.layersr    megatron.core.transformer.moduler   r   r   $megatron.core.transformer.spec_utilsr   megatron.core.utilsr   r   #nemo.collections.llm.gpt.model.baser   r   $nemo.collections.llm.gpt.model.llamar   r   .nemo.collections.llm.gpt.model.llama_embeddingr   r   ry   nemo.collections.llm.utilsr   nemo.lightningr   r   nemo.lightning.io.stater    nemo.lightning.megatron_parallelr   r    nemo.lightning.pytorch.utilsr!   
nemo.utilsr"   "megatron.core.models.gpt.gpt_modelro   r   r$   r   r&   1nemo.collections.common.tokenizers.tokenizer_specr'   r(   r   r   rD   LightningModulerI   rJ   ri   r|   r   model_importerrz   r   model_exporterr   r   r'  r5   r5   r5   r6   <module>   sX   ($ 
/
H Y