o
    wi                      @   s  d dl Z d dlmZ d dlmZmZmZmZ d dlm	Z
 d dlZd dlm  mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dl m!Z! d dl"m#Z# dee$ejf fddZ%de
j&dee$ejf dejfddZ'eG dd deZ(eG dd de(Z)eG dd de(Z*G dd dej+Z,G dd deZ-e!.e-dG d d! d!eZ/dS )"    N)	dataclass)CallableDictLiteralOptional)parallel_state)get_batch_on_this_cp_rank)Tensornn)TokenizerSpec),BERTInBatchExclusiveHardNegativesRankingLoss)
BertConfig	BertModel)get_packed_seq_params)HuggingFaceBertImporter)io)OptimizerModulereturnc                    sz   t | }t|trt|dkr|d }n|}t   d  d t r, d  fdd| D }t	|}|S )zSetup BERT dataloader batch.   r   attention_masktoken_type_ids	input_idsc                    s*   i | ]\}}|| v r|j d dndqS )T)non_blockingN)cuda).0keyvalrequired_keys f/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/bert/model/embedding.py
<dictcomp>3   s   * z,bert_embedding_data_step.<locals>.<dictcomp>)
next
isinstancetuplelensetaddr   is_pipeline_first_stageitemsr   )dataloder_iterbatch_batchoutputr   r   r    bert_embedding_data_step"   s   



r.   modelr+   c                 C   sL   |d |d d}| j jdkr|d |d< d|v rt||d< | d
i |S )a  
    This subsets the batch keys to the ones actually used by forward pass of the model,
    and then calls the model's forward pass. if "cu_seqsens" are defined in the batch,
    then the packed sequence parameters are also passed to the model for forward pass efficiency.
    r   r   )r   r   r   r   tokentype_ids
cu_seqlenspacked_seq_paramsNr   )confignum_tokentypesr   )r/   r+   forward_argsr   r   r    bert_embedding_forward_step:   s   r6   c                   @   s   e Zd ZU dZdZed ed< dZeed< dZ	eed< d	Z
eed
< d	Zeed< dZeed< dZeed< dZeed< dZed ed< eZeed< eZeed< dS )BertEmbeddingConfigzBert Embedding Confighuggingface)r8   megatron	bert_type   ce_loss_scaleg        label_smoothingFadd_lm_headbert_binary_head   num_hard_negatives   r4   Tglobal_in_batch_negativeslocal)rD   globalbackprop_typeforward_step_fndata_step_fnN)__name__
__module____qualname____doc__r:   r   __annotations__r<   floatr=   r>   boolr?   rA   intr4   rC   rF   r6   rG   r   r.   rH   r   r   r   r    r7   N   s   
 r7   c                   @   B   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
S )BertEmbeddingLargeConfigz5Bert Embedding model follows Bert-large architecture.   
num_layersi   hidden_sizei   intermediate_size   num_attention_headsN
rI   rJ   rK   rL   rT   rP   rM   rU   rV   rX   r   r   r   r    rR   _      
 rR   c                   @   rQ   )BertEmbeddingMiniConfigzFBert Embedding model follows Bert-mini (384 hidden size) architecture.   rT   i  rU   i   rV      rX   NrY   r   r   r   r    r[   i   rZ   r[   c                       s@   e Zd ZdZ	ddedef fddZdedefd	d
Z  Z	S )BertEmbeddingHeadz.Performs mean pooling on the token embeddings.Tword_embedding_dimensionpooling_mode_mean_tokensc                    s(   t t|   ddg| _|| _|| _d S )Nr_   r`   )superr^   __init__config_keysr_   r`   )selfr_   r`   	__class__r   r    rb   v   s   
zBertEmbeddingHead.__init__token_embeddingsr   c                 C   sj   | ddd}|d|  }t|| d}|d}tj|dd}|| }tj	|ddd}|S )z;Forward function for embedding head. Performs mean pooling.r@   r   rB   g&.>)min)pdim)
permute	unsqueezeexpandsizerN   torchsumclampF	normalize)rd   rg   r   input_mask_expandedsum_embeddingssum_maskoutput_vectorr   r   r    forward   s   
zBertEmbeddingHead.forward)T)
rI   rJ   rK   rL   rP   rO   rb   r	   ry   __classcell__r   r   re   r    r^   s   s    r^   c                       s   e Zd ZdZ			ddedee ded deeej	gej	f  f fdd	Z
dddZd
ejfddZed
efddZed
efddZ  ZS )BertEmbeddingModelzBert Lightning ModuleNr3   optim	tokenizerr   model_transformc                    s   t  |||| d S )N)ra   rb   )rd   r3   r|   r}   r~   re   r   r    rb      s   zBertEmbeddingModel.__init__r   c                 C   s4   t | ds| j| j| _t| jjdd| _dS dS )z0Setup the BERT Model based on config definition.moduleT)r_   r`   N)hasattrr3   configure_modelr}   r   r^   rU   embedding_headrd   r   r   r    r      s   
z"BertEmbeddingModel.configure_modelc                 O   s8   d|v sJ d| j |ddi|}| ||d }|S )zPCall the forward method of the underlying model, and return whatever it outputs.r   z4attention mask is required for BERT Embedding Model.hidden_states_onlyT)r   r   )rd   argskwargsoutput_tensorembeddings_outr   r   r    ry      s   zBertEmbeddingModel.forwardc                 C   s6   | j std| jj| jj| jj| jj| jjd| _ | j S )NF)validation_steprA   scaler=   rC   rF   )_training_loss_reductionr   r3   rA   r<   r=   rC   rF   r   r   r   r    training_loss_reduction   s   	z*BertEmbeddingModel.training_loss_reductionc                 C   s*   | j std| jj| jj| jjd| _ | j S )NT)r   rA   r   r=   )_validation_loss_reductionr   r3   rA   r<   r=   r   r   r   r    validation_loss_reduction   s   z,BertEmbeddingModel.validation_loss_reduction)NNN)r   N)rI   rJ   rK   rL   r   r   r   r   r
   Modulerb   r   rp   r	   ry   propertyr   r   r   rz   r   r   re   r    r{      s,    

	
r{   hfc                       s.   e Zd ZdZ fddZdefddZ  ZS )BertEmbeddingImporterz
    Importer for BertEmbedding Model.
    HuggingFace uses same model for Bert Embedding model and Bert model, thus the connector is identical.
    c                    s    t jdkrt j|  d| _d S )N)r      r/   )sysversion_infora   rb   type)rd   r   r   re   r   r    rb      s   

zBertEmbeddingImporter.__init__r   c                 C   s   t | j| jdS )N)r}   )r{   r3   r}   r   r   r   r    init   s   zBertEmbeddingImporter.init)rI   rJ   rK   rL   rb   r{   r   rz   r   r   re   r    r      s    r   )0r   dataclassesr   typingr   r   r   r   lightning.pytorchpytorchLrp   torch.nn.functionalr
   
functionalrs   megatron.corer   megatron.core.utilsr   r	   "nemo.collections.common.tokenizersr   nemo.collections.llm.bert.lossr   nemo.collections.llm.bert.modelr   r   $nemo.collections.llm.bert.model.baser   $nemo.collections.llm.bert.model.bertr   nemo.lightningr   nemo.lightning.pytorch.optimr   strr.   LightningModuler6   r7   rR   r[   r   r^   r{   model_importerr   r   r   r   r    <module>   s8   $		 
>