o
    }oi<                     @   s   d dl mZmZmZ d dlZd dlm  mZ d dlm	Z	mZ d dl
mZ d dlmZ d dlmZmZ G dd deZG dd	 d	eZG d
d deZde	de	fddZde	de	fddZdd ZdS )    )DictLiteralTupleN)Tensornn)
all_gather)MaskedTokenLossReductionMegatronLossReductionc                
       sz   e Zd ZdZddedededdf fd	d
Zdeeej	f dej	de
ej	eeej	f f fddZdej	fddZ  ZS )BERTLossReductionzYBert Loss Function.
    when add_sop_loss = False, only calculate Masked token loss.
    FTvalidation_stepval_drop_lastadd_sop_lossreturnNc                    s4   t    || _|| _|| _|st||| _d S d S )N)super__init__r   r   r   r   mlm)selfr   r   r   	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/bert/loss.pyr      s   
zBERTLossReduction.__init__batchforward_outc                 C   s   |d |d< | j s| j||d S ddlm} |d |d }}|dus)J d| }|dkr@t||d	 }t||d }ntd
|| }	t	|	g}
|	d|
ifS )zqPerform Loss calculation on batch.
        Currently, Context parallelism is not supported for SOP loss.
        	loss_masklm_lossr   parallel_statebinary_logitsNzAttempting to calculate Sentence Order Prediction Loss but SOP logits are not provideds, Please Make sure you have added binary head.   	is_randomz$CP is not supported for SOP loss yetavg)
r   r   forwardmegatron.corer   get_context_parallel_world_sizesentence_order_prediction_lossmasked_token_with_zeroNotImplementedError)average_losses_across_data_parallel_group)r   r   r   r   lm_loss_
sop_logitscp_sizesop_loss_for_ublm_loss_for_ubloss_for_ubreduced_lossr   r   r   r!   (   s    


zBERTLossReduction.forwardc                 C      |rVd|d v rdd |D }t | }|S ddlm} dd |D }t|dkr4t |jddnt jddgt j	
 d	}t jj||jd
dd |d |d  }|S t jdt j	
 d	S )Taken from: https://github.com/NVIDIA/NeMo/blob/main
        /nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py#L535-L552 .r    r   c                 S      g | ]}|d  qS r    r   .0xr   r   r   
<listcomp>O       z,BERTLossReduction.reduce.<locals>.<listcomp>r   c                 S   $   g | ]}|d  d dkr|d  qS loss_sum_and_ub_sizer   r   r   r3   r   r   r   r6   U       dim        deviceTwith_context_parallelgroupr   torchcatmeanr"   r   lenvstacksumtensorcudacurrent_devicedistributed
all_reduceget_data_parallel_groupr   losses_reduced_per_micro_batchr    lossr   r:   r   r   r   reduceI   (   
zBERTLossReduction.reduce)FTT)__name__
__module____qualname____doc__boolr   r   strrF   r   r   r!   rU   __classcell__r   r   r   r   r
      s     	
!r
   c                       s   e Zd ZdZ					ddeded	ed
ededdf fddZdee	e
jf de
jdee
jee	e
jf f fddZde
jfddZ  ZS )HardNegativeRankingLossaE  
    This loss uses hard-negative samples.
    The difference of this loss to the default MultipleNegativesRankingLoss
    from Sentence Transformers is that the latter shares the hard negatives
    as negatives for all examples, whereas this loss uses hard negatives
    exclusively for the example they are associated.
    FTr   2   r>   r   r   num_hard_negativesscalelabel_smoothingr   Nc                    s4   t    || _|| _|| _|| _tj|d| _d S N)rb   )	r   r   r   r   r`   ra   r   CrossEntropyLosscross_entropy_loss)r   r   r   r`   ra   rb   r   r   r   r   q   s   
z HardNegativeRankingLoss.__init__r   r   c                 C   sv  ddl m} | }|dkrtd| j dd| j }d| j }|jd | }||}t	dd |D }	t
d	d |D }
|
jd |	jd  dks[J d
|
jd |	jd |
jd |	jd  |ksvJ d|
jd |	jd ||	j}|	dd||d | |d }tj||
 dd|d |}tj|d tj|	jd}|| j9 }| ||}t|g}|d|ifS )Nr   r   r   CP is not supported for  yet.   c                 S   r1   r   r   r4   itemr   r   r   r6      r7   z3HardNegativeRankingLoss.forward.<locals>.<listcomp>c                 S   s   g | ]}|d d qS )r   Nr   rj   r   r   r   r6          z{} % {} > 0z{} / {} != {}r<   dtyper@   r    )r"   r   r#   r&   r   r`   shapechunkrF   stackrG   formatrepeatreshaperK   zeroslongr@   ra   re   r'   )r   r   r   r   r*   num_tensors_per_examplecurrent_train_n_passages
batch_sizechunksquerykeyquery_shaperepeated_queryscoreslabelsce_lossr.   r   r   r   r!      s0   


4

zHardNegativeRankingLoss.forwardc                 C   r/   )r0   r    r   c                 S   r1   r2   r   r3   r   r   r   r6      r7   z2HardNegativeRankingLoss.reduce.<locals>.<listcomp>r   c                 S   r8   r9   r   r3   r   r   r   r6      r;   r<   r>   r?   TrA   rC   r   rE   rR   r   r   r   rU      rV   zHardNegativeRankingLoss.reduce)FTr   r_   r>   )rW   rX   rY   rZ   r[   intfloatr   r   r\   rF   r   r   r!   rU   r]   r   r   r   r   r^   h   s8    

!r^   c                       s   e Zd ZdZ							dded	ed
ededededed ddf fddZdd Z	de
eejf dejdeeje
eejf f fddZdejfddZ  ZS ),BERTInBatchExclusiveHardNegativesRankingLossa  
    This loss uses in-batch negative samples + hard-negative samples.
    The difference of this loss to the default MultipleNegativesRankingLoss
    from Sentence Transformers is that the latter shares the hard negatives
    as negatives for all examples, whereas this loss uses hard negatives
    exclusively for the example they are associated.

    This loss is also capable of using in-batch negatives from all ranks during training.
    FTr      r>   localr   r   r`   ra   rb   global_in_batch_negativesbackprop_type)r   globalr   Nc                    s@   t    || _|| _|| _|| _tj|d| _|| _	|| _
d S rc   )r   r   r   r   r`   ra   r   rd   re   r   r   )r   r   r   r`   ra   rb   r   r   r   r   r   r      s   


z5BERTInBatchExclusiveHardNegativesRankingLoss.__init__c                    s   ddl m}    | jdkr4 fddt| D }t| | d  || < t	j
|dd}|S t }t	j
|dd}|S )Nr   r   r   c                    s   g | ]}t  qS r   )rF   
zeros_like)r4   _local_tensorr   r   r6      s    
zhBERTInBatchExclusiveHardNegativesRankingLoss._gather_global_in_batch_representations.<locals>.<listcomp>rC   r<   )r"   r   
contiguousr   rangeget_data_parallel_world_sizeall_gather_no_backproprQ   get_data_parallel_rankrF   rG   all_gather_with_backprop)r   r   r   global_tensorsr   r   r   '_gather_global_in_batch_representations   s   


zTBERTInBatchExclusiveHardNegativesRankingLoss._gather_global_in_batch_representationsr   r   c                    sP  ddl m} | }|dkrtd| j d| jr"| js"| |}d| j }|j	d | }|
| tdd  D }td	d  D } fd
dt| jD }	t||dd}
t|dt|	ddt|	jddj}tj|
|gdd}|dd}|| j9 }tjtt|tj|jd}| ||}t|g}|d|ifS )Nr   r   r   rf   rg   rh   c                 S   r1   ri   r   rj   r   r   r   r6      r7   zHBERTInBatchExclusiveHardNegativesRankingLoss.forward.<locals>.<listcomp>c                 S   r1   r   r   rj   r   r   r   r6     r7   c                    s$   g | ] t  fd dD qS )c                    s   g | ]}| d   qS )rh   r   rj   ir   r   r6     rl   zSBERTInBatchExclusiveHardNegativesRankingLoss.forward.<locals>.<listcomp>.<listcomp>)rF   rr   )r4   r{   r   r   r6     s    rm   )axisg      g      ?rn   r    )r"   r   r#   r&   r   r   r   r   r`   rp   rq   rF   rr   r   mm	transposemultiply	unsqueezert   rI   rK   TrG   clampra   rL   rw   r@   re   r'   )r   r   r   r   r*   rx   rz   queries	positives	hard_negspos_in_batch_negs_scoreshard_negs_scoresr   r   r   r.   r   r   r   r!      sD   





z4BERTInBatchExclusiveHardNegativesRankingLoss.forwardc                 C   r/   )r0   r    r   c                 S   r1   r2   r   r3   r   r   r   r6   #  r7   zGBERTInBatchExclusiveHardNegativesRankingLoss.reduce.<locals>.<listcomp>r   c                 S   r8   r9   r   r3   r   r   r   r6   )  r;   r<   r>   r?   TrA   rC   r   rE   rR   r   r   r   rU     rV   z3BERTInBatchExclusiveHardNegativesRankingLoss.reduce)FTr   r   r>   Fr   )rW   rX   rY   rZ   r[   r   r   r   r   r   r   r\   rF   r   r   r!   rU   r]   r   r   r   r   r      sF    	
-r   rL   maskc                 C   sZ   |   }|  }| dkrt|dd }|S t|d|d |  }|S )aS  Calculate masked token loss with consideration of possible NaN.
    Sometimes when the number of tokens is very small, none of the tokens get masked for prediction.
    In that case loss mask is all zeros i.e Happens when the entire batch is masked out
    (Practically when MBS=1 or 2, and the number of tokens in each batch is < 7 )
    r   rm   r>   )r   rK   rF   viewru   )rL   r   lossesr   rT   r   r   r   r%   <  s   "r%   sentence_orderc                 C   s.   |  dd }| d}tj||dd}|S )z)Calculate sentence order prediction loss.rm   rh   )ignore_index)r   r   Fcross_entropy)rL   r   r   rT   r   r   r   r$   L  s   
r$   c                 C   sN   ddl m} tdd | D }tjj|| d |tjj| d }|S )z*Reduce a tensor of losses across all GPUs.r   r   c                 S   s   g | ]}|   d qS r   )clonedetachr   )r4   rT   r   r   r   r6   Y  s    z=average_losses_across_data_parallel_group.<locals>.<listcomp>rC   )r"   r   rF   rG   rO   rP   rQ   get_world_size)r   r   averaged_lossesr   r   r   r'   U  s   r'   )typingr   r   r   rF   torch.nn.functionalr   
functionalr   r   torch.distributedr   r   torch.distributed.nn.functionalr    nemo.lightning.megatron_parallelr   r	   r
   r^   r   r%   r$   r'   r   r   r   r   <module>   s   NX|	