o
    }oi7                     @   sZ  d dl mZ d dlmZmZmZmZ d dlZd dlm	Z	 d dl
m	  mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ ed
\ZZed
ded\ZZerbd dlmZ G dd deedZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd dej%j&Z'ej(j)j*ej(j+j,fddZ-dS )    )ABCMeta)TYPE_CHECKINGDictOptionalTupleN)parallel_state)MegatronModule)Tensor)_Loss)logging)safe_importsafe_import_fromzmodelopt.torch.distillDistillationLossBalancer)alt)TransformerConfigc                	       sl   e Zd ZdZddddeej f fddZded	ed
e	eef fddZ
ddededed
efddZ  ZS )BaseLossz5Abstract base class for Megatron distillation losses.Nmodel_configr   projection_layerc                       t    || _|| _dS )
        Constructor.

        Args:
            model_config: MCore transformer config.
            projection_layer: Module which projects student activations to teacher's hidden dim.
        N)super__init___config_projectionselfr   r   	__class__ ^/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/modelopt/distill/loss.pyr   '   s   

zBaseLoss.__init__predictionstargetsreturnc                 C   s@   t |tr|d |d }}| jdur| |}| }||fS )zKPerforms projection of student tensor to match teacher's size if necessary.r   N)
isinstancetupler   detach)r   r    r!   r   r   r   pre_forward3   s   


zBaseLoss.pre_forwardFloss	tp_reduceis_sequence_parallelc                 C   s   | dd }|||fS )z@Reshapes tensor from [s, b] to [b, s] for upcoming loss masking.r      )	transpose
contiguous)r   r'   r(   r)   r   r   r   post_forward?   s   
zBaseLoss.post_forwardN)FF)__name__
__module____qualname____doc__r   nnModuler   r	   r   r&   boolr-   __classcell__r   r   r   r   r   $   s
    $r   )	metaclassc                   @   s&   e Zd ZdZdededefddZdS )MSELosszJCalculates MSE loss between two tensors without reducing the sequence dim.r    r!   r"   c                 C   s6   |  ||\}}tj||dd}|jdd}| |S )zForward function.

        Args:
            predictions: Student model tensors (size [s, b, h])
            targets: Teacher model tensors (size [s, b, h])

        Returns:
            MSE loss of tensors (size [b, s])
        none	reductiondim)r&   Fmse_losssumr-   r   r    r!   r'   r   r   r   forwardH   s   

zMSELoss.forwardN)r/   r0   r1   r2   r	   rC   r   r   r   r   r8   E   s    r8   c                       sH   e Zd ZdZddddeej f fddZded	ed
efddZ	  Z
S )HiddenStateCosineLossz
    Calculates Cosine loss between two tensors without reducing the sequence dim.

    The tensors are assumed to be intermediate activations, so extra restrictions are in place.
    Nr   r   r   c                    s:   t  j||d | jjdkr| jjstd dS dS dS )r   )r   r*   z``HiddenStateCosineLoss`` only works with tensors with full hidden dim. Ensure the tensor inputs meet this requirement or use `--sequence_parallel` if tensor parallel is enabled.N)r   r   r   tensor_model_parallel_sizesequence_parallelr   warningr   r   r   r   r   a   s   zHiddenStateCosineLoss.__init__r    r!   r"   c                 C   sj   |  ||\}}tj|d|d|d|d|ddd}|j|jdd  }| j|| jj	dS )z
        Forward function.

        Args:
            predictions: Student model tensors (size [s, b, h])
            targets: Teacher model tensors (size [s, b, h])

        Returns:
            Cosine loss of tensors (size [b, s])
        r<   r*   r9   r:   N   )r)   )
r&   r?   cosine_embedding_lossviewsizenew_onesshaper-   r   rF   rB   r   r   r   rC   q   s   zHiddenStateCosineLoss.forwardr.   )r/   r0   r1   r2   r   r3   r4   r   r	   rC   r6   r   r   r   r   rD   Z   s    rD   c                       sF   e Zd ZdZddddedef fdd	Zd
ededefddZ  Z	S )LogitsKLLossz[Calculates KL-Divergence loss between two logits tensors without reducing the sequence dim.      ?Fr   r   temperaturereversec                    s   t  | || _|| _dS )a  Constructor.

        Args:
            model_config: MCore transformer config.
            temperature: Divide tensors by this value prior to calculating loss.
            reverse: Whether to reverse the loss as KLD(teacher, student) instead of KLD(student, teacher)
        N)r   r   _temperature_reverse)r   r   rP   rQ   r   r   r   r      s   
zLogitsKLLoss.__init__r    r!   r"   c                 C   s  |  ||\}}| | j }| | j }| jjdkrtj|dd\}}tjj|tjj	j
t d ||jdd }tjt|dd}t|t d}tj|dd\}}tjj|tjj	j
t d ||jdd  }tjt|dd}	t|	t d}	|j\}
}}|t|	|
|d|
|| }|t||
|d|
|| }| jrtjtj||ddddd}nAtjtj||ddddd}n2| jrtjtjtj|ddtj|dddd	dd}ntjtjtj|ddtj|dddd	dd}| j|dd
S )zForward function.

        Args:
            predictions: Student model tensors (size [s, b, h])
            targets: Teacher model tensors (size [s, b, h])

        Returns:
            KLD loss of tensors (size [b, s])
        r*   r<   r=   opgroup)rV   r9   T)r;   
log_targetr:   )r(   )r&   floatrR   r   rE   torchmaxdistributed
all_reduceReduceOpMAXr   get_tensor_model_parallel_group	unsqueezerA   expall_reduce_autogradr%   rM   logrJ   expandrS   r?   kl_divlog_softmaxsoftmaxr-   )r   r    r!   output_teacheroutput_studentteacher_logits_max_denom_teacherstudent_logits_maxdenom_studentslenbszsharded_vocab_sizestudent_log_probteacher_log_probr'   r   r   r   rC      sp   
		zLogitsKLLoss.forwardrO   F)
r/   r0   r1   r2   rX   r5   r   r	   rC   r6   r   r   r   r   rN      s    rN   c                       sF   e Zd ZdZddedef fddZdeee	f d	e	fd
dZ
  ZS )"LogitsAndIntermediatesLossBalancerz
    LossBalancer implementation for Logit and Intermediate losses.

    Dynamically weighs distillation and original losses to balance during training.
    rO   Fkd_loss_scaleskip_original_lossc                    r   )ay  Constructor.

        Args:
            kd_loss_scale: Multiply distillation losses by this before weighing.
                (Not used when `skip_original_loss` is True.)
            skip_original_loss: Used to signal whether the original loss should be used, regardless
                of whether it was passed into ``mtd.DistillationModel.compute_kd_loss()`` or not.
        N)r   r   _kd_loss_scale_skip_original_loss)r   rv   rw   r   r   r   r      s   
	
z+LogitsAndIntermediatesLossBalancer.__init__	loss_dictr"   c                 C   s   | tjj}|D ]
}|tjr|}q	| |}t| t	t
|d }|dkr7| |  }|| }n||}|}| jrF|| }	n|| }
|
| |
  9 }
||
| j  }	|	||d}|S )zForward function.

        Args:
            loss_dict: All individual scalar losses, passed in during ``mtd.DistillationModel.compute_kd_loss()``

        Returns:
            Aggregate total scalar loss.
        r*   r   )kd_losslogits_lossintermediate_loss)popmtdloss_balancersSTUDENT_LOSS_KEY
startswithrN   r/   rA   valuesrZ   lenitem
new_tensorry   rx   )r   rz   original_loss_key
logits_keyr|   r}   dynamic_scaleintermediate_loss_scaled
total_lossr{   out_dictr   r   r   rC     s,   	



z*LogitsAndIntermediatesLossBalancer.forwardrt   )r/   r0   r1   r2   rX   r5   r   r   strr	   rC   r6   r   r   r   r   ru      s    "ru   c                       s8   e Zd ZdZd fddZdefdd	Zd
d Z  ZS )ProjectionLayerz>Module to project student layer activations to teacher's size.student_configr   teacher_configc                    sr   t  j|d |j|jkrt | _dS t|j|j| _| | j t	| jj
d| jj t	| jjd| jj dS )z
        Constructor.

        Args:
            student_config: Student's MCore transformer config.
            teacher_config: Teacher's MCore transformer config.
        )configrF   N)r   r   hidden_sizer3   Identity_fitLinearapply_init_weightssetattrweightr   rF   bias)r   r   r   r   r   r   r   -  s   zProjectionLayer.__init__student_tensorc                 C   s
   |  |S )zp
        Forward function.

        Args:
            student_tensor: Tensor to be fit to teacher size.
        )r   )r   r   r   r   r   rC   ?  s   
zProjectionLayer.forwardc                 C   s>   t |tjr| j|jj |jdur|jj  dS dS dS )zInitialize the weights.N)	r#   r3   r   r   init_methodr   datar   zero_)r   moduler   r   r   r   H  s   
zProjectionLayer._init_weights)r   r   r   r   )	r/   r0   r1   r2   r   r	   rC   r   r6   r   r   r   r   r   *  s
    	r   c                   @   s(   e Zd ZdZedd Zedd ZdS )
_AllReducez@Implementation from old PyTorch `torch.distributed.nn.parallel`.c                 C   s,   ||| _ | _| }tjj|||d |S )NrT   )rV   rU   clonerY   r[   r\   )ctxrU   rV   tensorr   r   r   rC   S  s   z_AllReduce.forwardc                 C   s   d d t | j| j|fS r.   )r   r   rU   rV   )r   grad_outputr   r   r   backward[  s   z_AllReduce.backwardN)r/   r0   r1   r2   staticmethodrC   r   r   r   r   r   r   P  s    
r   c                 C   s   t ||| S )zCustom all-reduce function.

    Needed instead of other all-reduce functions available when the computation following
    the all-reduce call differs per rank. In KL loss, this corresponds to the different numerators.
    )r   r   )r   rU   rV   r   r   r   rb   a  s   rb   ).abcr   typingr   r   r   r   rY   torch.nnr3   torch.nn.functional
functionalr?   megatron.corer   megatron.core.transformerr   r	   torch.nn.modules.lossr
   
nemo.utilsr   nemo.utils.import_utilsr   r   r   rk   objectr   ,megatron.core.transformer.transformer_configr   r   r8   rD   rN   ru   r   autogradFunctionr   r[   r]   SUMrV   WORLDrb   r   r   r   r   <module>   s.   !0f:&