o
    Ti\                     @   s4   d dl Z d dlmZ G dd de jjZdd ZdS )    Nc                   @   s$   e Zd Zedd Zedd ZdS )"_VocabSequenceParallelCrossEntropyc           
      C   s   t jjj|dd}|d| _t jjj| d| j|ddd}t	
|}t	|}|| _|| _|d| | _|d}t j| j||j|jd}	t	j|	||d	 | || |	S )
Ndim   none)	reductionr      )dtypedevice)group)torchnn
functionalsoftmaxsize
vocab_sizenll_losslogviewdistget_world_sizeget_ranksp_world_sizesp_rankseqlenemptyr
   r   all_gather_into_tensorsave_for_backward)
ctxvocab_seq_parallel_logitstargetsp_groupr   lossr   r   
batch_sizeloss_all r&   T/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/sequence/cross_entropy.pyforward   s"   (


z*_VocabSequenceParallelCrossEntropy.forwardc           
      C   s   | j \}}| j| j }| j}||| ||d  d d f }|}|d| j}tjd| d |j	d}	||	|df  d8  < |
|jdd |d d d fS )Nr	   r   r   )startendr   r   )saved_tensorsr   r   r   r   r   r   aranger   r   mul_	unsqueeze)
r   grad_outputr   r!   step_seqlenr   grad_output_part
grad_inputgrad_2d	arange_1dr&   r&   r'   backward)   s   
 z+_VocabSequenceParallelCrossEntropy.backwardN)__name__
__module____qualname__staticmethodr(   r5   r&   r&   r&   r'   r      s
    
r   c                 C   s   t | ||S )N)r   apply)vocab_parallel_logitsr!   r"   r&   r&   r'   %vocab_sequence_parallel_cross_entropy;   s   r<   )r   deepspeed.commcommr   autogradFunctionr   r<   r&   r&   r&   r'   <module>   s   0