o
    iN2                     @   s,  d dl mZmZmZ d dlZd dlm  mZ d dl	Z	d dl
mZ deejvr.ejjej_e	ddd ie	jdejdejdejd	ejfd
dZe	ddd ie	jdejdejfddZG dd dejjZ							ddejdejdeej dededededeejejf fddZdS )     )TupleOptionalUnionNall_gather_into_tensorHAS_SMOOTHINGc                 C      | d dkS N	smoothing         argsr   r   Y/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/ops/triton/cross_entropy.py<lambda>       r   
BLOCK_SIZESPLITPRECOMPUTED_LSEc              	   C   s  t d}|||t j  }d}|s~td }d}td||D ]M}|t d| }t j|| ||k td dt j| }|rO|t 	t 
||k |d7 }t |t |}t || | t 	t ||  }|}q t || }t || | nt || }t || }||krd}d}nS||
8 }|dkr||k rt || | }|r|s|nd|| |	  d| |  }n|s|nd| }n|r||s|nd||	   }nd}|s|| | }||7 }nd}t | | | |st || | d S d S )Nr   r
   infmaskother   )tl
program_idtoint64floatrangearangeloadfloat32sumwheremaximummaxexplogstore)loss_ptrlse_ptr
z_loss_ptr
logits_ptr
labels_ptrr	   logit_scalelse_square_scaleignore_indextotal_classesclass_start_idxn_colslogits_row_strider   r   r   r   row_idx
sum_logitsm_il_i
col_offsetcolslogitsm_i_newlse	label_idxlossz_losslogits_labelr   r   r   cross_entropy_fwd_kernel   s^   

&



rB   c                 C   r   r   r   r   r   r   r   r   d   r   c                 C   sJ  t d}t d}|||t j  }| ||t j  } || t d| }t || }||kr=t |||  }nd}t j|| ||k td dt j| }t || }t || }|d| | | 7 }||
8 }|rd| }||	 }t 	||k|| || }nt 	||k|d |}t j
| | || | ||k d d S )	Nr   r   r
   r   r   g       @      ?)r   )r   r   r   r   r   r    r   r!   r&   r#   r(   )dlogits_ptr	dloss_ptrr,   r*   r-   r	   r.   r/   r0   r1   r2   r3   r4   dlogits_row_stridedloss_row_strider   r   r5   col_block_idxcol_offsetsr>   dlossr;   r=   probssmooth_positivesmooth_negativer   r   r   cross_entropy_bwd_kernelb   s0   

$rN   c                   @   s4   e Zd Ze							d
ddZedd	 ZdS )CrossEntropyLossNr
   rC   Fc
                 C   s  |j tjkr$| d dkr$t|ddd df }| d dks$J |dks*J |j\}
}|j|
fks7J |	d u r=dntj|	}|| }|	d u rMdntj	|	}|| }|d uob|dkob|dk}|
ddkrn| }d	}tt||}|d
k r~dn|dk rdn|dk rdnd}tj|
tj|jd}|r|j|
fksJ | }n
tj|
tj|jd}tj|
tj|jd}tj|jj% t|
f |||||||||||||
d||dk||d W d    n1 sw   Y  |dkrM|dkr$tj||
|j |jd}tjj|||	d tjj|tjjj|	dd}tj|dd}|  ||7 }|dkr@||  }|||kd ||7 }nt|}|||kd | ||| | | || _ || _!|| _"|| _#|| _$|| _%|| _&||fS )N   r   )r   r   .r
   r   rC   i @               i       )dtypedevice)r   r   r   	num_warps)groupT)opr[   async_op)dim)'rX   torchlongdata_ptrFpadshapedistributedget_world_sizeget_rankstride
contiguousmintritonnext_power_of_2emptyr   rY   cudaindexrB   r   
all_reduceReduceOpSUM	logsumexpwaitsquaremasked_fill_
zeros_likesave_for_backwardmark_non_differentiabler	   r.   r/   r0   r1   r2   inplace_backward)ctxr;   labelsprecomputed_lser	   r.   r/   r0   rz   process_groupn_rowsr3   
world_sizer1   rankr2   use_precomputed_lseMAX_BLOCK_SIZEr   rZ   lossesr=   z_losseslse_allgatherhandle_lossesr   r   r   forward   s   







zCrossEntropyLoss.forwardc           
         s   ~| j \}}}| jr|nt|}|j\ tt d}|dk r$dn|dk r*dnd} fdd}	tj	|j	j
. t|	 |||||| j| j| j| j| j| j |d	|d	|d	||d
 W d    n1 slw   Y  |d d d d d d d d d f
S )Ni   rS   rT   rU   rV   rQ   c                    s   t  | d fS )Nr   )rk   cdiv)METAr3   r   r   r   r   
  s    z+CrossEntropyLoss.backward.<locals>.<lambda>r   )r   rZ   )saved_tensorsrz   r_   
empty_likerd   rj   rk   rl   rn   rY   ro   rN   r	   r.   r/   r0   r1   r2   rh   )
r{   grad_lossesgrad_z_lossesr;   r=   r|   dlogitsr   rZ   gridr   r   r   backward  s:   
zCrossEntropyLoss.backwardNr
   rC   r
   rP   FN)__name__
__module____qualname__staticmethodr   r   r   r   r   r   rO      s    irO   r
   rC   rP   Fr;   r|   r}   label_smoothingr.   r/   rz   returnc	           	      C   s   t | ||||||||	S )a*  
    Arguments:
        logits: (batch, vocab_size)
        labels: (batch,)
        label_smoothing: float
        logit_scale: float. Multiply logits by this scale before calculating the loss.
        lse_square_scale: float. If > 0, we add lse_square_scale * lse(logits) ^ 2 to the loss.
            This is also referred to as "z-loss".
        ignore_index: int. If labels == ignore_index, the loss is set to 0.0.
        inplace_backward: bool. If True, we do the backward pass in-place by modifying the logits.
            This saves memory.
        process_group: if not None, we're doing Tensor Parallel: each process is responsible for
            one part of the vocab. The loss will be aggregated across processes.
    Returns:
        losses: (batch,), float
        z_losses: (batch,), float
    )rO   apply)	r;   r|   r}   r   r.   r/   r0   rz   r~   r   r   r   cross_entropy_loss$  s   r   r   )typingr   r   r   r_   torch.nn.functionalnn
functionalrb   rk   triton.languagelanguager   dirre   _all_gather_baser   
heuristicsjit	constexprrB   rN   autogradFunctionrO   Tensorr   boolr   r   r   r   r   <module>   sr   I- 
