o
    c۷iGi                     @   s>  d dl Z d dlmZmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
m  mZ d dlZd dlmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z# d d	l$m%Z% d d
l&m'Z' G dd de#Z(edddd Z)ej*j+dh dd	d<de	de	dee	 de	dee	 dee	 de,ddfddZ-e-j.	d<de	de	dee	 de	dee	 dee	 de,ddfddZ/			 	 	 d=dej	dej	deej	 de,d!e0d"e0d#e0dej	e1ej	 B fd$d%Z2G d&d' d'Z3eddd(d) Z4	d<dej	dej	d*ej	dej	dej	ddfd+d,Z5ej*j+d-dhd	d<dej	dej	d*ej	dej	dej	de,ddfd.d/Z6e6j.	d<dej	dej	d*ej	dej	dej	de,ddfd0d1Z7		 d>dej	dej	d*ej	dej	de,d#e0ddfd2d3Z8G d4d5 d5ej9j:Z;			6	 d?dej	dej	d7eej	 de,d8ed9 d#e0dej	fd:d;Z<dS )@    N)	lru_cachepartial)OptionalTypeLiteral)Tensor)Int32Int64Float32Boolean
const_expr)make_fake_tensor)
row_reduceonline_softmax_reduce)ReductionBasecompile_and_cache)torch2cute_dtype_mapc                       s   e Zd Zddeej dedef fddZdd Z	d	d
 Z
ejdejdejdeej dejdeej deej dedejfddZejdejdejdejdejdeej deej dedejdejdeje fddZ  ZS )CrossEntropyTdtypeNonline_softmaxc                    sP   || _ t j||| j sdnd| j stntd |dks| j r#d | _d S d| _d S )N      )stagereduction_dtype @  smem)r   super__init__r
   r	   reload_from)selfr   r   r   	__class__ I/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/cross_entropy.pyr      s   "zCrossEntropy.__init__c                 C   s(   | j }dD ]\}}||kr|  S qdS )N)@      )      )i       )i   r'   )r   r)      )r   r!   r   limitthreadsr$   r$   r%   _threads_per_row&   s   zCrossEntropy._threads_per_rowc                 C   sT   | j }t| jjdkrg d}ng d}|D ]\}}||kr$|| _ d S qd| _d S )Nr*   )r   r   )i   r   )      )   r(   )r1   )r2   r   )r4   r3   )i   r(   )r   r   r   width	cluster_n)r!   r   
thresholdsr.   clusterr$   r$   r%   _set_cluster_n-   s   

zCrossEntropy._set_cluster_nmXmTargetmTargetLogitmLossmLSEmdXignore_indexstreamc	                 C   s   |j | jksJ t|d u r|}t|d ur|j | jksJ |   t|j j}	t|d ur7tt|	|j j}	t| jd|	 }
| j	|
d\}}}|j
}| ||||||||||
jt|jd |d | jdg|ddgt| jdkryd| jdgnd |d d S )Nr)   vecsizer   r   )gridblockr8   rA   )element_typer   r   r9   r5   maxmathgcdr   _get_tiled_copysizekernellaunchcuteceil_divshaper6   )r!   r:   r;   r<   r=   r>   r?   r@   rA   largest_dtype_widthrC   
tiled_copytiler_mnthreads_per_rownum_threadsr$   r$   r%   __call__9   s<   
zCrossEntropy.__call__rS   rR   rT   c           2   
      s  t j \}}}t j \ }}t| jdkrtdnt j d |	j}|j}t |} fdd||fD \}}t	j
 }|j|jt jdddd}| ||\}}|	|}||}||}||d	 }t |}t|d d | j k}|rd ntj|||d d
}ttj|d}t |	t jj }| ||| |d d }tj} ||d k rt|| } ||d k r|||dd t j  t jd t| rt
|||jj   t !|| |" #t$}!t$j}"t%| |k}#||d k r0|d d dkr0|#s0tt &|jdkrt$||| f }"nt &|jdks*J t$|| }"t| j' rt(|!t j)j*|
|d t| jdkrL|d nd t$j  t| jdkr\t jj+nd d}$t| j,dkrut !|| |" #t$}!t-.t-j/}%t j-j0|!|% |$|%  dd}&t(|&t j)j1|
|d t| jdkr|d nd dd}'nt2|!|
|d |t| jdkrt jj+nd t|d ud\}$}'}&|d d dkr||d k r| jdkst j3 dkr|$t j-j4|'dd }(|#s|(|" nt$j})||)||< t|d ur|(||< t|d ur|'dks!|'|'ks!|#s!t j5|'nt$j}*|&|* }+t 6| f},||,}-t |-}.||}/t |t$}0|07|+ |#sqt	j8t |ddD ]}1|/|1 d | krg|0|1 n|0|1 d |0|1< qX|.7|0" #|.j ||d k r||.|- d S d S d S )Nr   r   c                       g | ]}t | fqS r$   rN   
local_tile.0mTbidx	cluster_yrS   r$   r%   
<listcomp>y       z'CrossEntropy.kernel.<locals>.<listcomp>r   r   orderr*   byte_alignment)r   NNNr.   predTis_asyncr   )NNr   )init_valhook_fnr   Ffastmath)NNr   g        )rm   )rn   return_exp_xunroll_full      ?)9rN   arch
thread_idx	block_idxr   r6   layout_tv_tiledrP   make_identity_tensorcutlassutilsSmemAllocatorallocate_tensorrF   make_ordered_layout#_allocate_reduction_buffer_and_mbar	get_slicepartition_Spartition_Dmake_fragment_like
copy_utilspredicate_kr   copyrK   	WARP_SIZE_initialize_clusterr   zerocp_async_commit_groupcp_async_wait_groupfill_oobinfautovec_copyloadtor
   r   rankr   r   ReductionOpMAXcluster_waitr    rH   log2eexp2ADDr   block_idx_in_clusterlog
rcp_approxrY   storerange)2r!   r:   r;   r<   r=   r>   r?   r@   rS   rR   rT   tidx_	tv_layoutrP   idXgXcXr   sXreduction_buffermbar_ptrthr_copytXgXtXsXtXcXtXrX	is_even_NtXpXr   	num_warpsrowtargetxtarget_logitshould_ignoremax_xlog2_eexp_xdenomlseloss_val	denom_invprobsgdXtXgdXtXrdXtXcFull	tXrdX_f32ir$   r]   r%   rL   c   s   $







&		




.zCrossEntropy.kernel)T)__name__
__module____qualname__r   rz   Numericintboolr   r0   r9   rN   jitr   r   r   cudaCUstreamrV   rL   Shape	TiledCopy	Constexpr__classcell__r$   r$   r"   r%   r      sX    "	)	
r   )maxsizec           	         s6   d f} fdd}t ||S )Ncross_entropy_fwdc            	         s   t  } tdj  }t|  f|}rt|  f|nd }t| f}d urAdkr:t| t  f|}n	t| f}nd }tt| f}rQtt| fnd }t  d}t j|||||||t	dt j
jdddd
S )	Nr)   r   )r   r   Tuse_tvm_ffi_env_stream--enable-tvm-ffioptions)rN   sym_intrH   rI   r5   fake_tensorr
   r   compiler   runtimemake_fake_stream)		batch_symdivx_cutedx_cutetarget_cutetarget_logit_cute	loss_cutelse_cutecross_entropy_opr   r   has_dxhas_lsetarget_dtypetarget_logit_dtypetarget_logit_ndimr$   r%   _compile  s6   z,_compile_cross_entropy_fwd.<locals>._compiler   )	r   r   r   r   r   r   r   keyr   r$   r   r%   _compile_cross_entropy_fwd   s   
 r   zquack::cross_entropy_fwd_out>   dxr   loss)mutates_argsr   r   r   r   r   r   r@   returnc              	   C   s4  |   dks
J d|  dksJ d| jr|jsJ d| jtjtjtjfv s-J d|jtjtjfv s:J d|durR|jsEJ d	|jtjtjtjfv sRJ |dur]|js]J d
| 	d}t
| j }t
|j }	|durut
|j nd}
|dur~|jnd}t||	|
||du|du|| |||||t| dS )a>  Cross entropy forward pass.

    Args:
        x: Input logits tensor of shape (M, N)
        target: Target class indices tensor of shape (M,)
        target_logit: (M, K) or (M,).
            If provided, the target logit will be read from this tensor instead of x.
        loss: Output loss tensor of shape (M,)
        lse: Optional output log-sum-exp tensor of shape (M,)
        dx: Optional output gradient tensor of shape (M, N)
        ignore_index: Index to ignore in loss computation

    Returns:
        None (mutates loss, lse, and optionally dx in-place)
    r   Input must be 2Dr   Target must be 1DTensors must be on CUDA deviceUnsupported input dtypeTarget must be int32 or int64Nz$Target logits must be on CUDA devicezdx must be on CUDA device)dimis_cudar   torchfloat16bfloat16float32int32int64rK   r   ndimr   r   )r   r   r   r   r   r   r@   r   r   r   r   r   r$   r$   r%   cross_entropy_fwd_out&  s6   


r   c                 C   s   ddl m} |rJt| dtjsL| d}t| j }	t|j }
|d ur)t|j nd }|d ur2|jnd }t	|	|
|||d u|d u| t
|	|
| d S d S d S Nr   )COMPILE_ONLYr   )quack.cache_utilsr  
isinstancerK   r   SymIntr   r   r   r   _compile_cross_entropy_backward)r   r   r   r   r   r   r@   r  r   r   r   r   r   r$   r$   r%   _cross_entropy_fwd_out_fake[  s&   


	r  F
return_lse	return_dxinplace_backwardc                 C   s   |  d}| j}tj||tjd}	|rtj||tjdnd }
|r)|s't| n| nd }t| |||	|
|| |r>|r>|	|
|fS |rD|	|
fS |rJ|	|fS |	S )Nr   )devicer   )rK   r  r   emptyr   
empty_liker   )r   r   r   r@   r  r	  r
  Mr  r   r   r   r$   r$   r%   r   |  s   
	
r   c                   @   s   e Zd Zdeej defddZdd Zdefdd	Z	e
jd
e
jde
jde
jde
jde
jdedejfddZe
jd
e
jde
jde
jde
jde
jdede
jde
jde
jdeje fddZdS )CrossEntropyBackwardr   r   c                 C   s   || _ || _d|j | _d S )Nr)   )r   r   r5   rC   )r!   r   r   r$   r$   r%   r     s   zCrossEntropyBackward.__init__c                 C   s.   t | jd}dD ]\}}||kr|  S qdS )Nr   r&   r,   )minr   r-   r$   r$   r%   r0     s   z%CrossEntropyBackward._threads_per_rowrC   c           	      C   s   | j | dksJ d| j  d| t| j d}|dkrdnd}|  }|| }t|| |}||| | f}tj| j|||d}|||fS )Nr   zInput N z! is not divisible by vector size r   r)   r,   )num_copy_elems)r   r  r0   rN   rO   r   tiled_copy_2dr   )	r!   rC   r   rU   rT   cols_per_blocknum_blocks_NrS   rR   r$   r$   r%   rJ     s   $

z$CrossEntropyBackward._get_tiled_copyr:   r;   mDLossr?   r>   r@   rA   c                    s   |j  jksJ |j  jksJ t jd jj } j|d\}	}
}|	j} fdd|||fD \}}} |||||||j	|
|	|
j
t|j	d |
d t|j	d |
d dg|ddg|d d S )Nr)   rB   c                    s   g | ]}t j|d  jdqS )r   )r   rK   )layout_utilsexpandr   )r[   Xr!   r$   r%   r`     s    z1CrossEntropyBackward.__call__.<locals>.<listcomp>r   r   )rD   rE   rA   )rF   r   rH   rI   r   r5   rJ   rK   rL   rP   rM   rN   rO   )r!   r:   r;   r  r?   r>   r@   rA   rC   rR   rS   rT   rU   r$   r  r%   rV     s8   

zCrossEntropyBackward.__call__rP   rS   rR   rT   c           *         s  t j \}}}t j \ }tj }|j|jt j	dddd}t 
|} fdd|||fD \}}}|	|}||}||}||d }||}||}dd ||fD \}}t|d	 d	  d
k}|rtd ntj|||d	 d}ttj|d}|d
 d
 }||d
 k r|||dd t j  t jd
 t| rt|||jj  t || | t}tj} tj}!tj}"||d
 k rt|| } t| |k}#|#st|| }!t|| }"ttj }$t jj!||$ |"|$  dd}%|%d }&t "|t}'tj#t $|ddD ]}(||( d	 | k|'|(< qt %|' |&|%})|)|! })|&|)|j ||d
 k rD||| d S d S )Nrb   rc   r*   re   c                    rW   r$   rX   rZ   r^   bidyrS   r$   r%   r`     ra   z/CrossEntropyBackward.kernel.<locals>.<listcomp>rg   c                 S   s   g | ]}t |qS r$   )rN   r   )r[   thrr$   r$   r%   r`     s    r   r   rh   ri   Trk   ro   rt   rr   )'rN   ru   rv   rw   rz   r{   r|   r}   rF   r~   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r
   r   r   r   rH   r   r   r   r   r   rK   wherer   )*r!   r:   r;   r  r?   r>   r@   rP   rS   rR   rT   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   dlossr   r   r   r   prob_shiftedmaskr   gradr$   r  r%   rL     sb   

"






zCrossEntropyBackward.kernelN)r   r   r   r   rz   r   r   r   r0   rJ   rN   r   r   r   r   r   rV   rL   r   r   r   r$   r$   r$   r%   r    sT    )	
r  c                    s&   d f} fdd}t ||S )Ncross_entropy_bwdc                     s   t  } tdj  }t|  f|gd \}}t| f}tt| fgd \}}t }t j||||||t	dt j
jdddd	S )Nr)   r   r   Tr   r   r   )rN   r   rH   rI   r5   r   r
   r  r   r   r   r   )r   r   r   r   r   
dloss_cuter   cross_entropy_backward_opr   r   r   r$   r%   r   (  s"   
z1_compile_cross_entropy_backward.<locals>._compiler   )r   r   r   r   r   r$   r%  r%   r  $  s   
r  r  c           	      C   s<  |   dks
J d|  dksJ d|  dksJ d|  dks(J d| jd |jd ks6J d| jd |jd ksDJ d| jd |jd ksRJ d| jr^|jr^|jr^|jsbJ d	| jtjtjtjfv sqJ d
|jtjtj	fv s~J d| 
d}t| j }t|j }t|||| ||||t| dS )a<  Cross entropy backward pass.
    Args:
        x: Input logits tensor of shape (M, N)
        target: Target class indices tensor of shape (M,)
        dloss: Upstream gradients tensor of shape (M,)
        lse: Log-sum-exp values tensor of shape (M,)
    Returns:
        Input gradients tensor of shape (M, N)
    r   r   r   r   zdloss must be 1Dzlse must be 1Dr   zBatch dimensions must matchr   r   r   N)r   rP   r   r   r   r   r   r   r   r   rK   r   r  r   )	r   r   r  r   r   r@   r   r   r   r$   r$   r%   _cross_entropy_backward>  s$   



r&  zquack::cross_entropy_bwd_outc                 C   s   t | ||||| d S )N)r&  r   r   r  r   r   r@   r$   r$   r%   cross_entropy_bwd_outc  s   	r(  c           
      C   sX   ddl m} |r(t| dtjs*| d}t| j }t|j }	t||	| d S d S d S r  )	r  r  r  rK   r   r  r   r   r  )
r   r   r  r   r   r@   r  r   r   r   r$   r$   r%   _cross_entropy_bwd_out_fakeo  s   



r)  c                 C   sL   |rt j s| }t| |||| |d |S t | }t| |||||d |S )Nr'  )r   compileris_compilingr&  r  r(  )r   r   r  r   r@   r
  r   r$   r$   r%   r"    s   
r"  c                   @   s&   e Zd ZedddZedd ZdS )	CrossEntropyFunctionNr   Fc                 C   sR   |d u rt |||dd\}}nt ||||dd\}}| ||| || _|| _|S )NT)r@   r  )r   r@   r  )r   save_for_backwardr@   r
  )ctxr   r   lse_partialr@   r
  r   r   r$   r$   r%   forward  s   

zCrossEntropyFunction.forwardc                 C   s2   | j \}}}t||||| j| jd}|d d d d fS )N)r
  )saved_tensorsr"  r@   r
  )r.  r  r   r   r   r   r$   r$   r%   backward  s
   zCrossEntropyFunction.backward)Nr   F)r   r   r   staticmethodr0  r2  r$   r$   r$   r%   r,    s
    r,  meanr/  	reduction)noner4  sumc                 C   s^   t | ||||}|dkr| ||k   S |dkr!| S |dkr'|S td| d)a  Cross entropy loss with automatic differentiation support.

    Args:
        x: Input logits tensor of shape (M, N)
        target: Target class indices tensor of shape (M,)
        lse_partial: Optional precomputed log-sum-exp partial results
        reduction: Specifies the reduction to apply to the output:
            'none': no reduction will be applied (default)
            'mean': the sum of the output will be divided by the number of elements
            'sum': the output will be summed
        inplace_backward: Whether to perform backward pass in-place
        ignore_index: Index to ignore in loss computation (loss will be 0 for these indices)

    Returns:
        Cross entropy loss tensor:
            - If reduction='none': tensor of shape (M,) with per-example losses
            - If reduction='mean': scalar tensor with mean loss
            - If reduction='sum': scalar tensor with sum of losses
    r4  r7  r6  zInvalid reduction mode: z*. Expected one of 'none', 'mean', or 'sum')r,  applyr7  float
ValueError)r   r   r/  r@   r5  r
  r   r$   r$   r%   cross_entropy  s   
r;  )r   )Nr   FFF)r   F)Nr   r4  F)=rH   	functoolsr   r   typingr   r   r   r   r   cuda.bindings.driverbindingsdriverr   rz   cutlass.cuterN   r   r	   r
   r   r   quack.utilsr{   quack.copy_utilsr   quack.layout_utilsr  quack.compile_utilsr   r   quack.reducer   r   quack.reduction_baser   r  r   quack.cute_dsl_utilsr   r   r   library	custom_opr   r   register_faker  r   tupler   r  r  r&  r(  r)  r"  autogradFunctionr,  r;  r$   r$   r$   r%   <module>   sT   [
14#
 

%
