o
    پiCa                     @   s  d dl Z d dlmZ d dlmZmZmZ d dlZd dlmZ d dl	m
  mZ d dlZd dlmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZm Z  d dl!m"Z" d d	l#m$Z$ G d
d de"Z%ej&j'dh dd	d2dededee dedee dee de(ddfddZ)i e)_*					d3dejdejdeej de(de+de+de+deje,ej B fddZ-G d d! d!Z.	d2dejdejd"ejdejdejddfd#d$Z/i e/_*ej&j'd%dhd	d2dejdejd"ejdejdejde(ddfd&d'Z0		d4dejdejd"ejdejde(de+ddfd(d)Z1G d*d+ d+ej2j3Z4			,	d5dejdejd-eej de(d.ed/ de+dejfd0d1Z5dS )6    N)partial)OptionalTypeLiteral)Tensor)Int32Int64Float32Boolean
const_expr)make_fake_tensor)
row_reduceonline_softmax_reduce)ReductionBase)torch2cute_dtype_mapc                       s   e Zd Zddeej dedef fddZdd Z	d	d
 Z
ejdejdejdeej dejdeej deej dedejfddZejdejdejdejdejdeej deej dedejdejdeje fddZ  ZS )CrossEntropyTdtypeNonline_softmaxc                    sJ   t  j|||s	dnd|stntd || _|dks|r d | _d S d| _d S )N      )stagereduction_dtype @  smem)super__init__r	   r   r   reload_from)selfr   r   r   	__class__ G/home/ubuntu/.local/lib/python3.10/site-packages/quack/cross_entropy.pyr      s   

 zCrossEntropy.__init__c                 C   s(   | j }dD ]\}}||kr|  S qdS )N)@      )      )i       )i   r$   )r   r&      )r   r   r   limitthreadsr!   r!   r"   _threads_per_row%   s   zCrossEntropy._threads_per_rowc                 C   sT   | j }t| jjdkrg d}ng d}|D ]\}}||kr$|| _ d S qd| _d S )Nr'   )r   r   )i   r   )      )   r%   )r.   )r/   r   )r1   r0   )i   r%   )r   r   r   width	cluster_n)r   r   
thresholdsr+   clusterr!   r!   r"   _set_cluster_n,   s   

zCrossEntropy._set_cluster_nmXmTargetmTargetLogitmLossmLSEmdXignore_indexstreamc	                 C   s   |j | jksJ t|d u r|}t|d ur|j | jksJ |   t|j j}	t|d ur7tt|	|j j}	t| jd|	 }
| j	|
d\}}}|j
}| ||||||||||
jt|jd |d | jdg|ddgt| jdkryd| jdgnd |d d S )Nr&   vecsizer   r   )gridblockr5   r>   )element_typer   r   r6   r2   maxmathgcdr   _get_tiled_copysizekernellaunchcuteceil_divshaper3   )r   r7   r8   r9   r:   r;   r<   r=   r>   largest_dtype_widthr@   
tiled_copytiler_mnthreads_per_rownum_threadsr!   r!   r"   __call__8   s<   
zCrossEntropy.__call__rP   rO   rQ   c           2   
      s  t j \}}}t j \ }}t| jdkrtdnt j d |	j}|j}t |} fdd||fD \}}t	j
 }|j|jt jdddd}| ||\}}|	|}||}||}||d	 }t |}t|d d | j k}|rd ntj|||d d
}ttj|d}t |	t jj }| ||| |d d }tj} ||d k rt|| } ||d k r|||dd t j  t jd t| rt
|||jj   t !|| |" #t$}!t$j}"t%| |k}#||d k r0|d d dkr0|#s0tt &|jdkrt$||| f }"nt &|jdks*J t$|| }"t| j' rt(|!t j)j*|
|d t| jdkrL|d nd t$j  t| jdkr\t jj+nd d}$t| j,dkrut !|| |" #t$}!t-.t-j/}%t j-j0|!|% |$|%  dd}&t(|&t j)j1|
|d t| jdkr|d nd dd}'nt2|!|
|d |t| jdkrt jj+nd t|d ud\}$}'}&|d d dkr||d k r| jdkst j3 dkr|$t j-j4|'dd }(|#s|(|" nt$j})||)||< t|d ur|(||< t|d ur|'dks!|'|'ks!|#s!t j5|'nt$j}*|&|* }+t 6| f},||,}-t |-}.||}/t |t$}0|07|+ |#sqt	j8t |ddD ]}1|/|1 d | krg|0|1 n|0|1 d |0|1< qX|.7|0" #|.j ||d k r||.|- d S d S d S )Nr   r   c                       g | ]}t | fqS r!   rK   
local_tile.0mTbidx	cluster_yrP   r!   r"   
<listcomp>x       z'CrossEntropy.kernel.<locals>.<listcomp>r   r   orderr'   byte_alignment)r   NNNr+   predTis_asyncr   )NNr   )init_valhook_fnr   Ffastmath)NNr   g        )rj   )rk   return_exp_xunroll_full      ?)9rK   arch
thread_idx	block_idxr   r3   layout_tv_tiledrM   make_identity_tensorcutlassutilsSmemAllocatorallocate_tensorrC   make_ordered_layout#_allocate_reduction_buffer_and_mbar	get_slicepartition_Spartition_Dmake_fragment_like
copy_utilspredicate_kr   copyrH   	WARP_SIZE_initialize_clusterr   zerocp_async_commit_groupcp_async_wait_groupfill_oobinfautovec_copyloadtor	   r
   rankr   r   ReductionOpMAXcluster_waitr   rE   log2eexp2ADDr   block_idx_in_clusterlog
rcp_approxrV   storerange)2r   r7   r8   r9   r:   r;   r<   r=   rP   rO   rQ   tidx_	tv_layoutrM   idXgXcXr   sXreduction_buffermbar_ptrthr_copytXgXtXsXtXcXtXrX	is_even_NtXpXr   	num_warpsrowtargetxtarget_logitshould_ignoremax_xlog2_eexp_xdenomlseloss_val	denom_invprobsgdXtXgdXtXrdXtXcFull	tXrdX_f32ir!   rZ   r"   rI   b   s   $







&		




.zCrossEntropy.kernel)T)__name__
__module____qualname__r   rw   Numericintboolr   r-   r6   rK   jitr   r   r   cudaCUstreamrS   rI   Shape	TiledCopy	Constexpr__classcell__r!   r!   r   r"   r      sX    "	)	
r   zquack::cross_entropy_fwd_out>   dxr   loss)mutates_argsr   r   r   r   r   r   r=   returnc                 C   s(  |   dks
J d|  dksJ d| jr|jsJ d| jtjtjtjfv s-J d|jtjtjfv s:J d|durR|jsEJ d	|jtjtjtjfv sRJ |dur]|js]J d
| 	d}t
| j }t
|j }	|durut
|j nd}
||	|
||du|duf}|tjvrt }td|j |}t|||f|}|durt|||f|nd}t|	|f}|dur|jdkrt|
|t f|}n	t|
|f}nd}tt|f}|durtt|fnd}t|||du d}tj|||||||tdtjjdddd
tj|< tj| | |||||t| dS )a>  Cross entropy forward pass.

    Args:
        x: Input logits tensor of shape (M, N)
        target: Target class indices tensor of shape (M,)
        target_logit: (M, K) or (M,).
            If provided, the target logit will be read from this tensor instead of x.
        loss: Output loss tensor of shape (M,)
        lse: Optional output log-sum-exp tensor of shape (M,)
        dx: Optional output gradient tensor of shape (M, N)
        ignore_index: Index to ignore in loss computation

    Returns:
        None (mutates loss, lse, and optionally dx in-place)
    r   Input must be 2Dr   Target must be 1DTensors must be on CUDA deviceUnsupported input dtypeTarget must be int32 or int64Nz$Target logits must be on CUDA devicezdx must be on CUDA devicer&   )r   r   Tuse_tvm_ffi_env_stream--enable-tvm-ffioptions)dimis_cudar   torchfloat16bfloat16float32int32int64rH   r   cross_entropy_fwd_outcompile_cacherK   sym_intrE   rF   r2   fake_tensorndimr	   r   compiler   runtimemake_fake_stream)r   r   r   r   r   r   r=   r   r   target_dtypetarget_logit_dtypecompile_key	batch_symdivx_cutedx_cutetarget_cutetarget_logit_cute	loss_cutelse_cutecross_entropy_opr!   r!   r"   r      sj   



r   F
return_lse	return_dxinplace_backwardc                 C   s   |  d}| j}tj||tjd}	|rtj||tjdnd }
|r)|s't| n| nd }t| |||	|
|| |r>|r>|	|
|fS |rD|	|
fS |rJ|	|fS |	S )Nr   )devicer   )rH   r   r   emptyr   
empty_liker   )r   r   r   r=   r   r   r   Mr   r   r   r   r!   r!   r"   cross_entropy_fwdK  s   
	
r   c                   @   s   e Zd Zdeej defddZdd Zdefdd	Z	e
jd
e
jde
jde
jde
jde
jdedejfddZe
jd
e
jde
jde
jde
jde
jdede
jde
jde
jdeje fddZdS )CrossEntropyBackwardr   r   c                 C   s   || _ || _d|j | _d S )Nr&   )r   r   r2   r@   )r   r   r   r!   r!   r"   r   e  s   zCrossEntropyBackward.__init__c                 C   s.   t | jd}dD ]\}}||kr|  S qdS )Nr   r#   r)   )minr   r*   r!   r!   r"   r-   j  s   z%CrossEntropyBackward._threads_per_rowr@   c           	      C   s   | j | dksJ d| j  d| t| j d}|dkrdnd}|  }|| }t|| |}||| | f}tj| j|||d}|||fS )Nr   zInput N z! is not divisible by vector size r   r&   r)   )num_copy_elems)r   r   r-   rK   rL   r   tiled_copy_2dr   )	r   r@   r   rR   rQ   cols_per_blocknum_blocks_NrP   rO   r!   r!   r"   rG   q  s   $

z$CrossEntropyBackward._get_tiled_copyr7   r8   mDLossr<   r;   r=   r>   c                    s   |j  jksJ |j  jksJ t jd jj } j|d\}	}
}|	j} fdd|||fD \}}} |||||||j	|
|	|
j
t|j	d |
d t|j	d |
d dg|ddg|d d S )Nr&   r?   c                    s   g | ]}t j|d  jdqS )r   )r   rH   )layout_utilsexpandr   )rX   Xr   r!   r"   r]     s    z1CrossEntropyBackward.__call__.<locals>.<listcomp>r   r   )rA   rB   r>   )rC   r   rE   rF   r   r2   rG   rH   rI   rM   rJ   rK   rL   )r   r7   r8   r  r<   r;   r=   r>   r@   rO   rP   rQ   rR   r!   r	  r"   rS   ~  s8   

zCrossEntropyBackward.__call__rM   rP   rO   rQ   c           *         s  t j \}}}t j \ }tj }|j|jt j	dddd}t 
|} fdd|||fD \}}}|	|}||}||}||d }||}||}dd ||fD \}}t|d	 d	  d
k}|rtd ntj|||d	 d}ttj|d}|d
 d
 }||d
 k r|||dd t j  t jd
 t| rt|||jj  t || | t}tj} tj}!tj}"||d
 k rt|| } t| |k}#|#st|| }!t|| }"ttj }$t jj!||$ |"|$  dd}%|%d }&t "|t}'tj#t $|ddD ]}(||( d	 | k|'|(< qt %|' |&|%})|)|! })|&|)|j ||d
 k rD||| d S d S )Nr_   r`   r'   rb   c                    rT   r!   rU   rW   r[   bidyrP   r!   r"   r]     r^   z/CrossEntropyBackward.kernel.<locals>.<listcomp>rd   c                 S   s   g | ]}t |qS r!   )rK   r   )rX   thrr!   r!   r"   r]     s    r   r   re   rf   Trh   rl   rq   ro   )'rK   rr   rs   rt   rw   rx   ry   rz   rC   r{   rv   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r	   r   r   r
   rE   r   r   r   r   r   rH   wherer   )*r   r7   r8   r  r<   r;   r=   rM   rP   rO   rQ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   dlossr   r   r   r   prob_shiftedmaskr   gradr!   r
  r"   rI     sb   

"






zCrossEntropyBackward.kernelN)r   r   r   r   rw   r   r   r   r-   rG   rK   r   r   r   r   r   rS   rI   r   r   r   r!   r!   r!   r"   r   d  sT    )	
r   r  c                 C   s  |   dks
J d|  dksJ d|  dksJ d|  dks(J d| jd |jd ks6J d| jd |jd ksDJ d| jd |jd ksRJ d| jr^|jr^|jr^|jsbJ d	| jtjtjtjfv sqJ d
|jtjtj	fv s~J d| 
d}t| j }t|j }|||f}	|	tjvrt }
td|j |}t||
|f|gd \}}t||
f}tt|
fgd \}}t||}tj||||||tdtjjdddd	tj|	< tj|	 | ||||t| dS )a<  Cross entropy backward pass.
    Args:
        x: Input logits tensor of shape (M, N)
        target: Target class indices tensor of shape (M,)
        dloss: Upstream gradients tensor of shape (M,)
        lse: Log-sum-exp values tensor of shape (M,)
    Returns:
        Input gradients tensor of shape (M, N)
    r   r   r   r   zdloss must be 1Dzlse must be 1Dr   zBatch dimensions must matchr   r   r   r&   Tr   r   r   N)r   rM   r   r   r   r   r   r   r   r   rH   r   _cross_entropy_backwardr   rK   r   rE   rF   r2   r   r	   r   r   r   r   r   )r   r   r  r   r   r=   r   r   r   r   r   r   r   r   r   
dloss_cuter   cross_entropy_backward_opr!   r!   r"   r    sJ   





r  zquack::cross_entropy_bwd_outc                 C   s   t | ||||| d S )N)r  r   r   r  r   r   r=   r!   r!   r"   cross_entropy_bwd_out/  s   	r  c                 C   sL   |rt j s| }t| |||| |d |S t | }t| |||||d |S )Nr  )r   compileris_compilingr  r   r  )r   r   r  r   r=   r   r   r!   r!   r"   cross_entropy_bwd;  s   
r  c                   @   s&   e Zd ZedddZedd ZdS )	CrossEntropyFunctionNr   Fc                 C   sR   |d u rt |||dd\}}nt ||||dd\}}| ||| || _|| _|S )NT)r=   r   )r   r=   r   )r   save_for_backwardr=   r   )ctxr   r   lse_partialr=   r   r   r   r!   r!   r"   forwardQ  s   

zCrossEntropyFunction.forwardc                 C   s2   | j \}}}t||||| j| jd}|d d d d fS )N)r   )saved_tensorsr  r=   r   )r  r  r   r   r   r   r!   r!   r"   backward`  s
   zCrossEntropyFunction.backward)Nr   F)r   r   r   staticmethodr  r   r!   r!   r!   r"   r  P  s
    r  meanr  	reduction)noner"  sumc                 C   s^   t | ||||}|dkr| ||k   S |dkr!| S |dkr'|S td| d)a  Cross entropy loss with automatic differentiation support.

    Args:
        x: Input logits tensor of shape (M, N)
        target: Target class indices tensor of shape (M,)
        lse_partial: Optional precomputed log-sum-exp partial results
        reduction: Specifies the reduction to apply to the output:
            'none': no reduction will be applied (default)
            'mean': the sum of the output will be divided by the number of elements
            'sum': the output will be summed
        inplace_backward: Whether to perform backward pass in-place
        ignore_index: Index to ignore in loss computation (loss will be 0 for these indices)

    Returns:
        Cross entropy loss tensor:
            - If reduction='none': tensor of shape (M,) with per-example losses
            - If reduction='mean': scalar tensor with mean loss
            - If reduction='sum': scalar tensor with sum of losses
    r"  r%  r$  zInvalid reduction mode: z*. Expected one of 'none', 'mean', or 'sum')r  applyr%  float
ValueError)r   r   r  r=   r#  r   r   r!   r!   r"   cross_entropyi  s   
r)  )r   )Nr   FFF)r   F)Nr   r"  F)6rE   	functoolsr   typingr   r   r   r   r   cuda.bindings.driverbindingsdriverr   rw   cutlass.cuterK   r   r   r	   r
   r   quack.utilsrx   quack.copy_utilsr   quack.layout_utilsr  quack.compile_utilsr   r   quack.reducer   r   quack.reduction_baser   quack.cute_dsl_utilsr   r   library	custom_opr   r   r   r   tupler   r   r  r  r  autogradFunctionr  r)  r!   r!   r!   r"   <module>   s   [T
 
9
