o
    c۷il'                     @   s  d dl mZmZ d dlZd dlmZ d dlm  mZ d dlm	Z	 d dl
mZmZ d dlmZmZ d dlmZmZmZ d dlmZ 				
d%de	de	dee	 de	deded dede	fddZ			d&de	de	dee	 de	deded de	fddZ			d'de	de	de	dedededee	e	e	ee	 ee	 f fddZG dd dejjZ					d(de	de	de	dededed  dede	fd!d"ZG d#d$ d$ej Z!dS ))    )OptionalLiteralN)Tensor)
custom_fwd
custom_bwd)cross_entropycross_entropy_fwd_out)gemmgemm_addgemm_add_inplace)linear_fwd_convert_typemeanFxweightbiastargetignore_index	reductionnoner   suminplace_backwardreturnc                 C   s    t | ||}t|||||dS )Nr   r   r   Flinearr   )r   r   r   r   r   r   r   y r   P/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/linear_cross_entropy.pylinear_cross_entropy_func   s   	
r!   c                 C   s    t | ||}t j||||dS )N)r   r   r   )r   r   r   r   r   r   r   r   r   r    linear_cross_entropy_func_ref   s   r"      T
chunk_sizetunedc              
      s|  | j \}}|j \}}	| j}
|  d   } d dksJ d|d dks'J tj||
tjd}tj |f|
| jd}t| }|dkrLtj|tjdnd}d}d}tt fdd	| |||fD  D ]S\}\}}}}|j d }|d| }tj	||j
|d
 |}t||d|d||d tj	|||d
 ||d kr|}|}qc|dkrt|j|||d qct|j|||d qc|||||fS )a4  
    Chunked forward pass for linear cross entropy.

    Splits input along batch dimension, computes matmul and cross_entropy_fwd
    for each chunk, stores dx for each chunk, and accumulates dw.

    Returns:
        loss: (B*L,) loss values
        dx: (B*L, d) gradient w.r.t. input
        dw: (V, d) gradient w.r.t. weight (accumulated across chunks except last)
        last_dlogits_chunk: (chunk_len, V) gradient of last chunk's logits (for deferred dw computation)
        last_x_chunk: (chunk_len, d) last chunk's input (for deferred dw computation)
          r   z chunk_size must be multiple of 8)devicedtype)r)   Nc                 3   s    | ]}|  V  qd S )N)split).0tr$   r   r    	<genexpr>Q   s    z3chunked_linear_cross_entropy_fwd.<locals>.<genexpr>)out)losslsedxr   )r/   r%   r%   )shaper(   torchemptyfloat32r)   
empty_like	enumeratezipmmmTr   r	   Tr   )r   r   r   r$   r   r%   B_LdV_r(   
num_chunksr0   logits_chunk_preallocatedr2   dwlast_dlogits_chunklast_x_chunkix_chunktarget_chunk
loss_chunkdx_chunk	chunk_lenlogits_chunkdlogits_chunkr   r-   r     chunked_linear_cross_entropy_fwd*   sH   




rO   c                   @   sd   e Zd Zeedd				ddeded	ed
eded dedefddZ	ee
dddd ZdS )!ChunkedLinearCrossEntropyFunctioncuda)device_typer   r   r#   Tr   r   r   r   r   r   r   r$   r%   c                 C   s   |j | _t||\}}|jdd }|d|jd }t||||||d\}	}
}}}|	 }|dkr4dn	d||k   }| |
|||| || _	|| _
|| _|| _|du rY|S || S )zO
        Forward pass computes loss and stores dx and dw for backward.
        Nr3   r   g      ?)r)   weight_dtyper   r4   reshaperO   r   floatsave_for_backwardbatch_shaper   r   r%   )ctxr   r   r   r   r   r$   r%   rY   r0   r2   rD   rE   rF   loss_sum
loss_scaler   r   r    forwardr   s    z)ChunkedLinearCrossEntropyFunction.forwardc              	   C   s   | j \}}}}}| j}|dur|| }|| |jg | j|jd R  }|du r6t|j|| j||d}n| j|j	krHt
|j|||||d nt|j||||| j|d}||dddddfS )z
        Backward pass scales pre-computed gradients by dloss and completes
        the last chunk's dw computation.
        dloss is a scalar.
        NrT   )	out_dtypealphar%   )r_   betar%   )r_   r`   r^   r%   )saved_tensorsr%   mul_rV   rY   r4   r	   r=   rU   r)   r   r
   )rZ   dlossr2   rD   rE   rF   r\   r%   r   r   r    backward   s8   
	z*ChunkedLinearCrossEntropyFunction.backwardN)r   r   r#   T)__name__
__module____qualname__staticmethodr   r   intr   boolr]   r   rd   r   r   r   r    rP   q   s2    rP   rS   c              	   C   s0   |dvrt d| t| ||||||}|S )a  
    Chunked linear cross entropy with automatic differentiation support.

    Args:
        x: Input tensor of shape (B*L, d)
        weight: Weight tensor of shape (V, d)
        target: Target indices of shape (B*L,)
        chunk_size: Size of chunks to process
        ignore_index: Index to ignore in loss computation
        reduction: Type of reduction to apply
        tuned: Whether to use tuned kernels

    Returns:
        Loss tensor with specified reduction
    rS   zInvalid reduction: )
ValueErrorrP   apply)r   r   r   r$   r   r   r%   r0   r   r   r    chunked_linear_cross_entropy   s   rm   c                       sr   e Zd Z								ddededed	ed
ed dee dededdf fddZdededefddZ	  Z
S )LinearCrossEntropyFr   r   NTin_featuresout_featuresr   r   r   r   r$   r   r%   r   c                    s8   t  j||||	|
d || _|| _|| _|| _|| _d S )N)r   r(   r)   )super__init__r   r   r$   r   r%   )selfro   rp   r   r   r   r$   r   r%   r(   r)   	__class__r   r    rr      s   
zLinearCrossEntropy.__init__inputr   c              	   C   s   | j d u rK|jrK|ddkrK| jd dkrK| jd dkrK|jd d  d dkrK| jd urK| jd dkrK| jdv rKt	|| j
|| j| j| j| jdS t|| j
| j || j| j| jdS )NrT   r&   r'   r   rS   )r$   r   r   r%   r   )r   is_cudastridero   rp   r4   numelr$   r   rm   r   r   r%   r!   r   )rs   rv   r   r   r   r    r]      s8   



zLinearCrossEntropy.forward)Fr   r   NFTNN)re   rf   rg   ri   rj   r   r   rr   r   r]   __classcell__r   r   rt   r    rn      s:    	rn   )r   r   F)r   r   )r#   r   T)r#   r   r   T)"typingr   r   r5   torch.nnnntorch.nn.functional
functionalr   r   	torch.ampr   r   quack.cross_entropyr   r   quack.gemm_interfacer	   r
   r   quack.linearr   ri   rj   r!   r"   tuplerO   autogradFunctionrP   rm   Linearrn   r   r   r   r    <module>   s   


GS
 