o
    c۷i.(                     @   s<  d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZmZ d dl
mZmZ d dl
mZ d d	l
mZmZ d d
lmZ d dlmZmZmZmZmZmZmZmZ ed ZG dd dZG dd dZG dd deZ G dd deZ!G dd dej"j#Z$dd Z%	d de&fddZ'G dd dej(Z)dS )!    )Literal)partialN)Tensor	rearrange)linear_act_funcact_linear_func)linear_gated_funcgated_linear_func)linear_fwd_convert_type)_recompute_act_postact_recompute_gated_postact)gate_fn_map)act_to_pytorch_fn_mapgated_to_pytorch_fn_mapgemmgemm_add_inplace
gemm_gatedgemm_dgatedgemm_act	gemm_dact)gelu_tanh_approxrelurelu_sqswiglu
swiglu_oaireglugeglugluc                   @   sL   e Zd ZeZeZeeddZ	eeddZ
eeddZeeddZeeZdS )_MLPOpsTdynamic_schedulerN)__name__
__module____qualname__r   
matmul_fwdr   matmul_fwd_actr   r   matmul_bwd_dactmatmul_bwd_dxmatmul_bwd_dwr   matmul_bwd_dw_inplacestaticmethodr   recompute_postact r-   r-   ?/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/mlp.pyr   *   s    r   c                   @   sd   e Zd ZeeddZeeddZeedddZ	eedddZ
eedddZeedddZeeZdS )_MLPUntunedOpsFtunedTr!   r1   N)r"   r#   r$   r   r   r%   r   r&   r   r'   r(   r)   r   r*   r+   r   r,   r-   r-   r-   r.   r/   4   s    r/   c                   @   s$   e Zd ZeZeeddZee	Z
dS )_MLPGatedOpsTr    N)r"   r#   r$   r   r&   r   r   r'   r+   r   r,   r-   r-   r-   r.   r3   >   s    r3   c                   @   s.   e Zd ZeeddZeedddZee	Z
dS )_MLPGatedUntunedOpsFr0   Tr2   N)r"   r#   r$   r   r   r&   r   r'   r+   r   r,   r-   r-   r-   r.   r4   D   s    r4   c                   @   s(   e Zd ZdZedd Zedd ZdS )MLPRecomputeFunca  MLP with activation recomputation: saves only x (not preact) to reduce memory.

    In backward, recomputes preact = x @ W1.T (one extra matmul) instead of loading it
    from saved tensors. This trades compute for memory:
      - Saves: batch * 2 * hidden * dtype_size bytes of activation memory
      - Costs: one extra GEMM (x @ W1.T) during backward

    Ops class selects between non-gated (gemm_act/gemm_dact) and gated (gemm_gated/gemm_dgated)
    variants, as well as tuned/untuned.
    c                 C   s<  t |||\}}}tjjddd |j| _|| _|| _|| _||}}|j	d d }	|
d|j	d }
|j|
|j|d\}}|||j}| j}|d pT|d pT|d }|d p\|d }|ra|nd }|rg|nd }|rm|nd }| ||||rx|nd |r}|nd  |j
g |	|j	d R  W  d    S 1 sw   Y  d S )	NcudaFenabled
activationr         )r   torchampautocastdtypeweight_dtypefuse_grad_accumr;   opsshapereshaper&   Tr%   needs_input_gradsave_for_backward)ctxxweight1weight2r;   rC   rD   
weight1_og
weight2_ogbatch_shapex_flat_preactpostactoutrH   any_grad	need_dactsaved_xsaved_w1saved_w2r-   r-   r.   forwardV   s4   


$zMLPRecomputeFunc.forwardc              
   C   s  t jjddd | j}| j\}}}}}|jd d }|d|jd  }|d ur3|d|jd nd }	| jd p>| jd }
|
pE| jd }|
r\|	|	|j
}|j|||| jd\}}n|ro|	|	|j
}||| j}d }nd	\}}t| ||||j|j| jd }| jd r|||}|jg ||jd R  }nd }t| ||	||j|j| jd }|||d d d fW  d    S 1 sw   Y  d S )
Nr6   Fr7   r9   r   r<   r=   r:   )NN)r>   r?   r@   rD   saved_tensorsrE   rF   
contiguousrH   r%   rG   r'   r;   r,   _compute_weight_gradr)   r*   r(   )rJ   doutrD   rK   rL   rM   rN   rO   rP   rQ   rV   rU   preactdpreactrS   dweight2dxdweight1r-   r-   r.   backwards   sT   


	$zMLPRecomputeFunc.backwardN)r"   r#   r$   __doc__r+   rZ   rd   r-   r-   r-   r.   r5   J   s    
r5   c                 C   sf   |sd S | d|jd }| jr|jd u stj r#||j|| jdS ||j||j |j}d |_|S )Nr9   )	out_dtype)	rF   rE   rC   gradr>   compileris_compilingrG   rB   )rJ   r^   rK   	weight_og	matmul_fnmatmul_inplace_fn
needs_graddweightr-   r-   r.   r]      s   r]   FTr;   c                 C   s   |r|t v }|r|rtnt}n|rtnt}t| |||||S |t v }|r'tnt}	|r-t	nt
}
|	| ||t ||d\}}|
||||||d}|S )N)store_preactrC   r1   )r;   rC   r1   )r   r3   r4   r   r/   r5   applyr	   r   r
   r   r>   is_grad_enabled)rK   rL   rM   r;   rC   r1   	recomputegatedrD   fc1_fnfc2_fnr_   rS   rT   r-   r-   r.   mlp_func   s4   
rv   c                	       sX   e Zd Z											ddededed	ef fd
dZdedefddZ  ZS )MLPNFr   r<   Tr;   rC   r1   rr   c                    s   ||	d}t    |d ur|n|}|| _|tv | _|d u r+| jr'td| nd| }|dkr9|| d | | }| jr@d| n|}tj||fd|i|| _| jr]dd d	d f| jj	_
tj||fd|i|| _|
| _|| _|| _d S )
N)devicerA   gUUUUUU@   r<   r=   biasc                 S   s   t | dddS )Nz(d two) e -> two d er=   )twor   wr-   r-   r.   <lambda>   s    zMLP.__init__.<locals>.<lambda>c                 S   s
   t | dS )Nztwo d e -> (d two) er   r|   r-   r-   r.   r~      s   
 )super__init__r;   r   rs   intnnLinearfc1weight_muon_reshape_functionsfc2rC   r1   rr   )selfin_featureshidden_featuresout_featuresbias1bias2r;   multiple_ofrx   rA   rC   r1   rr   factory_kwargsfc1_out	__class__r-   r.   r      s&   




zMLP.__init__inputreturnc              	   C   s   | j jd u rF| jjd u rF|jrF|ddkrF| j jd dkrF| j j| jr&dnd dkrF| jjd dkrFt|| j j	| jj	| j
| j| j| jdS |  |}| jret| j
 |dd d df |ddd df }nt| j
 |}| |S )	Nr9   r<      r      )r;   rC   r1   rr   .r=   )r   rz   r   is_cudastrider   r   rs   rv   r   r;   rC   r1   rr   r   r   )r   r   yr-   r-   r.   rZ      s,   

.
zMLP.forward)NNFFr   r<   NNFTF)	r"   r#   r$   
Activationboolr   r   rZ   __classcell__r-   r-   r   r.   rw      s,    $rw   )FTF)*typingr   	functoolsr   r>   torch.nnr   r   einopsr   quack.linearr   r   r	   r
   r   r   r   quack.activationr   quack.gemm_interfacer   r   r   r   r   r   r   r   r   r   r/   r3   r4   autogradFunctionr5   r]   strrv   Modulerw   r-   r-   r-   r.   <module>   s4   (

^
 