o
    cÛ·i.(  ã                   @   s<  d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZmZ d dl
mZmZ d dl
mZ d d	l
mZmZ d d
lmZ d dlmZmZmZmZmZmZmZmZ ed ZG dd„ dƒZG dd„ dƒZG dd„ deƒZ G dd„ deƒZ!G dd„ dej"j#ƒZ$dd„ Z%	d de&fdd„Z'G dd„ dej(ƒZ)dS )!é    )ÚLiteral)ÚpartialN)ÚTensor©Ú	rearrange)Úlinear_act_funcÚact_linear_func)Úlinear_gated_funcÚgated_linear_func)Úlinear_fwd_convert_type)Ú_recompute_act_postactÚ_recompute_gated_postact)Úgate_fn_map)Úact_to_pytorch_fn_mapÚgated_to_pytorch_fn_mapÚgemmÚgemm_add_inplaceÚ
gemm_gatedÚgemm_dgatedÚgemm_actÚ	gemm_dact)Úgelu_tanh_approxÚreluÚrelu_sqÚswigluÚ
swiglu_oaiÚregluÚgegluÚgluc                   @   sL   e Zd ZeZeZeeddZ	eeddZ
eeddZeeddZeeƒZdS )Ú_MLPOpsT©Údynamic_schedulerN)Ú__name__Ú
__module__Ú__qualname__r   Ú
matmul_fwdr   Úmatmul_fwd_actr   r   Úmatmul_bwd_dactÚmatmul_bwd_dxÚmatmul_bwd_dwr   Úmatmul_bwd_dw_inplaceÚstaticmethodr   Úrecompute_postact© r-   r-   ú?/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/mlp.pyr   *   s    r   c                   @   sd   e Zd ZeeddZeeddZeedddZ	eedddZ
eedddZeedddZeeƒZdS )Ú_MLPUntunedOpsF©ÚtunedT©r!   r1   N)r"   r#   r$   r   r   r%   r   r&   r   r'   r(   r)   r   r*   r+   r   r,   r-   r-   r-   r.   r/   4   s    r/   c                   @   s$   e Zd ZeZeeddZee	ƒZ
dS )Ú_MLPGatedOpsTr    N)r"   r#   r$   r   r&   r   r   r'   r+   r   r,   r-   r-   r-   r.   r3   >   s    r3   c                   @   s.   e Zd ZeeddZeedddZee	ƒZ
dS )Ú_MLPGatedUntunedOpsFr0   Tr2   N)r"   r#   r$   r   r   r&   r   r'   r+   r   r,   r-   r-   r-   r.   r4   D   s    r4   c                   @   s(   e Zd ZdZedd„ ƒZedd„ ƒZdS )ÚMLPRecomputeFuncaï  MLP with activation recomputation: saves only x (not preact) to reduce memory.

    In backward, recomputes preact = x @ W1.T (one extra matmul) instead of loading it
    from saved tensors. This trades compute for memory:
      - Saves: batch * 2 * hidden * dtype_size bytes of activation memory
      - Costs: one extra GEMM (x @ W1.T) during backward

    Ops class selects between non-gated (gemm_act/gemm_dact) and gated (gemm_gated/gemm_dgated)
    variants, as well as tuned/untuned.
    c                 C   s<  t |||ƒ\}}}tjjdddƒ |j| _|| _|| _|| _||}}|j	d d… }	| 
d|j	d ¡}
|j|
|j|d\}}| ||j¡}| j}|d pT|d pT|d }|d p\|d }|ra|nd }|rg|nd }|rm|nd }|  ||||rx|nd |r}|nd ¡ |j
g |	¢|j	d ‘R Ž W  d   ƒ S 1 s—w   Y  d S )	NÚcudaF©Úenabledéÿÿÿÿ©Ú
activationr   é   é   )r   ÚtorchÚampÚautocastÚdtypeÚweight_dtypeÚfuse_grad_accumr;   ÚopsÚshapeÚreshaper&   ÚTr%   Úneeds_input_gradÚsave_for_backward)ÚctxÚxÚweight1Úweight2r;   rC   rD   Ú
weight1_ogÚ
weight2_ogÚbatch_shapeÚx_flatÚ_preactÚpostactÚoutrH   Úany_gradÚ	need_dactÚsaved_xÚsaved_w1Úsaved_w2r-   r-   r.   ÚforwardV   s4   


û$èzMLPRecomputeFunc.forwardc              
   C   sŠ  t jjddd³ | j}| j\}}}}}|jd d… }| d|jd ¡ ¡ }|d ur3| d|jd ¡nd }	| jd p>| jd }
|
pE| jd }|
r\| 	|	|j
¡}|j|||| jd\}}n|ro| 	|	|j
¡}| || j¡}d }nd	\}}t| ||||j|j| jd ƒ}| jd r›| ||¡}|jg |¢|jd ‘R Ž }nd }t| ||	||j|j| jd ƒ}|||d d d fW  d   ƒ S 1 s¾w   Y  d S )
Nr6   Fr7   r9   r   r<   r=   r:   )NN)r>   r?   r@   rD   Úsaved_tensorsrE   rF   Ú
contiguousrH   r%   rG   r'   r;   r,   Ú_compute_weight_gradr)   r*   r(   )rJ   ÚdoutrD   rK   rL   rM   rN   rO   rP   rQ   rV   rU   ÚpreactÚdpreactrS   Údweight2ÚdxÚdweight1r-   r-   r.   Úbackwards   sT   
ÿù

ù	$ÐzMLPRecomputeFunc.backwardN)r"   r#   r$   Ú__doc__r+   rZ   rd   r-   r-   r-   r.   r5   J   s    
r5   c                 C   sf   |sd S |  d|jd ¡}| jr|jd u stj ¡ r#||j|| jdS ||j||jƒ |j}d |_|S )Nr9   )Ú	out_dtype)	rF   rE   rC   Úgradr>   ÚcompilerÚis_compilingrG   rB   )rJ   r^   rK   Ú	weight_ogÚ	matmul_fnÚmatmul_inplace_fnÚ
needs_gradÚdweightr-   r-   r.   r]   ¨   s   r]   FTr;   c                 C   s’   |r|t v }|r|rtnt}n|rtnt}t | |||||¡S |t v }|r'tnt}	|r-t	nt
}
|	| ||t ¡ ||d\}}|
||||||d}|S )N)Ústore_preactrC   r1   )r;   rC   r1   )r   r3   r4   r   r/   r5   Úapplyr	   r   r
   r   r>   Úis_grad_enabled)rK   rL   rM   r;   rC   r1   Ú	recomputeÚgatedrD   Úfc1_fnÚfc2_fnr_   rS   rT   r-   r-   r.   Úmlp_funcµ   s4   
úúrv   c                	       sX   e Zd Z											ddededed	ef‡ fd
d„Zdedefdd„Z‡  ZS )ÚMLPNFr   r<   Tr;   rC   r1   rr   c                    sê   ||	dœ}t ƒ  ¡  |d ur|n|}|| _|tv | _|d u r+| jr'td| ƒnd| }|dkr9|| d | | }| jr@d| n|}tj||fd|i|¤Ž| _| jr]dd„ d	d„ f| jj	_
tj||fd|i|¤Ž| _|
| _|| _|| _d S )
N)ÚdevicerA   gUUUUUU@é   r<   r=   Úbiasc                 S   s   t | dddS )Nz(d two) e -> two d er=   )Útwor   ©Úwr-   r-   r.   Ú<lambda>ò   s    zMLP.__init__.<locals>.<lambda>c                 S   s
   t | dƒS )Nztwo d e -> (d two) er   r|   r-   r-   r.   r~   ó   s   
 )ÚsuperÚ__init__r;   r   rs   ÚintÚnnÚLinearÚfc1ÚweightÚ_muon_reshape_functionsÚfc2rC   r1   rr   )ÚselfÚin_featuresÚhidden_featuresÚout_featuresÚbias1Úbias2r;   Úmultiple_ofrx   rA   rC   r1   rr   Úfactory_kwargsÚfc1_out©Ú	__class__r-   r.   r€   Ö   s&   



þ
zMLP.__init__ÚinputÚreturnc              	   C   sâ   | j jd u rF| jjd u rF|jrF| d¡dkrF| j jd dkrF| j j| jr&dnd dkrF| jjd dkrFt|| j j	| jj	| j
| j| j| jdS |   |¡}| jret| j
 |dd d d…f |ddd d…f ƒ}nt| j
 |ƒ}|  |¡S )	Nr9   r<   é   r   é   )r;   rC   r1   rr   .r=   )r„   rz   r‡   Úis_cudaÚstrider‰   r‹   rs   rv   r…   r;   rC   r1   rr   r   r   )rˆ   r“   Úyr-   r-   r.   rZ   ú   s,   ÿù

.
zMLP.forward)NNFFr   r<   NNFTF)	r"   r#   r$   Ú
ActivationÚboolr€   r   rZ   Ú__classcell__r-   r-   r‘   r.   rw   Õ   s,    óùõôó$rw   )FTF)*Útypingr   Ú	functoolsr   r>   Útorch.nnr‚   r   Úeinopsr   Úquack.linearr   r   r	   r
   r   r   r   Úquack.activationr   Úquack.gemm_interfacer   r   r   r   r   r   r   r   rš   r   r/   r3   r4   ÚautogradÚFunctionr5   r]   Ústrrv   ÚModulerw   r-   r-   r-   r.   Ú<module>   s4   (ÿ

^ÿ
ÿ