# Reconstructed source for quack/linear.py: this file was the compiled module rendered
# as text, so the readable source had been lost. Imports, class/function names,
# signatures and docstrings below come from what the compiled module preserves; the
# function bodies are a best-effort reconstruction, and details flagged as assumptions
# in comments may differ from the shipped implementation.
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.amp import custom_fwd, custom_bwd

from quack.gemm_interface import gemm, gemm_add_inplace, gemm_act, gemm_dact


def linear_fwd_convert_type(*tensors):
    # Cast the GEMM operands to the autocast dtype when autocast is active.
    autocast_dtype = torch.get_autocast_dtype("cuda")
    if torch.is_autocast_enabled():
        tensors = tuple(t.to(dtype=autocast_dtype) for t in tensors)
    return tensors


def linear_fwd_postprocess(ctx, x, weight, weight_og, needs_x_w_grad):
    # Save only the tensors the backward pass will actually need.
    needs_input_grad, needs_weight_grad = needs_x_w_grad
    if not needs_weight_grad:
        x, weight_og = None, None
    if not needs_input_grad:
        weight = None
    ctx.save_for_backward(x, weight, weight_og if ctx.fuse_grad_accum else None)


def linear_bwd_compute_input_grad(ctx, dout, weight, matmul_fn):
    if ctx.needs_input_grad[0]:
        assert weight is not None
        return matmul_fn(dout, weight)
    else:
        return None


def linear_bwd_compute_weight_grad(ctx, dout, x, weight_og, matmul_fn, matmul_inplace_fn):
    if ctx.needs_input_grad[1]:
        assert x is not None
        dout = dout.reshape(-1, dout.shape[-1])
        if not ctx.fuse_grad_accum or weight_og.grad is None or torch.compiler.is_compiling():
            dweight = matmul_fn(dout.T, x, out_dtype=ctx.weight_dtype)
        else:
            # fuse_grad_accum: accumulate straight into the existing .grad buffer, hand
            # that buffer back to autograd, and clear .grad so it is not counted twice.
            matmul_inplace_fn(dout.T, x, weight_og.grad)
            dweight = weight_og.grad
            weight_og.grad = None
    else:
        dweight = None
    return dweight


class LinearFunc(torch.autograd.Function):
    matmul_fwd_fn = gemm
    matmul_bwd_dx = partial(gemm, dynamic_scheduler=True)
    matmul_bwd_dw = partial(gemm, dynamic_scheduler=True)
    matmul_bwd_dw_inplace = partial(gemm_add_inplace, dynamic_scheduler=True)

    @classmethod
    @custom_fwd(device_type="cuda")
    def forward(cls, ctx, x, weight, bias=None, fuse_grad_accum=False):
        """
        x: (..., in_features)
        weight: (out_features, in_features)
        bias: (out_features,) or None
        out: (..., out_features)
        """
        ctx.weight_dtype = weight.dtype
        ctx.fuse_grad_accum = fuse_grad_accum
        weight_og = weight
        x, weight = linear_fwd_convert_type(x, weight)
        batch_shape = x.shape[:-1]
        x = x.reshape(-1, x.shape[-1])
        out = cls.matmul_fwd_fn(x, weight.T, bias=bias)
        linear_fwd_postprocess(ctx, x, weight, weight_og, ctx.needs_input_grad[:2])
        ctx.bias_dtype = bias.dtype if bias is not None else None
        return out.reshape(*batch_shape, out.shape[-1])

    @classmethod
    @custom_bwd(device_type="cuda")
    def backward(cls, ctx, dout, *args):
        """
        dout: (..., out_features)
        """
        x, weight, weight_og = ctx.saved_tensors
        batch_shape = dout.shape[:-1]
        dout = dout.reshape(-1, dout.shape[-1])
        dbias = (
            dout.sum(0, dtype=ctx.bias_dtype)
            if ctx.bias_dtype is not None and ctx.needs_input_grad[2]
            else None
        )
        dx = linear_bwd_compute_input_grad(ctx, dout, weight, cls.matmul_bwd_dx)
        dx = dx.reshape(*batch_shape, dx.shape[-1]) if dx is not None else None
        dweight = linear_bwd_compute_weight_grad(
            ctx, dout, x, weight_og, cls.matmul_bwd_dw, cls.matmul_bwd_dw_inplace
        )
        # Pad with Nones for the remaining (non-tensor) forward inputs.
        return dx, dweight, dbias, *([None] * (len(ctx.needs_input_grad) - 3))


class LinearUntunedFunc(LinearFunc):
    matmul_fwd_fn = partial(gemm, tuned=False)
    matmul_bwd_dx = partial(gemm, tuned=False, dynamic_scheduler=True)
    matmul_bwd_dw = partial(gemm, tuned=False, dynamic_scheduler=True)
    matmul_bwd_dw_inplace = partial(gemm_add_inplace, tuned=False)


def linear_func(x, weight, bias=None, fuse_grad_accum=False, tuned=True):
    fn_cls = LinearFunc if tuned else LinearUntunedFunc
    return fn_cls.apply(x, weight, bias, fuse_grad_accum)


class LinearActFunc(LinearFunc):
    matmul_fwd_fn = gemm_act

    @classmethod
    @custom_fwd(device_type="cuda")
    def forward(cls, ctx, x, weight, activation, bias=None, store_preact=True, fuse_grad_accum=False):
        """
        x: (..., in_features)
        weight: (out_features, in_features)
        bias: (out_features,) or None
        out: (..., out_features)
        Return both out and post-activation, but only out is differentiable.
        """
        ctx.weight_dtype = weight.dtype
        ctx.fuse_grad_accum = fuse_grad_accum
        weight_og = weight
        x, weight = linear_fwd_convert_type(x, weight)
        batch_shape = x.shape[:-1]
        x = x.reshape(-1, x.shape[-1])
        out, postact = cls.matmul_fwd_fn(
            x, weight.T, bias=bias, activation=activation, store_preact=store_preact
        )
        linear_fwd_postprocess(ctx, x, weight, weight_og, ctx.needs_input_grad[:2])
        # out is the pre-activation (may be None when store_preact=False); postact is
        # the activation output.
        out = out.reshape(*batch_shape, out.shape[-1]) if out is not None else None
        postact = postact.reshape(*batch_shape, postact.shape[-1])
        ctx.bias_dtype = bias.dtype if bias is not None else None
        ctx.mark_non_differentiable(postact)
        ctx.set_materialize_grads(False)
        return out, postact


class LinearActUntunedFunc(LinearActFunc):
    matmul_fwd_fn = partial(gemm_act, tuned=False)
    matmul_bwd_dx = partial(gemm, tuned=False, dynamic_scheduler=True)
    matmul_bwd_dw = partial(gemm, tuned=False, dynamic_scheduler=True)
    matmul_bwd_dw_inplace = partial(gemm_add_inplace, tuned=False)


def linear_act_func(
    x, weight, activation, bias=None, store_preact=True, fuse_grad_accum=False, tuned=True
):
    fn_cls = LinearActFunc if tuned else LinearActUntunedFunc
    return fn_cls.apply(x, weight, activation, bias, store_preact, fuse_grad_accum)


class DActLinearFunc(LinearFunc):
    matmul_bwd_dx = partial(gemm_dact, dynamic_scheduler=True)

    @classmethod
    @custom_fwd(device_type="cuda")
    def forward(cls, ctx, x, weight, preact, fuse_grad_accum=False, activation=None):
        """
        x: (..., in_features)
        weight: (out_features, in_features)
        out: (..., out_features)
        Takes in an extra preact argument which is the pre-activation, to be used in the backward pass.
        """
        ctx.weight_dtype = weight.dtype
        ctx.fuse_grad_accum = fuse_grad_accum
        weight_og = weight
        x, weight = linear_fwd_convert_type(x, weight)
        batch_shape = x.shape[:-1]
        x = x.reshape(-1, x.shape[-1])
        out = cls.matmul_fwd_fn(x, weight.T)
        # Reconstruction assumption: preact (rather than x) is saved, so the backward
        # pass can fuse the activation gradient into its GEMM and recompute x there.
        linear_fwd_postprocess(
            ctx, preact, weight, weight_og, (ctx.needs_input_grad[2], ctx.needs_input_grad[1])
        )
        ctx.activation = activation
        return out.reshape(*batch_shape, out.shape[-1])

    @classmethod
    @custom_bwd(device_type="cuda")
    def backward(cls, ctx, dout, *args):
        """
        dout: (..., out_features)
        """
        preact, weight, weight_og = ctx.saved_tensors
        batch_shape = dout.shape[:-1]
        dout = dout.reshape(-1, dout.shape[-1])
        preact = preact.reshape(-1, preact.shape[-1]) if preact is not None else None
        if ctx.needs_input_grad[2]:
            assert weight is not None
            # Reconstruction assumption: gemm_dact fuses the dgrad GEMM (dout @ weight)
            # with the activation backward evaluated at preact, returning the gradient
            # w.r.t. preact together with the recomputed activation output x.
            dpreact, x = cls.matmul_bwd_dx(dout, weight, preact, activation=ctx.activation)
        else:
            dpreact, x = None, None
        dpreact = (
            dpreact.reshape(*batch_shape, dpreact.shape[-1]) if dpreact is not None else None
        )
        dweight = linear_bwd_compute_weight_grad(
            ctx, dout, x, weight_og, cls.matmul_bwd_dw, cls.matmul_bwd_dw_inplace
        )
        return None, dweight, dpreact, *([None] * (len(ctx.needs_input_grad) - 3))


class DActLinearUntunedFunc(DActLinearFunc):
    matmul_fwd_fn = partial(gemm, tuned=False)
    matmul_bwd_dx = partial(gemm_dact, tuned=False, dynamic_scheduler=True)
    matmul_bwd_dw = partial(gemm, tuned=False, dynamic_scheduler=True)
    matmul_bwd_dw_inplace = partial(gemm_add_inplace, tuned=False)


def act_linear_func(x, weight, preact, fuse_grad_accum=False, activation=None, tuned=True):
    fn_cls = DActLinearFunc if tuned else DActLinearUntunedFunc
    return fn_cls.apply(x, weight, preact, fuse_grad_accum, activation)


class Linear(nn.Linear):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        device=None,
        dtype=None,
        fuse_grad_accum: bool = False,
    ) -> None:
        super().__init__(in_features, out_features, bias=bias, device=device, dtype=dtype)
        self.fuse_grad_accum = fuse_grad_accum

    def forward(self, input: Tensor) -> Tensor:
        # Use the quack GEMM path on CUDA when the feature dims are multiples of 8;
        # otherwise fall back to the stock F.linear.
        if input.is_cuda and self.in_features % 8 == 0 and self.out_features % 8 == 0:
            return linear_func(
                input, self.weight, self.bias, fuse_grad_accum=self.fuse_grad_accum
            )
        else:
            return F.linear(input, self.weight, self.bias)
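
# Usage sketch (illustrative, not part of the original module): on CUDA tensors whose
# feature dimensions are multiples of 8, Linear dispatches to linear_func (the custom
# GEMM autograd function above); otherwise it falls back to F.linear. The tensor sizes
# and tolerances below are arbitrary example values.
def _linear_usage_example():
    x = torch.randn(4, 512, 1024, device="cuda", dtype=torch.bfloat16, requires_grad=True)
    layer = Linear(1024, 4096, bias=False, device="cuda", dtype=torch.bfloat16)
    out = layer(x)                # (4, 512, 4096), computed by the quack GEMM path
    out.sum().backward()          # dx / dweight go through LinearFunc.backward
    ref = F.linear(x.detach(), layer.weight.detach())
    torch.testing.assert_close(out.detach(), ref, rtol=1e-2, atol=1e-2)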
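
# MLP fusion sketch (assumption-based illustration, matching the reconstruction above,
# not a confirmed API): linear_act_func returns the pre-activation (the differentiable
# output) together with the activation output, and act_linear_func consumes both so its
# backward can fuse the activation gradient into the dgrad GEMM via gemm_dact. The
# activation name "gelu_tanh_approx" and the exact keyword names are assumptions.
def _fused_mlp_example(x, w1, w2, activation="gelu_tanh_approx"):
    preact, postact = linear_act_func(x, w1, activation, store_preact=True)
    out = act_linear_func(postact, w2, preact, activation=activation)
    # Gradients w.r.t. x and w1 flow back through the differentiable preact output.
    return out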