from dataclasses import dataclass
from typing import Dict, Optional, Sequence, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn
from torch.amp import custom_bwd, custom_fwd

from .common import BaseOperator, get_xformers_operator, register_operator
from .unbind import stack_or_none, unbind

if torch.version.hip:
    # On ROCm the fused dual-GEMM kernel is not built, so the "cuda" dispatch
    # key is backed by this eager composition of the same computation.
    @torch.library.register_kernel("xformers::dual_gemm_silu_identity_mul", "cuda")
    def dual_gemm_silu_identity_mul_cuda(
        x: torch.Tensor,
        w1: torch.Tensor,
        b1: Optional[torch.Tensor],
        w2: torch.Tensor,
        b2: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        x1 = x @ w1.T
        if b1 is not None:
            x1 += b1
        x2 = x @ w2.T
        if b2 is not None:
            x2 += b2
        x4 = F.silu(x1) * x2
        return x1, x2, x4


@register_operator
class DualGemmSiluOp(BaseOperator):
    OPERATOR = get_xformers_operator("dual_gemm_silu_identity_mul")
    OPERATOR_CATEGORY = "swiglu"
    NAME = "dual_gemm_silu"


@register_operator
class GemmFusedSumOp(BaseOperator):
    OPERATOR = get_xformers_operator("gemm_fused_operand_sum")
    OPERATOR_CATEGORY = "swiglu"
    NAME = "gemm_fused_operand_sum"


class _SwiGLUDecomposedFunc(torch.autograd.Function):
    """
    This is just an example implementation with all
    operations explicited. This implementation is worse
    than pytorch, because pytorch is able to fuse some operations
    (eg the linear forward ...) that are decomposed here.

    The time measurements were made on the ViT-Giant setting:
    - A100/f16
    - input: [4440, 1536]
    - hidden: [4440, 4096]
    """

    NAME = "decomposed"
    FORCE_BW_F32 = False

    @staticmethod
    def _silu_backward(dy: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        # d/dx silu(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
        sigm = 1 / (1 + torch.exp(-x.float()))
        return (dy.float() * sigm * (1 + x.float() * (1 - sigm))).to(x.dtype)

    @classmethod
    def forward(cls, ctx, x, w1, b1, w2, b2, w3, b3):
        x1 = x @ w1.transpose(-2, -1) + b1
        x2 = x @ w2.transpose(-2, -1) + b2
        x3 = F.silu(x1)
        x4 = x3 * x2
        x5 = x4 @ w3.transpose(-2, -1) + b3
        ctx.save_for_backward(x, w1, b1, w2, b2, w3, b3, x1, x2, x3, x4)
        return x5

    @classmethod
    def backward(cls, ctx, dx5):
        saved_tensors = ctx.saved_tensors
        if cls.FORCE_BW_F32:
            dx5 = dx5.float()
            saved_tensors = [t.float() for t in saved_tensors]
        x, w1, b1, w2, b2, w3, b3, x1, x2, x3, x4 = saved_tensors
        dx4 = dx5 @ w3
        dw3 = dx5.transpose(-2, -1) @ x4
        db3 = dx5.sum(0)
        dx3 = dx4 * x2
        dx2 = dx4 * x3
        dx1 = cls._silu_backward(dx3, x1)
        dx = dx2 @ w2
        dw2 = dx2.transpose(-2, -1) @ x
        db2 = dx2.sum(0)
        dx += dx1 @ w1
        dw1 = dx1.transpose(-2, -1) @ x
        db1 = dx1.sum(0)
        return (dx, dw1, db1, dw2, db2, dw3, db3)
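# Sanity-check sketch (not from the original module): the closed-form
# `_silu_backward` above should agree with PyTorch autograd. Shapes and the
# tolerance below are illustrative assumptions.
#
#   x = torch.randn(16, 8, requires_grad=True)
#   dy = torch.randn_like(x)
#   (ref,) = torch.autograd.grad(F.silu(x), x, grad_outputs=dy)
#   hand = _SwiGLUDecomposedFunc._silu_backward(dy, x.detach())
#   assert torch.allclose(hand, ref, atol=1e-6)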
class _SwiGLUFusedFunc(torch.autograd.Function):
    NAME = "fused.py"

    @classmethod
    @custom_fwd(device_type="cuda")
    def forward(cls, ctx, x, w1, b1, w2, b2, w3, b3):
        # One kernel computes both input projections and silu(x1) * x2
        x1, x2, x4 = DualGemmSiluOp.OPERATOR(x, w1, b1, w2, b2)

        x5 = F.linear(x4, w3, b3)
        ctx.save_for_backward(x, w1, w2, w3, x1, x2)
        ctx.bias = [b1 is not None, b2 is not None, b3 is not None]
        return x5
    @staticmethod
    def _linear_bw(
        dy: torch.Tensor, x: torch.Tensor, bias: bool
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        if not bias:
            return (dy.transpose(-2, -1) @ x), None
        # Fused kernel computing dw = dy^T @ x and db = dy.sum(0) in one pass
        dw, db = GemmFusedSumOp.OPERATOR(dy.transpose(-2, -1), x)
        return dw, db

    @classmethod
    @custom_bwd(device_type="cuda")
    def backward(cls, ctx, dx5):
        x, w1, w2, w3, x1, x2 = ctx.saved_tensors
        w1w2 = stack_or_none([w1, w2], dim=0)

        dx4 = dx5 @ w3
        dx1dx2, x4 = torch.ops.xformers.silu_bw_fused(x1, x2, dx4)
        dx1, dx2 = dx1dx2.unbind(1)
        del x1, x2, dx4

        dw3, db3 = cls._linear_bw(dx5, x4, bias=ctx.bias[2])
        del x4, dx5
        if w1w2 is not None:
            assert dx1dx2.is_contiguous()
            assert w1w2.is_contiguous()

            w1w2 = w1w2.view([w1.shape[0] * 2, w1.shape[1]])
            dx = dx1dx2.view([dx1.shape[0], 2 * dx1.shape[1]]) @ w1w2

            # backward of linear1 + linear2 - packed case
            dw1dw2, db1db2 = cls._linear_bw(
                dx1dx2.view([dx1.shape[0], 2 * dx1.shape[1]]), x, bias=ctx.bias[0]
            )
            dw1, dw2 = dw1dw2.view([2, *w1.shape]).unbind(0)
            if ctx.bias[0]:
                db1, db2 = torch.unbind(db1db2.view([2, dx1.shape[1]]), dim=0)
            else:
                db1 = db2 = None
        else:
            dx = dx2 @ w2
            # dx += dx1 @ w1, accumulated in-place
            torch.addmm(dx, dx1, w1.to(dx1.dtype), beta=1, alpha=1, out=dx)
            dw1, db1 = cls._linear_bw(dx1, x, bias=ctx.bias[0])
            dw2, db2 = cls._linear_bw(dx2, x, bias=ctx.bias[1])
        return (dx, dw1, db1, dw2, db2, dw3, db3)
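# Equivalence behind the packed branch above (sketch, illustrative shapes):
# when w1/w2 live in one storage, the two input-gradient GEMMs collapse into
# a single GEMM against the stacked weight:
#
#   dx1 @ w1 + dx2 @ w2
#       == torch.cat([dx1, dx2], dim=-1) @ torch.cat([w1, w2], dim=0)
#
# with dx1, dx2 of shape [batch, hidden] and w1, w2 of shape [hidden, in].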
class SwiGLUOp:
    """Base class for any swiglu operator in :attr:`xformers.ops.swiglu`"""

    def __init__(self, op, packed_weights: bool, name: str, constraints):
        self.NAME = name
        self.PACKED_WEIGHTS = packed_weights
        self.op = op
        self.constraints = constraints

    def supports(self, op: "SwiGLUOpDispatch") -> bool:
        if self.PACKED_WEIGHTS and not op.packed_weights:
            return False
        return all(c(op) for c in self.constraints)

    def __call__(self, *args: Optional[torch.Tensor]) -> torch.Tensor:
        raise NotImplementedError()

    def __str__(self) -> str:
        return f"SwiGLUOp:{self.NAME}"


class _ForwardToPythonAutogradFunc(SwiGLUOp):
    def supports(self, op: "SwiGLUOpDispatch") -> bool:
        return super().supports(op)

    def __call__(self, *args, **kwargs):
        return self.op.apply(*args, **kwargs)


class _ForwardToFunc(SwiGLUOp):
    def __call__(self, *args, **kwargs):
        return self.op(*args, **kwargs)

    def info(self) -> str:
        if self.op.__name__ == "no_such_operator":
            return "not built"
        return "available"
def _eager_functional_swiglu(
    x: torch.Tensor,
    w1: torch.Tensor,
    b1: Optional[torch.Tensor],
    w2: torch.Tensor,
    b2: Optional[torch.Tensor],
    w3: torch.Tensor,
    b3: Optional[torch.Tensor],
) -> torch.Tensor:
    x1 = F.linear(x, w1, b1)
    x2 = F.linear(x, w2, b2)
    hidden = F.silu(x1) * x2
    return F.linear(hidden, w3, b3)


@dataclass
class SwiGLUOpDispatch:
    """Dispatcher to automatically select
    the best operator in :attr:`xformers.ops.swiglu`
    """

    device: Union[torch.device, str]
    dtype: torch.dtype
    dtype_autocast_gpu: Optional[torch.dtype]
    packed_weights: bool
    bias_enabled: bool

    @property
    def op(self) -> SwiGLUOp:
        """Computes the best operator

        Returns:
            SwiGLUOp: The best operator for the configuration
        """
        priorities: Sequence[SwiGLUOp] = [
            SwiGLUPackedFusedOp,
            SwiGLUFusedOp,
        ]
        for op in priorities:
            if op.supports(self):
                return op
        return SwiGLUEagerOp

    @staticmethod
    def from_arguments(
        x: torch.Tensor,
        w1: torch.Tensor,
        b1: Optional[torch.Tensor],
        w2: torch.Tensor,
        b2: Optional[torch.Tensor],
        w3: torch.Tensor,
        b3: Optional[torch.Tensor],
    ) -> "SwiGLUOpDispatch":
        return SwiGLUOpDispatch(
            device=x.device,
            dtype=x.dtype,
            packed_weights=stack_or_none((w1, w2), dim=0) is not None,
            dtype_autocast_gpu=(
                torch.get_autocast_gpu_dtype()
                if torch.is_autocast_enabled()
                else x.dtype
            ),
            bias_enabled=b1 is not None and b2 is not None and b3 is not None,
        )
def _only_sm80(op: SwiGLUOpDispatch) -> bool:
    device_type = op.device if isinstance(op.device, str) else op.device.type
    return device_type == "cuda" and torch.cuda.get_device_capability(op.device)[0] >= 8


def _only_half_or_autocast(op: SwiGLUOpDispatch) -> bool:
    HALF_DTYPES = [torch.half, torch.bfloat16]
    return op.dtype in HALF_DTYPES or (
        op.dtype_autocast_gpu is not None and op.dtype_autocast_gpu in HALF_DTYPES
    )


def _bias_enabled(op: SwiGLUOpDispatch) -> bool:
    return op.bias_enabled


# The decomposed reference op requires biases (its forward adds them
# unconditionally), hence the `_bias_enabled` constraint.
_SwiGLUDecomposedOp = _ForwardToPythonAutogradFunc(
    _SwiGLUDecomposedFunc, False, "decomposed", constraints=[_bias_enabled]
)
SwiGLUFusedOp = _ForwardToPythonAutogradFunc(
    _SwiGLUFusedFunc, False, "fused", constraints=[_only_sm80, _only_half_or_autocast]
)
SwiGLUPackedFusedOp = _ForwardToFunc(
    get_xformers_operator("swiglu_packed"),
    True,
    "fused.p.cpp",
    constraints=[_only_sm80, _only_half_or_autocast],
)
SwiGLUEagerOp = _ForwardToFunc(_eager_functional_swiglu, False, "eager", constraints=[])


def _info() -> Dict[str, str]:
    return {op.NAME: op.info() for op in [SwiGLUPackedFusedOp]}
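# Example (sketch): resolving an operator by hand. On a pre-A100 GPU or on
# CPU, or without half precision, both fused ops fail their constraints and
# the dispatcher falls back to SwiGLUEagerOp. The tensors are assumed to be
# valid swiglu arguments:
#
#   dispatch = SwiGLUOpDispatch.from_arguments(x, w1, b1, w2, b2, w3, b3)
#   print(dispatch.op)  # e.g. "SwiGLUOp:fused.p.cpp" or "SwiGLUOp:eager"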
def swiglu(
    x: torch.Tensor,
    w1: torch.Tensor,
    b1: Optional[torch.Tensor],
    w2: torch.Tensor,
    b2: Optional[torch.Tensor],
    w3: torch.Tensor,
    b3: Optional[torch.Tensor],
    *,
    op: Optional[SwiGLUOp] = None,
) -> torch.Tensor:
    """
    Computes a SwiGLU block given the weights/bias of the 3
    linear layers.

    - It is recommended to keep ``op=None`` so the best implementation \
        available for the inputs will be used.


    :Equivalent pytorch code:

    .. code-block:: python

        x1 = F.linear(x, w1, b1)
        x2 = F.linear(x, w2, b2)
        hidden = F.silu(x1) * x2
        return F.linear(hidden, w3, b3)

    :Packing weights:

    To allow faster implementations, it's recommended to have w1/w2 come from the same storage, as in:
        .. code-block:: python

            w1, w2 = xformers.ops.unbind(w12, 0)

    :Supported hardware:

    This operator is only optimized on A100+ on ``torch.half`` or ``torch.bfloat16`` \
        (autocast is supported), and will fallback to a functional pytorch \
        implementation otherwise.
    """
    batch_shape = x.shape[:-1]
    x = x.reshape([-1, x.shape[-1]])
    if w1.ndim != 2 or w1.shape != w2.shape:
        raise ValueError(f"Invalid shapes for w1: {w1.shape} / w2: {w2.shape}")
    if b1 is not None and (b1.ndim != 1 or b1.shape[0] != w1.shape[0]):
        raise ValueError(f"Invalid shapes for b1: {b1.shape}")
    if b2 is not None and (b2.ndim != 1 or b2.shape[0] != w2.shape[0]):
        raise ValueError(f"Invalid shapes for b2: {b2.shape}")
    if w3.ndim != 2 or w3.shape[1] != w2.shape[0]:
        raise ValueError(f"Invalid shape for w3: {w3.shape}")
    if b3 is not None and (b3.ndim != 1 or b3.shape[0] != w3.shape[0]):
        raise ValueError(f"Invalid shapes for w3: {w3.shape} / b3: {b3.shape}")
    if op is None:
        op = SwiGLUOpDispatch.from_arguments(x, w1, b1, w2, b2, w3, b3).op

    if not op.PACKED_WEIGHTS:
        return op(x, w1, b1, w2, b2, w3, b3).reshape([*batch_shape, -1])

    w1w2 = stack_or_none((w1, w2), dim=0)
    if b1 is not None and b2 is not None:
        b1b2: Optional[torch.Tensor] = stack_or_none((b1, b2), dim=0)
        if b1b2 is None:
            raise NotImplementedError("b1/b2 needs to be properly packed")
    else:
        b1b2 = None
        assert b1 is None and b2 is None
    if w1w2 is None:
        raise NotImplementedError("w1/w2 needs to be properly packed")
    return op(x, w1w2, b1b2, w3, b3).reshape([*batch_shape, -1])
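# Example (sketch, illustrative shapes): calling `swiglu` with w1/w2 unbound
# from a single storage, so the packed fast path stays eligible; on CPU the
# dispatcher still resolves to the eager fallback.
#
#   x = torch.randn(4, 16)
#   w1, w2 = unbind(torch.randn(2, 32, 16), dim=0)
#   w3 = torch.randn(16, 32)
#   y = swiglu(x, w1, None, w2, None, w3, None)  # shape [4, 16]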
def swiglu_packed(
    x: torch.Tensor,
    w1w2: torch.Tensor,
    b1b2: Optional[torch.Tensor],
    w3: torch.Tensor,
    b3: Optional[torch.Tensor],
    *,
    op: SwiGLUOp,
) -> torch.Tensor:
    """
    Computes a SwiGLU block given the weights/bias of the 3
    linear layers.

    :Equivalent pytorch code:

    .. code-block:: python

        x1 = F.linear(x, w1, b1)
        x2 = F.linear(x, w2, b2)
        hidden = F.silu(x1) * x2
        return F.linear(hidden, w3, b3)

    :Supported hardware:

    This operator is only optimized on A100+ on ``torch.half`` or ``torch.bfloat16`` \
        (autocast is supported), and will fallback to a functional pytorch \
        implementation otherwise.
    """
    batch_shape = x.shape[:-1]
    x = x.reshape([-1, x.shape[-1]])

    if b3 is not None and (b3.ndim != 1 or b3.shape[0] != w3.shape[0]):
        raise ValueError(f"Invalid shapes for w3: {w3.shape} / b3: {b3.shape}")
    assert op.PACKED_WEIGHTS, "Not implemented PACKED_WEIGHTS"

    return op(x, w1w2, b1b2, w3, b3).reshape([*batch_shape, -1])
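# Example (sketch): `swiglu_packed` takes w1/w2 pre-stacked along dim 0 (and
# b1/b2 likewise), which is the layout `SwiGLU._packed_ordered_params` below
# derives from its single `w12` linear layer. It requires an op with
# PACKED_WEIGHTS, e.g. SwiGLUPackedFusedOp when the extension is built:
#
#   x = torch.randn(4, 16, dtype=torch.half, device="cuda")
#   w1w2 = torch.randn(2, 32, 16, dtype=torch.half, device="cuda")
#   w3 = torch.randn(16, 32, dtype=torch.half, device="cuda")
#   y = swiglu_packed(x, w1w2, None, w3, None, op=SwiGLUPackedFusedOp)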
df fddZdej	d
ej	fddZ
d
eej	eej	 ej	eej	 ej	eej	 f fddZd
eej	eej	 ej	eej	 f fddZ  ZS )SwiGLUz
    A Module that encapsulates the call to :attr:`xformers.ops.swiglu`,
    and holds the weights for the 3 linear layers
    NT)_pack_weightsin_featureshidden_featuresout_featuresr^   r   r   c                   s   t    |p|}|p|}|  |rtj|d| |d| _nd| _tj|||d| _tj|||d| _tj|||d| _|| _|| _	|| _
d| _dS )aq  Create a SwiGLU module

        Args:
            in_features (int): Number of features of the input
            hidden_features (int): Number of hidden features
            out_features (Optional[int], optional): Number of features of the input. Defaults to None.
            bias (bool, optional): Whether linear layers also include a bias. Defaults to True.
        rd   re   N)r   r~   r   Linearw12r   r   rA   r   r   r   r{   )r}   r   r   r   r^   r   r   r   r    r~     s   

zSwiGLU.__init__r   c                 C   s\   | j dur | jdur | jjsJ dt|g|  R d| jiS t|g|  R d| jiS )zComputes :attr:`swiglu` with the module's weights

        Args:
            x (torch.Tensor): A Tensor of shape ``[..., in_features]``

        Returns:
            torch.Tensor: A Tensor of shape ``[..., out_features]``
        Nz5_pack_weights and self.op.PACKED_WEIGHTS should matchr{   )r   r{   rz   r   _packed_ordered_paramsr$   _ordered_params)r}   r   r   r   r    rE     s   
	
zSwiGLU.forwardc                 C   s   | j dur?| j j}| j j}t|d|jd d |jd gdd\}}|dur:t|d|jd d gdd\}}nd\}}n| jj| jj}}| jj| jj}}||||| jj| jjfS )z:Used for testing - returns ordered arguments for operatorsNrd   r   r   rb   )NN)	r   weightr^   r   rm   rn   r   r   rA   )r}   rp   r   r   r   r   r   r   r   r    r     s&   

&
zSwiGLU._ordered_paramsc                 C   s|   | j d us	J d	 | j j}| j j}|d|jd d |jd g}d }|d ur4|d|jd d g}||| jj| jjfS )Nz0Packed weights are only available when using w12rd   r   r   )r   r   r^   rm   rn   rA   )r}   rp   
b1b2_paramr   r   r   r    r     s    zSwiGLU._packed_ordered_params)NT)r'   r(   r)   rY   intr   rv   r~   r2   ru   rE   r   r   r   r   r   r   r   r    r     sP    	#
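if __name__ == "__main__":
    # Smoke-test sketch (not part of the original module): on CPU this goes
    # through the eager fallback; on A100+ with fp16 inputs the fused paths
    # are exercised instead.
    m = SwiGLU(in_features=16, hidden_features=32)
    y = m(torch.randn(4, 16))
    print(y.shape)  # torch.Size([4, 16])
    print(_info())  # build status of the compiled packed operator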