o
    Tix                     @   st  d dl Z d dlmZ d dl mZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZ d d	lmZmZ d d
lmZmZmZmZmZ ddlmZmZmZ d dlmZ d dl m!Z! d dlm"Z" g dZ#ej$a%dZ&dZ'dd Z(dd Z)dHddZ*G dd de j+j,Z-G dd de j+j,Z.G dd de j+j,Z/G d d! d!ej0eZ1d"d# Z2G d$d% d%Z3G d&d' d'e1Z4G d(d) d)e1Z5G d*d+ d+Z6G d,d- d-e5Z7G d.d/ d/e5Z8G d0d1 d1e4Z9G d2d3 d3e5Z:G d4d5 d5e5Z;G d6d7 d7e4Z<G d8d9 d9e4Z=G d:d; d;ej0Z>G d<d= d=e>Z?G d>d? d?e>Z@G d@dA dAej0ZAG dBdC dCej0ZBG dDdE dEeBZCG dFdG dGej0ZDdS )I    N)comm)nn)
functional)	Parameter)get_accelerator)get_shard_sizeget_shard_size_list)is_zero_param)ABCabstractmethod)IterableAnyOptionalListTuple   )shard_value_with_share_qkshard_chunk_mlpprepare_tp_fused_qkvw)AUTOTP_MODE)deepcopy)Union)
TensorParallel_LayerLinearAllreduceLinearLayerLmHeadLinearAllreduceYuan_LinearAllreduceYuan_LinearLayerGateUpPack_LinearLayerConv_LinearALlreducefused_LinearLayerconv_LinearLayerds_is_replaced_moduletensor_model_parallelc                   C   s   t S N)DEEPSPEED_AUTOTP_MODE r&   r&   R/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/layers.pyget_auto_tp_mode   s   r(   c                   C   
   t tjkS r$   r%   r   TRAININGr&   r&   r&   r'   is_autotp_training_mode$      
r,   Fc                 C   s   | rt jadS t jadS )zB
    Set the DEEPSPEED_AUTOTP_MODE based on the training flag
    N)r   r+   r%   	INFERENCE)trainingr&   r&   r'   set_autotp_mode)   s   

r0   c                   @   sl   e Zd ZdZedd Zededejde	j
dede	j
f
d	d
Zedede	j
dede	j
df fddZdS )RowParallelzI
    A custom autograd function for performing row-wise parallelism.
    c                 C   s   |S zSymbolic function for tracing.r&   graphinputr&   r&   r'   symbolic9   s   zRowParallel.symbolicctxgroupr5   is_inference_modereturnc                 C   s>   || _ |dkr	|S |rtj||d |S tj| |d |S )
        Forward pass.
        Nr8   )r8   distinference_all_reduce
all_reduce
contiguous)r7   r8   r5   r9   r&   r&   r'   forward>   s   zRowParallel.forwardgrad_outputNc                 C   s
   d|dfS ) 
        Backward pass.
        Nr&   r7   rB   r&   r&   r'   backwardL   s   
zRowParallel.backward)__name__
__module____qualname____doc__staticmethodr6   r   r=   ProcessGrouptorchTensorboolrA   r   rE   r&   r&   r&   r'   r1   4   s    
&*r1   c                	   @   sV   e Zd ZededejdejdejfddZ	ededejde
dejf fd	d
ZdS )AsyncColumnParallelr7   r8   r5   r:   c                 C   sD   |du| _ || _t||dd}|dur||7 }| || |S )r;   N)use_biasr8   rL   matmul	transposesave_for_backward)r7   r8   r5   weightbiasoutputr&   r&   r'   rA   V   s   
zAsyncColumnParallel.forwardrB   Nc                 C   s|   | j \}}||}tj| | jdd}|d|jd  |d|jd }| j	r2|
dnd }|  d |||fS )NT)r8   async_oprP   r   )saved_tensorsrS   r=   r?   r@   r8   viewshapetrR   sumwait)r7   rB   r5   rV   
grad_inputhandlegrad_weight	grad_biasr&   r&   r'   rE   e   s   

*zAsyncColumnParallel.backward)rF   rG   rH   rJ   r   r=   rK   rL   rM   rA   r   rE   r&   r&   r&   r'   rO   T   s
    "(rO   c                	   @   sf   e Zd ZdZedd Zededejde	j
de	j
fdd	Zeded
e	j
dede	j
f fddZdS )ColumnParallelz?
    Custom autograd function for column-wise parallelism.
    c                 C   s   t | t  S r2   )r=   r?   r@   get_tensor_model_parallel_groupr3   r&   r&   r'   r6   v   s   zColumnParallel.symbolicr7   r8   r5   r:   c                 C   s
   || _ |S )r;   r<   )r7   r8   r5   r&   r&   r'   rA   {   s   zColumnParallel.forwardrB   Nc                 C   s.   | j dkr	d|fS tj| | j d d|fS )rC   Nr<   )r8   r=   r?   r@   rD   r&   r&   r'   rE      s   
zColumnParallel.backward)rF   rG   rH   rI   rJ   r6   r   r=   rK   rL   rM   rA   r   rE   r&   r&   r&   r'   rd   q   s    
"(rd   c                       s   e Zd ZU dZdZeed< dZeed< 	 dee	j
 def fddZed	efd
dZedd Zedd Zedeej fddZdd Zdd Zdd Zdd Zdd Z  ZS )r   a  
    A base class for model layers with  tensor parallelism support.
    This class is designed to be extended by specific layers that require distributed
    operations and parameter gather/partitioning during inference or training.

    Attributes:
        mode (str): The mode of operation[INFERENCE or TRAINING], default is "INFERENCE".
        mp_group (Optional[dist.ProcessGroup]): The process group used for model parallelism.
        tp_world_size (int): The world size of tensor parallelism, i.e., the number of parallel workers.
        tp_index (int): The rank (ID) of the current worker in tensor parallelism.
        support_training (bool): Flag indicating whether the layer supports training (default: False).
        name (Optional[str]): The name of the layer, if provided.
    Fkeep_module_on_hosttp_overlap_commmp_groupkwargsc                    sx   t    d| _|dur$|| _t| j| _t|| _| j| _	| j| _
t| dd| _|ddur:|d| _dS dS )a2  
        Initializes the TensorParallel_Layer with optional model parallelism group and layer name.

        Args:
            mp_group (Optional[dist.ProcessGroup]): The process group for model parallelism.
                                                    If None, no model parallelism is set.
        FNname)super__init__support_trainingrh   r=   get_world_sizetp_world_sizeget_ranktp_index
world_sizerankgetattrrj   get)selfrh   ri   	__class__r&   r'   rl      s   
zTensorParallel_Layer.__init__valuec                 C   s
   || _ dS )z
        Set the static variable keep_module_on_host.

        Args:
            value (bool): The new value for keep_module_on_host.
        N)rf   )clsry   r&   r&   r'   set_keep_module_on_host   s   
z,TensorParallel_Layer.set_keep_module_on_hostc                 C      dS )zm
        Forward pass method. Must be implemented by subclasses to define layer-specific operations.
        Nr&   rv   r5   r&   r&   r'   rA         zTensorParallel_Layer.forwardc                 C   r|   )z
        Gathers parameters across devices for distributed training. Must be implemented by subclasses in "TRAINING" mode.
        Nr&   rv   params_listr&   r&   r'   gather_params   r~   z"TensorParallel_Layer.gather_paramsr   c                 C   r|   )z
        Partitions the parameters for tensor parallelism.
        It is necessary to ensure that this function only involves the logic of params partitioning.
        Nr&   r   r&   r&   r'   _tp_partition   s   z"TensorParallel_Layer._tp_partitionc                 C   sn   |   r| jsJ d|dur5|   r|jdu rd|_nd|_t|td t|td | j|_| j|_dS dS )a  
        Configures the weight tensor for training with tensor parallelism. This includes enabling gradients
        and associating necessary methods for parameter gathering and partitioning.

        Args:
            weight (Optional[torch.Tensor]): The weight tensor to configure for tensor parallelism.
                                              If None, no action is taken.
        zNo implementation of backward.NTF)is_training_moderm   requires_gradsetattrDS_TENSOR_MODEL_PARALLELDS_IS_REPLACED_MODULEr   r   )rv   rV   r&   r&   r'   config_tp_params   s   

z%TensorParallel_Layer.config_tp_paramsc                 C   r)   r$   r*   rv   r&   r&   r'   r      r-   z%TensorParallel_Layer.is_training_modec                 C   sZ   | j }||}t|  D ]\}}|dkr| j|_qt||t|| q||t| < |S )Nrh   )rx   __new__varsitemsrh   r   r   id)rv   memorz   new_objkeyry   r&   r&   r'   __deepcopy__   s   

z!TensorParallel_Layer.__deepcopy__c                 C   sl   d\}}| j d ur t| j r| j jdd  n| j jdd  \}}| j d ur)| j jnd }d||| jd u|S )N)NNrQ   z2in_features={}, out_features={}, bias={}, dtype={})rV   r	   ds_shaper\   dtypeformatrW   )rv   out_featuresin_featuresr   r&   r&   r'   
extra_repr	  s   
zTensorParallel_Layer.extra_reprc                 C   sR   |j r|S | jjrdnt  }| jj }|j||d}|r'tjd|jd|_	|S )Ncpu)copyr   )device)
is_metarx   rf   r   current_device_nametorL   emptyr   data)rv   tensorr   return_new_copycloned_tensorr&   r&   r'   move  s   
zTensorParallel_Layer.move)rF   rG   rH   rI   rf   rN   __annotations__rg   r   r=   rK   r   rl   classmethodr{   r   rA   r   r   rL   rM   r   r   r   r   r   r   __classcell__r&   r&   rw   r'   r      s&   
 	

	r   c                 C   s0   dg}|D ]}t | |rtt|t| | qd S )Nrg   )hasattrr   r   rt   )configruntime_keysr   r&   r&   r'   !configure_tensor_parallel_runtime,  s   
r   c                   @   sf   e Zd ZdZ	ddeeej ejf dejj	de
fddZdejd	e
fd
dZdddZdddZdS )GatherReplacedLayerParamsz
    A context manager for gathering parameters of a replaced layer, enabling partitioning and gathering functionality
    based on the configuration of the model.
    Tparamsmoduleenabledc                    sb   | _ | _|s
dS t|trt|tjst| _n|g _t fdd|D s/d _ dS dS )a  
        Initialize the context manager to handle parameter gathering and partitioning for a replaced layer.

        Args:
            params (Iterable or torch.Tensor): A collection or single parameter to manage.
            module (torch.nn.Module): The module that these parameters belong to.
            enabled (bool): Flag indicating whether the parameter management is enabled (default: True).
        Nc                 3   s    | ]}  |V  qd S r$   )_is_replaced_module_weight).0pr   r&   r'   	<genexpr>Q  s    z5GatherReplacedLayerParams.__init__.<locals>.<genexpr>F)	r   r   
isinstancer   rL   rM   listr   any)rv   r   r   r   r&   r   r'   rl   9  s   z"GatherReplacedLayerParams.__init__paramr:   c                 C   s   t |tdS )a  
        Helper function to determine if a parameter belongs to a replaced module.

        Args:
            param (torch.Tensor): The parameter to check.

        Returns:
            bool: True if the parameter belongs to a replaced module, False otherwise.
        F)rt   r   )rv   r   r&   r&   r'   r   U  s   
z4GatherReplacedLayerParams._is_replaced_module_weightNc                 C       | j r| jd | j dS dS )zc
        Enter the context manager. If enabled, gather parameters for the replaced module.
        r   N)r   r   r   r   r&   r&   r'   	__enter__a  s   z#GatherReplacedLayerParams.__enter__c                 C   r   )zi
        Exit the context manager. If enabled, partition the parameters for the replaced module.
        r   N)r   r   r   )rv   exc_type	exc_value	tracebackr&   r&   r'   __exit__h  s   z"GatherReplacedLayerParams.__exit__)T)r:   N)rF   rG   rH   rI   r   r   rL   rM   r   ModulerN   rl   r   r   r   r&   r&   r&   r'   r   3  s    

r   c                       sL   e Zd Z fddZdd Ze dd Ze dd Zd	d
 Z	  Z
S )r   c                    sj   t t| j|fi | |j| _|j| _| | j| jg d| _| | j | jd ur3| | j d S d S NT)rk   r   rl   rV   rW   r   rm   r   rv   r   rh   ri   rw   r&   r'   rl   s  s   
zLinearAllreduce.__init__c                 C   sD   t || jdd}t| j||   }| jd ur || j7 }|S )NrP   rQ   )	rL   rS   rV   rT   r1   applyrh   r   rW   rv   r5   rX   r&   r&   r'   rA   ~  s
   

zLinearAllreduce.forwardc                 C   s   t |D ]E\}}|d u s|dkr d S |j|| _|dd }tj| j|jd  |jd |j	|j
d}tj||| jd |dd || _qd S Nr   r   r   r   r<   )	enumerater   data_partitionrT   r@   rL   r   ro   r\   r   r   r=   all_gather_into_tensorrh   rv   r   idxr   output_paramr&   r&   r'   r     s   zLinearAllreduce.gather_paramsc                 C   sr   |   s| | d S t|D ]'\}}|d u s|dkr d S tj|| jdd| j }| | }||| _	qd S )Nr   rP   dim
r   uneven_partitionr   rL   chunkro   rq   r   detachr   rv   r   r   r   
_partitionr&   r&   r'   r     s   
zLinearAllreduce._tp_partitionc                 C   s   t |D ];\}}|d u s|dkr d S | jd usJ d|| jt|| jd | j| jdd| j }| | }||| _	qd S )Nr   7The module name must be provided in the initialization.r   r   
r   rj   splitr   r\   ro   rq   r   r   r   r   r&   r&   r'   r     s   z LinearAllreduce.uneven_partition)rF   rG   rH   rl   rA   rL   no_gradr   r   r   r   r&   r&   rw   r'   r   q  s    

r   c                       sf   e Zd Zd fdd	Zdd Ze dd Ze d	d
 Zdd Z	e
dejddfddZ  ZS )r   NFc                    sn   t t| j|fi | |j| _|j| _|s| | j| jg d| _| | j | jd ur5| | j d S d S r   )rk   r   rl   rV   rW   r   rm   r   rv   r   rh   skip_partitionri   rw   r&   r'   rl     s   
zLinearLayer.__init__c                 C   sn   | j js*t| dd d urt| j|}t|| j	dd}| j
d ur(|| j
7 }|S t| j|| j| j
}|S )Nrh   rP   rQ   )rx   rg   rt   rd   r   rh   rL   rS   rV   rT   rW   rO   r   r&   r&   r'   rA     s   

zLinearLayer.forwardc                 C   sj   t |D ].\}}|j|| _tj| j|jd  |jd |j|jd}t	j
||| jd | || _qd S r   )r   r   r   rL   r   ro   r\   r   r   r=   r   rh   r@   r   r&   r&   r'   r     s   zLinearLayer.gather_paramsc                 C   sj   |   s| | d S t|D ]#\}}|d u r d S tj|| jdd| j }| | }||| _	qd S )Nr   r   r   r   r&   r&   r'   r     s   
zLinearLayer._tp_partitionc                 C   s|   t |D ]7\}}|d u r d S | jd usJ d|| jt|| jd | j| jdd| j }| | }||| _	qd S )Nr   r   r   r   r   r&   r&   r'   r     s   zLinearLayer.uneven_partitionc                 C   sz   |d ur%|j d }|j d }tj|||d ud}||j_|d ur$||j_n|d }|d }tj|||d ud}| |ddS )Nr   r   )rW   T)r   )r\   r   LinearrV   r   rW   )rz   weight_shaper   rV   rW   r   r   linearr&   r&   r'   from_weights  s   

zLinearLayer.from_weightsNF)rF   rG   rH   rl   rA   rL   r   r   r   r   r   halfr   r   r&   r&   rw   r'   r     s    

r   c                   @   s$   e Zd ZdejfddZdd ZdS )FusedModuleWrapperfused_modulec                 C   s
   || _ d S r$   r   )rv   r   r&   r&   r'   rl        
zFusedModuleWrapper.__init__c                 C   s   | j S r$   r   )rv   r   r&   r&   r'   __getattr__  s   zFusedModuleWrapper.__getattr__N)rF   rG   rH   r   r   rl   r   r&   r&   r&   r'   r     s    r   c                       s.   e Zd Zd fdd	Ze dd Z  ZS )r    Fc                    sB   | dd usJ dt| d| _t j|||fi | d S )Nr   z+'fused_module' is required but not provided)ru   r   r   rk   rl   r   rw   r&   r'   rl     s   zfused_LinearLayer.__init__c                 C   sR   t |D ]"\}}|d u r d S t| jj|| j| j}| | }||| _qd S r$   )	r   r   r   r   ro   rq   r   r   r   r   r&   r&   r'   r     s   zfused_LinearLayer._tp_partitionF)rF   rG   rH   rl   rL   r   r   r   r&   r&   rw   r'   r      s    r    c                   @      e Zd Ze dd ZdS )r!   c                 C   s   d }d }t |dkr|d }nt |dkr|d |d }}|jjt|jd | j| jdd| j }| |	 }||_|d ur`|jjt|jd | j| jdd| j }| |	 }||_d S d S )Nr   r      r   )
lenr   r   r   r\   ro   rj   rq   r   r   )rv   r   rV   rW   r   r&   r&   r'   r   -  s,   

zconv_LinearLayer._tp_partitionNrF   rG   rH   rL   r   r   r&   r&   r&   r'   r!   +      r!   c                   @   r   )r   c                 C   sF   t |d j|d | j| jd\}}||d _|d ur!||d _d S d S )Nr   r   F)r   r   rq   ro   rv   r   rV   rW   r&   r&   r'   r   F  s   
z"Yuan_LinearAllreduce._tp_partitionNr   r&   r&   r&   r'   r   C  s    r   c                   @   r   )r   c                 C   sZ   t |d j|d | j| jd\}}| | |d _|d ur+| | |d _d S d S )Nr   r   T)r   r   rq   ro   r   r   r   r&   r&   r'   r   Q  s   zYuan_LinearLayer._tp_partitionNr   r&   r&   r&   r'   r   O  r   r   c                   @   r   )r   c                 C   sX   t |d j|d | j| j\}}| | |d _|d ur*| | |d _d S d S )Nr   r   )r   r   rq   ro   r   r   r   r&   r&   r'   r   \  s
    z$GateUpPack_LinearLayer._tp_partitionNr   r&   r&   r&   r'   r   Z  r   r   c                   @   r   )r   c                 C   sv   t |D ]4\}}|d u r d S |jdd |_|jt|jd | j| jdd| j	 }| 
| }||| _qd S )NrP   rQ   r   r   r   )r   r   rT   r@   r   r   r\   ro   rj   rq   r   r   r   r&   r&   r'   r   f  s   z"Conv_LinearALlreduce._tp_partitionNr   r&   r&   r&   r'   r   d  r   r   c                       s$   e Zd Z fddZdd Z  ZS )r   c                    s`   d| _ t|j  |_t|dr#|jd ur#t|j  |_t j	||fi | d S )Nlm_headrW   )
rj   r   r   rV   cloner   r   rW   rk   rl   r   rw   r&   r'   rl   x  s
   zLmHeadLinearAllreduce.__init__c                 C   s   t |jd | jd}tt|jd | jdd| j }t|d d d d ||| f | j	dd}| j
d ur@tj|| j
d | jd urJ|| j7 }|S )NrP   r   r   rQ   r<   )r   r\   ro   r^   r   rq   rL   rS   rV   rT   rh   r=   r>   rW   )rv   r5   input_shard_sizeinput_shard_offsetrX   r&   r&   r'   rA     s   " 


zLmHeadLinearAllreduce.forward)rF   rG   rH   rl   rA   r   r&   r&   rw   r'   r   v  s    r   c                       s:   e Zd Z fddZdd ZdejdejfddZ  ZS )	TensorParallelConv2dc                    s*   t    || _|| _|| _| | d S r$   )rk   rl   rs   rr   shard_by_ocshard_weights)rv   convrs   rr   r   rw   r&   r'   rl     s
   
zTensorParallelConv2d.__init__c                 C   st  | j r
|jjd }n|jjd }d }dg}t| jd ddD ]}|| j }||| j k r0|d7 }||d |  q|jj}| j re||| j || jd   }|jd urd|jj|| j || jd   }n |d d || j || jd  f }|jd ur|jjt	| j }t
|jd |jd |j|j|j|j|j|jd u|j	| _tj
|| j_|jd urtj
|| j_~d S )Nr   r   rP   )r   rV   r\   rangerr   appendr   rs   rW   floatr   Conv2dkernel_sizestridepaddingdilationgroupspadding_moder   rL   r   )rv   r   
total_size	bias_datacols_per_rankicolsweight_datar&   r&   r'   r     s4   

 $
 
z"TensorParallelConv2d.shard_weightsr5   r:   c                 C   s
   |  |S r$   )r   r}   r&   r&   r'   rA     r   zTensorParallelConv2d.forward)	rF   rG   rH   rl   r   rL   rM   rA   r   r&   r&   rw   r'   r     s    r   c                       s   e Zd Z fddZ  ZS )TensorParallelOcShardConv2dc                       t  |||d d S r   rk   rl   rv   r   rs   rr   rw   r&   r'   rl        z$TensorParallelOcShardConv2d.__init__)rF   rG   rH   rl   r   r&   r&   rw   r'   r    s    r  c                       s2   e Zd Z fddZdejdejfddZ  ZS )TensorParallelIcShardConv2dc                    r  r   r  r  rw   r&   r'   rl     r  z$TensorParallelIcShardConv2d.__init__r5   r:   c                 C   s"   |  |}| jdkrt| |S )Nr   )r   rr   r=   r>   )rv   r5   outr&   r&   r'   rA     s   


z#TensorParallelIcShardConv2d.forward)rF   rG   rH   rl   rL   rM   rA   r   r&   r&   rw   r'   r    s    r  c                       s2   e Zd Zdejdddf fdd	Zdd Z  ZS )	NormalizeNh㈵>c                    sd   t t|   |d ur|| _|| _ntj||d|t 	 | _
| j
j| _| j
j| _|| _d S )Neps)rk   r	  rl   rV   rW   r   	LayerNormr   r   r   normr  )rv   r   r   r  rV   rW   rw   r&   r'   rl     s   "


zNormalize.__init__c                 C   s&   t jj||jdd  | j| j| jdS )NrP   r  )r   r   
layer_normr\   rV   rW   r  r}   r&   r&   r'   rA     s   &zNormalize.forwardrF   rG   rH   rL   r   rl   rA   r   r&   r&   rw   r'   r	    s    r	  c                       s0   e Zd Zdejddf fdd	Zdd Z  ZS )EmbeddingLayerNc                    sJ   t t|   |d u r ttj|d |d |t  d| _d S || _d S )Nr   r   r   )	rk   r  rl   r   rL   r   r   r   rV   )rv   r   r   rV   rW   rw   r&   r'   rl     s   


zEmbeddingLayer.__init__c                 C   s   t || jS r$   )F	embeddingrV   r}   r&   r&   r'   rA     s   zEmbeddingLayer.forward)rF   rG   rH   rL   r   rl   rA   r   r&   r&   rw   r'   r    s    r  c                       s@   e Zd ZdZd fdd	Zddejdedef fd	d
Z  Z	S )OPTEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    Nc                    s   d| _ t j||d d S )Nr   )rV   )offsetrk   rl   )rv   r   rV   rW   rw   r&   r'   rl     s   zOPTEmbedding.__init__r   attention_maskpast_key_values_lengthposition_idsc                    sN   |  }tj|dd||   d }|dd|df }t || j S )z3`input_ids_shape` is expected to be [bsz x seqlen].r   r   N)longrL   cumsumtype_asrk   rA   r  )rv   r  r  r  	positionsrw   r&   r'   rA     s    zOPTEmbedding.forward)NNN)r   r   )
rF   rG   rH   rI   rl   rL   
LongTensorintrA   r   r&   r&   rw   r'   r    s    &r  c                       s0   e Zd Zdejddf fdd	Zdd Z  ZS )RMSNormalizeNr
  c                    sF   t t|   |d ur|| _nttj||t 	 d| _|| _
d S )Nr   )rk   r  rl   rV   r   r   rL   onesr   r   r  )rv   r   r   r  rV   rw   r&   r'   rl     s
   
zRMSNormalize.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}|| j S )Nr   rP   T)keepdim)r   rL   float32powmeanrsqrtr  rV   r   float16bfloat16)rv   hidden_statesvariancer&   r&   r'   rA     s
   
zRMSNormalize.forwardr  r&   r&   rw   r'   r    s    	r  r   )ErL   	deepspeedr   r=   r   torch.nnr   r  torch.nn.parameterr   deepspeed.acceleratorr    deepspeed.module_inject.tp_shardr   r   deepspeed.runtime.zero.utilsr	   abcr
   r   typingr   r   r   r   r   fusedqkv_utilsr   r   r   !deepspeed.runtime.tensor_parallelr   r   r   r   __all__r.   r%   r   r   r(   r,   r0   autogradFunctionr1   rO   rd   r   r   r   r   r   r   r   r    r!   r   r   r   r   r   r   r  r  r	  r  r  r  r&   r&   r&   r'   <module>   sZ   
  >FV	
,