o
    Ti>                  
   @   sJ  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ dd
dZz8eedrfeejdrfeejdrfejejje  dZejejje  dZne  jZe  jZW n eefy Z z
eZeZW Y dZ[ndZ[ww G dd dejjZdddZ G dd de	Z!dS )    N)Tensor)	Parameter)init)Module)noop_decorator)comm)get_acceleratorFc                 C   s(   t  dkr|s
|rt|  d S d S d S )Nr   )distget_rankprint)messagedebugforce r   Q/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/zero/linear.pyprint_rank_0   s   r   amp
custom_fwd
custom_bwd)device_typec                   @   s.   e Zd ZeedddZeedd ZdS )LinearFunctionForZeroStage3Nc                 C   s^   |  ||| | dkr|d urt||| }|S || }|d ur+||7 }|}|S )N   )save_for_backwarddimtorchaddmmtmatmul)ctxinputweightbiasretoutputr   r   r   forward5   s   z#LinearFunctionForZeroStage3.forwardc           	      C   s   | j \}}}d  } }}| jd r||}| jd r@| }|dkr9|d|jd  |d|jd }n| |}|d ura| jd ra|dkr\|dd t|d D }n|d}|||fS )Nr      r   c                 S   s   g | ]}|qS r   r   ).0ir   r   r   
<listcomp>i   s    z8LinearFunctionForZeroStage3.backward.<locals>.<listcomp>)	saved_tensorsneeds_input_gradr   r   reshapeshaper   sumrange)	r   grad_outputr   r    r!   
grad_inputgrad_weight	grad_biasr   r   r   r   backwardH   s$   




z$LinearFunctionForZeroStage3.backwardN)__name__
__module____qualname__staticmethodautocast_custom_fwdr$   autocast_custom_bwdr4   r   r   r   r   r   2   s    r   c                 C   s"   |d u r
t | |S t | ||S r5   )r   apply)r   r    r!   r   r   r   zero3_linear_wrapr   s   r=   c                	       s   e Zd ZU dZddgZeed< eed< eed< ddedededdf fd	d
Z	dddZ
dedefddZdefddZ  ZS )LinearModuleForZeroStage3a  Applies a linear transformation to the incoming data: :math:`y = xA^T + b`.
    The weights are pre-transposed and stored as A^T instead of transposing during each
    forward. Memory savings proportional to the parameter size.

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(N, *, H_{in})` where :math:`*` means any number of
          additional dimensions and :math:`H_{in} = \text{in\_features}`
        - Output: :math:`(N, *, H_{out})` where all but the last dimension
          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.

    Attributes:
        weight: the learnable weights of the module of shape
            :math:`(\text{out\_features}, \text{in\_features})`. The values are
            initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
            :math:`k = \frac{1}{\text{in\_features}}`
        bias:   the learnable bias of the module of shape :math:`(\text{out\_features})`.
                If :attr:`bias` is ``True``, the values are initialized from
                :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
                :math:`k = \frac{1}{\text{in\_features}}`

    Examples::

        >>> m = nn.Linear(20, 30)
        >>> input = torch.randn(128, 20)
        >>> output = m(input)
        >>> print(output.size())
        torch.Size([128, 30])
    in_featuresout_featuresr    Tr!   returnNc                    sb   t t|   td || _|| _tt||| _	|r%tt|| _
n| dd  |   d S )NzBuilding ZeRO moduler!   )superr>   __init__r   r?   r@   r   r   r   r    r!   register_parameterreset_parameters)selfr?   r@   r!   	__class__r   r   rC      s   z"LinearModuleForZeroStage3.__init__c                 C   sX   t j| jtdd | jd ur*t | j\}}dt| }t | j| | d S d S )N   )ar%   )r   kaiming_uniform_r    mathsqrtr!   _calculate_fan_in_and_fan_outuniform_)rF   fan_in_boundr   r   r   rE      s   
z*LinearModuleForZeroStage3.reset_parametersr   c                 C   s   t || j| jS r5   )r   r<   r    r!   )rF   r   r   r   r   r$      s   z!LinearModuleForZeroStage3.forwardc                 C   s   d | j| j| jd uS )Nz(in_features={}, out_features={}, bias={})formatr?   r@   r!   )rF   r   r   r   
extra_repr   s   z$LinearModuleForZeroStage3.extra_repr)T)rA   N)r6   r7   r8   __doc____constants__int__annotations__r   boolrC   rE   r$   strrT   __classcell__r   r   rG   r   r>   y   s   
 " 
r>   )FFr5   )"rL   	functoolsr   r   torch.nn.parameterr   torch.nnr   torch.nn.modules.moduler   deepspeed.runtime.utilsr   	deepspeedr   r	   deepspeed.acceleratorr   r   hasattrr   partialr   device_namer:   r   r;   ImportErrorAttributeErrorexpautogradFunctionr   r=   r>   r   r   r   r   <module>   s4   
"
@