o
    Ti-                     @   sN   d dl Z d dlZd dlmZ d
ddZG dd de jjZG dd	 d	eZ	dS )    N)partition_uniformFc                 C   s6   |   d }tj| ||d}|rtdd |D S |S )a+  Split a tensor along its last dimension. Adapted from Megatron-LM.

    Arguments:
        tensor: input tensor.
        partitions: list of partition sizes to supply to torch.split
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.
       dimc                 s   s    | ]}|  V  qd S )N)
contiguous).0chunk r	   Q/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/zero/tiling.py	<genexpr>   s    z.split_tensor_along_last_dim.<locals>.<genexpr>)r   torchsplittuple)tensor
partitionscontiguous_split_chunkslast_dimtensor_listr	   r	   r
   split_tensor_along_last_dim   s
   
r   c                       s`   e Zd Zdddddejjdf fdd	Zdd Zd	d
 Zdd Z	dd Z
e dd Z  ZS )TiledLinearTr   FNc
                    s  t    |dk s||krtd|dk s||krtd|| _|| _|| _|| _|| _|| _|| _	t
||d| _t
||d| _t| j|d ksKJ t| j|d ksVJ | jd dks_J | j| |kshJ | j| |ksqJ tj | _t|D ]G}| jtj  | j|d  | j|  }t|D ]+}||d kr|nd}| j|d  | j|  }|||fd|i|
}| j| | qq{|	dur| |	 dS dS )	a@  A replacement for ``torch.nn.Linear`` that works with ZeRO-3 to reduce
        memory requirements via tiling.

        TiledLinear breaks the input and output dimensions of a linear layer
        into tiles that are processed in sequence. This class enables huge
        linear layers when combined with ZeRO-3 because inactive tiles can be
        partitioned and offloaded.

        .. note::
            We recommend using as few tiles as necessary. Tiling
            significantly reduces memory usage, but can reduce throughput
            for inexpensive layers. This due to the smaller kernels having
            less parallelism and lower arithmetic intensity, while
            introducing more frequent synchronization and communication.

        Args:
            in_features (int): See ``torch.nn.Linear``
            out_features (int): See ``torch.nn.Linear``
            bias (bool, optional): See ``torch.nn.Linear``
            in_splits (int, optional): The number of tiles along the input dimension. Defaults to 1.
            out_splits (int, optional): The number of tiles along the output dimension. Defaults to 1.
            input_is_already_split (bool, optional): If set to ``True``, assume that the ``input_`` in
                to ``forward()`` is already split into ``in_splits`` chunks. Defaults to ``False``.
            combine_out_splits (bool, optional): If set to ``False``, do not combine the ``out_splits`` outputs
                into a single tensor. Defaults to ``True``.
            linear_cls (class, optional): The underlying class to build individual tiles.
                Defaults to ``torch.nn.Linear``.
            init_linear (``torch.nn.Linear``, optional): If set, copy the parameters of
                ``init_linear``. Useful for debugging. Defaults to ``None``.
            kwargs (dict, optional): additional keyword arguments to provide to ``linear_cls()``.

        Raises:
            RuntimeError: ``in_splits`` must be within the range [1, in_features).
            RuntimeError: ``out_splits`` must be within the range of [1, out_features).
        r   z,in splits must be in range [1, in_features].z.out splits must be in range [1, out_features].)	num_items	num_partsr   FbiasN)super__init__RuntimeErrorin_featuresout_featuresuse_bias
out_splits	in_splitsinput_is_already_splitcombine_out_splits	partitionin_parts	out_partslenr   nn
ModuleListlinearsrangeappendcopy_params_from)selfr   r   r   r    r   r!   r"   
linear_clsinit_linearkwargsout_idlocal_out_dimin_id
local_biaslocal_in_dimlocal	__class__r	   r
   r   "   s@   
/	zTiledLinear.__init__c                    s   | j dkr$| js$t|jd | j   fddt| j D }| ||}n| j dkr@|}t|| j ks?J d| j  dt| n|g}d g| j }t| jD ]"}t| j D ]}| j| | || }| j	|||| |d||< qUqN| j
ry| |S |S )Nr   c                    s    g | ]} |d    |  qS )r   r	   )r   pinput_partsr	   r
   
<listcomp>        z'TiledLinear.forward.<locals>.<listcomp>zCol splits z does not match input splits r3   r1   current_outnew_out)r    r!   r#   shaper*   _split_global_inputr&   r   r)   _reduce_local_outputr"   _combine_output_splits)r-   input_split_sizesinputsoutputsr1   r3   local_outputr	   r;   r
   forward   s6   

zTiledLinear.forwardc                 C   s
   t ||S )a  Partition an input tensor along the last dimension, aligned with given splits.

        Subclasses should override this method to account for new input types.

        Args:
            input (List[Tensor]): The tensor to partition along the last dimension.
            split_sizes (List[int]): The size of each partition.

        Returns:
            List[Any]: A list of the chunks of ``input``.
        )r   )r-   inputrG   r	   r	   r
   rC      s   
zTiledLinear._split_global_inputc                 C   s   |du r|  S || S )a  Reduce (sum) a new local result into the existing local results.

        Subclasses should override this method.

        For a given ``out_id``, this method is called ``in_id-1`` times. The first input
        split is a simple assignment.

        Args:
            in_id (int): The input split that produced ``new_out``.
            out_id (int): The output split that produced ``new_out``.
            current_out (Any): The reduced form of all previous ``out_id`` results.
            new_out (Any): The local result from forward (``in_id``, ``out_id``)e

        Returns:
            Any: The combined result of ``current_out`` and ``new_out``.
        N)clone)r-   r3   r1   r@   rA   r	   r	   r
   rD      s   z TiledLinear._reduce_local_outputc                 C   s    t || jks	J tj|ddS )zJoin the splits of the output into a single result.

        Args:
            outputs (List[Any]): The reduced outputs for each output split.

        Returns:
            Any: The combined outputs.
        r9   r   )r&   r   r   cat)r-   rI   r	   r	   r
   rE      s   	z"TiledLinear._combine_output_splitsc           
   
   C   s|  t |dsJ |j | j| jfksJ | jr1t |dsJ |jdus%J |j | jfks0J n|jdu s8J t| jD ]~}| j	| }| j	|d  }t| j
D ]?}| j| }| j|d  }| j| | }|j||||f }	tjj|jdd |j|	 W d   n1 sw   Y  qP|jdurtjj|jdd |jj|j|| j W d   n1 sw   Y  q=dS )a  Copy the weight and bias data from ``other``.

        This is especially useful for reproducible initialization and testing.

        Equivalent to:

        .. code-block:: python

            with torch.no_grad():
                self.weight.copy_(other.weight)
                if self.bias is not None:
                    self.bias.copy_(other.bias)

        .. note::
            If ZeRO-3 is enabled, this is a collective operation and the
            updated parameters of data-parallel rank 0 will be visible on all
            ranks. See :class:`deepspeed.zero.GatheredParameters` for more
            information.


        Args:
            other (``torch.nn.Linear``): the linear layer to copy from.
        weightr   Nr   r   )modifier_rank)hasattrrO   sizer   r   r   r   r*   r   r%   r    r$   r)   	deepspeedzeroGatheredParameterscopy_data)
r-   otherrowrstartrstopcolcstartcstopr6   global_weightr	   r	   r
   r,      s2   


zTiledLinear.copy_params_from)__name__
__module____qualname__r   r'   Linearr   rK   rC   rD   rE   no_gradr,   __classcell__r	   r	   r7   r
   r       s    ^r   c                       s,   e Zd ZdZ fddZ fddZ  ZS )TiledLinearReturnBiaszeWrapper for a Linear class that returns its own bias parameter, such as
    used by Megatron-LM.
    c           	         st   |dur	|\}}nd\}}t |tsJ t|dksJ |\}}|dus&J t j||||d}|du r6|}||fS )z3Reduces output tensors, but not the returned bias. N)NN   r?   )
isinstancer   r&   r   rD   )	r-   r3   r1   r@   rA   
old_tensorold_biasr   r   r7   r	   r
   rD     s   
z*TiledLinearReturnBias._reduce_local_outputc                    sT   dd |D }t  |}dd |D }t|dkr$t  |}||fS d }||fS )Nc                 S   s   g | ]}|d  qS )r   r	   r   or	   r	   r
   r=     s    z@TiledLinearReturnBias._combine_output_splits.<locals>.<listcomp>c                 S   s    g | ]}|d  dur|d  qS )r   Nr	   rk   r	   r	   r
   r=   "  r>   r   )r   rE   r&   )r-   rI   tensorsr   biasesr   r7   r	   r
   rE     s   z,TiledLinearReturnBias._combine_output_splits)r`   ra   rb   __doc__rD   rE   re   r	   r	   r7   r
   rf     s    rf   )F)
r   rS   deepspeed.runtime.utilsr   r#   r   r'   Moduler   rf   r	   r	   r	   r
   <module>   s   
 d