o
    TiO                     @   s   d dl mZ d dlZddlmZmZ ddlmZmZ 		ddej	de
d	e
d
ededej	fddZ	ddej	de
d	e
dedeej	 f
ddZde
d	e
de
de
fddZdS )    )OptionalN   )ShardingTypeDEFAULT_SHARD_GRANULARITY)shard_paramget_shard_endpointsFparam
shard_rank
num_shardsgatedis_moereturnc                 C   s>   |rdnd}|rt | tj||td |dS t | tj|||dS )a  
    Utility method for sharding an MLP 1 parameter. Both biases and weights are supported, as well
    as for fused weights for MoE.

    Args:
        param (torch.Tensor): The parameter to shard.
        shard_rank (int): Which shard of the partitioned tensor to return.
        num_shards (int): The total number of shards the parameter is distributed across.
        gated (bool): Whether or not the parameter is from a gated MLP.
       r   )granularity	bias_dims)r   )r   r   OUTER_DIMENSIONr   )r   r	   r
   r   r   r    r   m/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/inference/v2/model_implementations/sharding/mlp.pyshard_mlp_1_param   s   r   c                 C   s:   |rdnd}t | j|kr|dkr| S dS t| tj||S )ah  
    Utility method for sharding an MLP 2 parameter.

    Args:
        param (torch.Tensor): The parameter to shard.
        shard_rank (int): Which shard of the partitioned tensor to return.
        num_shards (int): The total number of shards the parameter is distributed across.
        is_moe (bool): Whether or not the parameter is from a MoE model.
    r   r   r   N)lenshaper   r   INNER_DIMENSION)r   r	   r
   r   bias_dim_sizer   r   r   shard_mlp_2_param*   s   r   intermediate_sizec                 C   s   t | ||}|d |d  S )aV  
    Utility method for getting the size of the intermediate dimension of a sharded MLP.

    Args:
        intermediate_size (int): The size of the intermediate dimension.
        num_shards (int): The total number of shards the parameter is distributed across.
        shard_rank (int): Which shard of the partitioned tensor to return.
    r   r   )r   )r   r
   r	   	endpointsr   r   r   sharded_intermediate_dimA   s   	r   )FF)F)typingr   torchtypesr   r   utilsr   r   Tensorintboolr   r   r   r   r   r   r   <module>   s8   



