# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from typing import Optional

import torch

from .types import ShardingType
from .utils import shard_param, get_shard_endpoints


def shard_qkv_param(param: torch.Tensor,
                    shard_rank: int,
                    num_shards: int,
                    head_size: int,
                    n_heads_q: Optional[int] = None,
                    n_heads_kv: Optional[int] = None) -> Optional[torch.Tensor]:
    """
    Utility method for sharding a QKV parameter. Both biases and weights are supported. It is assumed
    that the layout of the parameter is such that all Q heads, all K heads, and all V heads
    are contiguous with respect to each other.

    Args:
        param (torch.Tensor): The parameter to shard.
        shard_rank (int): Which shard of the partitioned tensor to return.
        num_shards (int): The total number of shards the parameter is distributed across.
        head_size (int): The size of each head.
        n_heads_q (int): The number of query heads. This only needs to be passed if the number
            of query and key/value heads are different. If passed without n_heads_kv, default
            MHA partitioning will be used.
        n_heads_kv (int): The number of key/value heads. This only needs to be passed if the number
            of query and key/value heads are different. This argument should not be passed without
            n_heads_q (we want to explicitly opt into GQA sharding).
    """
    if n_heads_kv is not None and n_heads_q is None:
        raise ValueError("n_heads_kv should not be passed without n_heads_q")

    if n_heads_q is None:
        # No head counts given: infer MHA from the fused [Q; K; V] parameter shape.
        if param.shape[0] // 3 % head_size != 0:
            raise ValueError("MHA param shape is not correct")
        n_heads_q = param.shape[0] // head_size // 3
        mha_sharding = True
    else:
        mha_sharding = n_heads_q == n_heads_kv

    if n_heads_q < num_shards:
        raise ValueError("There must be at least as many query heads as there are shards.")

    if mha_sharding:
        # Q, K, and V have the same head count, so the fused parameter shards as
        # three concatenated matrices partitioned at head granularity.
        return shard_param(param,
                           ShardingType.OUTER_DIMENSION,
                           shard_rank,
                           num_shards,
                           num_concatenated_matrices=3,
                           granularity=head_size)

    if n_heads_q % n_heads_kv != 0:
        raise ValueError("Must be an even ratio between query and key/value heads.")

    if param.shape[0] != (n_heads_q + 2 * n_heads_kv) * head_size:
        raise ValueError("GQA param shape is not correct")

    if n_heads_kv > num_shards and n_heads_kv % num_shards != 0:
        raise ValueError("Currently do not support uneven partitioning of KV heads for GQA.")

    if n_heads_kv < num_shards and num_shards % n_heads_kv != 0:
        raise ValueError("Currently do not support distributing KV heads across different numbers of shards.")

    # With at least as many KV heads as shards, every shard receives whole KV heads.
    even_kv_sharding = n_heads_kv >= num_shards

    if param is None:
        return None

    q_param = param[:head_size * n_heads_q]
    kv_param = param[head_size * n_heads_q:]

    if even_kv_sharding:
        # Shard Q on its own, then K and V together as two concatenated matrices.
        q_param = shard_param(q_param, ShardingType.OUTER_DIMENSION, shard_rank, num_shards, granularity=head_size)
        kv_param = shard_param(kv_param,
                               ShardingType.OUTER_DIMENSION,
                               shard_rank,
                               num_shards,
                               num_concatenated_matrices=2,
                               granularity=head_size)
        return torch.cat([q_param, kv_param], dim=0)
    else:
        # Fewer KV heads than shards: each KV head is replicated across
        # q_sharding_degree shards, and that group's query heads are split
        # among those same shards.
        q_sharding_degree = num_shards // n_heads_kv

        kv_head = shard_rank // q_sharding_degree

        k_param = kv_param[kv_head * head_size:(kv_head + 1) * head_size]
        v_param = kv_param[(n_heads_kv + kv_head) * head_size:(n_heads_kv + kv_head + 1) * head_size]

        q_sharding_rank = shard_rank % q_sharding_degree
        q_factor = n_heads_q // n_heads_kv

        # The query heads belonging to this shard's KV group.
        q_chunk = q_param[q_factor * head_size * kv_head:q_factor * head_size * (kv_head + 1)]

        q_param = shard_param(q_chunk,
                              ShardingType.OUTER_DIMENSION,
                              q_sharding_rank,
                              q_sharding_degree,
                              granularity=head_size)

        return torch.cat([q_param, k_param, v_param], dim=0)
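

# Worked example (an illustrative note, not part of the original module): with
# hypothetical GQA dimensions n_heads_q=64, n_heads_kv=8, head_size=128, and
# num_shards=16, n_heads_kv is smaller than num_shards, so each KV head is
# shared by q_sharding_degree = 16 // 8 = 2 shards. shard_rank=5 then maps to
# kv_head = 5 // 2 = 2 with q_sharding_rank = 1: that shard receives 4 of the
# group's 8 query heads plus the shared K and V heads, i.e.
# (4 + 2) * 128 = 768 rows of the fused QKV parameter.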


def qkv_out_features(in_features: int,
                     shard_rank: int,
                     num_shards: int,
                     head_size: int,
                     n_heads_q: Optional[int] = None,
                     n_heads_kv: Optional[int] = None) -> int:
    """
    Helper to calculate the expected output projection dimension of a QKV projection matrix.

    Args:
        in_features (int): The model dimension.
        shard_rank (int): Which rank to return the corresponding size for.
        num_shards (int): The total number of shards the parameter is distributed across.
        head_size (int): The size of each head.
        n_heads_q (int): The number of query heads. This only needs to be passed if the number
            of query and key/value heads are different. If passed without n_heads_kv, default
            MHA partitioning will be used.
        n_heads_kv (int): The number of key/value heads. This only needs to be passed if the number
            of query and key/value heads are different. This argument cannot be passed without also
            passing n_heads_q (we want to explicitly opt into GQA sharding).
    """
    if n_heads_kv is not None and n_heads_q is None:
        raise ValueError("n_heads_kv should not be passed without n_heads_q")

    mha_sharding = n_heads_q is None or n_heads_q == n_heads_kv

    if n_heads_q is not None and in_features != n_heads_q * head_size:
        raise ValueError("in_features is not consistent with n_heads_q and head_size")

    if mha_sharding:
        endpoints = get_shard_endpoints(in_features, shard_rank, num_shards, granularity=head_size)
        return (endpoints[1] - endpoints[0]) * 3

    if n_heads_kv >= num_shards:
        if n_heads_kv % num_shards != 0:
            raise ValueError("The KV heads must be evenly distributed across the shards.")

        n_local_groups = n_heads_kv // num_shards
        group_size = n_heads_q // n_heads_kv

        # Each local KV group contributes its query heads plus one K and one V head.
        return n_local_groups * head_size * (2 + group_size)
    else:
        if num_shards % n_heads_kv != 0:
            raise ValueError("A shared KV head must always partition across the same number of shards.")

        q_split_degree = num_shards // n_heads_kv
        q_split_rank = shard_rank % q_split_degree
        split_granularity = (n_heads_q // n_heads_kv) * head_size

        q_endpoints = get_shard_endpoints(split_granularity, q_split_rank, q_split_degree, granularity=head_size)

        # This shard's slice of the query group plus the replicated K and V heads.
        return q_endpoints[1] - q_endpoints[0] + 2 * head_size
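

# Worked example (an illustrative note, not part of the original module): for
# MHA with in_features=8192, head_size=128, and num_shards=16, each rank covers
# 8192 // 16 = 512 rows per matrix, so the local QKV output dimension is
# 512 * 3 = 1536.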
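

# ---------------------------------------------------------------------------
# Minimal smoke test: an illustrative sketch, not part of the original module.
# The GQA dimensions below are hypothetical and chosen to exercise the uneven
# KV-sharding path. Because this file uses package-relative imports, run it as
# `python -m deepspeed.inference.v2.model_implementations.sharding.qkv`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    head_size = 128
    n_heads_q = 64   # hypothetical query head count
    n_heads_kv = 8   # hypothetical KV head count (n_heads_kv < num_shards)
    num_shards = 16

    # Fused QKV weight: all Q rows, then all K rows, then all V rows.
    qkv = torch.randn((n_heads_q + 2 * n_heads_kv) * head_size, n_heads_q * head_size)

    for rank in range(num_shards):
        shard = shard_qkv_param(qkv, rank, num_shards, head_size,
                                n_heads_q=n_heads_q, n_heads_kv=n_heads_kv)
        expected = qkv_out_features(n_heads_q * head_size, rank, num_shards, head_size,
                                    n_heads_q=n_heads_q, n_heads_kv=n_heads_kv)
        assert shard.shape[0] == expected, (rank, shard.shape[0], expected)

    print("All shard sizes match qkv_out_features.")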