o
    پi                     @  s  d Z ddlmZ ddlZddlZddlmZmZmZm	Z	m
Z
 ddlZddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, erddl-m.Z.m/Z/ e* Z0e0oe(dZ1e2e3Z4g dZ5e) Z6e+ Z7dd Z8d0ddZ9dd Z:dd Z;G d d! d!ejj<Z=G d"d# d#e=Z>G d$d% d%e=Z?G d&d' d'e?Z@G d(d) d)e?ZAG d*d+ d+e=ZBG d,d- d-e=ZCG d.d/ d/ej<ZDdS )1zhAdapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/linear.py    )annotationsN)TYPE_CHECKINGDictListOptionalTuple)nn)	ParameterUninitializedParameter)divideget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizeget_tp_groupsplit_tensor_along_last_dim tensor_model_parallel_all_gather tensor_model_parallel_all_reduce)use_symmetric_memory)get_attention_tp_groupis_allocation_symmetric)BasevLLMParameterBlockQuantScaleParameterPackedColumnParameterPackedvLLMParameterPerTensorScaleParameterRowvLLMParameter_ColumnvLLMParameter)pad_or_narrow_weight)get_bool_env_varis_cpuis_hipis_npuset_weight_attrs)QuantizationConfigQuantizeMethodBaseSGLANG_ROCM_DISABLE_LINEARQUANT)CompressedTensorsLinearMethodAWQMarlinLinearMethodAWQLinearMethodAWQLinearAscendMethodGPTQMarlinLinearMethodFp8LinearMethodBlockInt8LinearMethodMarlinLinearMethodQQQLinearMethodGPTQMarlin24LinearMethodTPUInt8LinearMethodGPTQLinearMethodFBGEMMFp8LinearMethodGPTQLinearAscendMethodModelOptFp8LinearMethodModelOptFp4LinearMethodIPEXAWQLinearMethodPetitNvFp4LinearMethodQuarkInt4Fp8LinearMethodc                 C  s,   t | dd }|d u r||fS || || fS )Nmarlin_tile_size)getattr)param
shard_sizeshard_offsetr8    r=   L/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/linear.pyadjust_marlin_shardP   s   r?   r:   r	   shard_offsetsDict[str, Tuple[int, int]]loaded_shard_idstrreturnTuple[int, int]c           
      C  sD   |d \}}|| \}}| j jd }|| | }|| | }	|	|fS )zDAdjust the quantization offsets and sizes for BitsAndBytes sharding.totalr   )datashape)
r:   r@   rB   rF   _orig_offset	orig_sizequantized_totalquantized_offsetquantized_sizer=   r=   r>   adjust_bitsandbytes_4bit_shardX   s   rO   c                 C  sl   dddd}t |tr|| }nt |tstd| t|jdkr0|jd dks,J |d }| | |fS )a  For fused modules (QKV and MLP) we have an array of length
    N that holds 1 scale for each "logical" matrix. So the param
    is an array of length N. The loaded_weight corresponds to
    one of the shards on disk. Here, we slice the param based on
    the shard_id for loading.
    r         qkvzUnknown Shard Id )
isinstancerC   int
ValueErrorlenrH   )r:   loaded_weightshard_idqkv_idxsr=   r=   r>   adjust_scalar_to_fused_arrayg   s   


r]   c                 C  sn   | |}| d d | d d  }||kr5g }d}| D ]\}}}	||	 | }
||||
f ||
7 }q|S | S )Nr   )sizeappend)r@   rZ   dimactual_weight_sizetarget_weight_sizenew_shard_offsets
new_offsetr[   r<   r;   actual_shard_sizer=   r=   r>   adjust_shard_offsets~   s   

rh   c                      s6   e Zd ZdZ				dd fddZdddZ  ZS )
LinearBaseaa  Base linear layer.

    Args:
        input_size: input dimension of the linear layer.
        output_size: output dimension of the linear layer.
        bias: If true, add bias.
        skip_bias_add: If true, skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
    FN 
input_sizerW   output_sizeskip_bias_addboolparams_dtypeOptional[torch.dtype]quant_configOptional[QuantizationConfig]prefixrC   c                   sl   t    || _|| _|| _|d u rt }|| _|| _|d u r,ddl	m
} | | _d S |j| |d| _d S )Nr   )UnquantizedLinearMethodrs   )super__init__rk   rl   rm   torchget_default_dtypero   rq   &sglang.srt.layers.quantization.unquantrt   quant_methodget_quant_method)selfrk   rl   rm   ro   rq   rs   rt   	__class__r=   r>   rw      s   
	zLinearBase.__init__xtorch.TensorrD   c                 C  s   t N)NotImplementedError)r}   r   r=   r=   r>   forward   s   zLinearBase.forwardFNNrj   )rk   rW   rl   rW   rm   rn   ro   rp   rq   rr   rs   rC   )r   r   rD   r   )__name__
__module____qualname____doc__rw   r   __classcell__r=   r=   r~   r>   ri      s    ri   c                      sL   e Zd ZdZ					d!d" fddZd#ddZd$ddZd%dd Z  ZS )&ReplicatedLineara  Replicated linear layer.

    Args:
        input_size: input dimension of the linear layer.
        output_size: output dimension of the linear layer.
        bias: If true, add bias.
        skip_bias_add: If true, skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
        prefix: The name of the layer in the state dict, including all parents
                        (e.g. model.layers.0.qkv_proj)
    TFNrj   rk   rW   rl   biasrn   rm   ro   rp   rq   rr   rs   rC   c              	     s   t  j||||||d | jd usJ | jj| | j| jg| j| j| j| jd |rAtt	j
| j| jd| _t| jd| jd d S | dd  d S )Nru   )weight_loaderdtyper   
output_dimr   r   )rv   rw   r{   create_weightsrk   rl   ro   r   r	   rx   emptyr   r!   register_parameter)r}   rk   rl   r   rm   ro   rq   rs   r~   r=   r>   rw      s:   


zReplicatedLinear.__init__r:   r	   rZ   r   c                 C  s   t |jdkr|d}trI| | kr3|ddkr3t||d r,|d d }nt| d|jtj	ks?|jtj	krI|j|jksIJ d| | ksSJ |j
| d S )Nr   rP   z are not all equalz:init para dtype and loaded weight dtype should be the same)rY   rH   reshape_is_npur`   rx   allcloserX   r   int8rG   copy_r}   r:   rZ   r=   r=   r>   r      s   
zReplicatedLinear.weight_loaderr   rD   +Tuple[torch.Tensor, Optional[torch.Tensor]]c                 C  sF   | j s| jnd }| jd usJ | j| ||}| j r| jnd }||fS r   )rm   r   r{   apply)r}   r   r   outputoutput_biasr=   r=   r>   r     s
   zReplicatedLinear.forwardc                 C  s4   d| j  }|d| j 7 }|d| jd u 7 }|S )Nin_features=, output_features=, bias=)rk   rl   r   r}   sr=   r=   r>   
extra_repr  s   zReplicatedLinear.extra_repr)TFNNrj   )rk   rW   rl   rW   r   rn   rm   rn   ro   rp   rq   rr   rs   rC   r:   r	   rZ   r   )r   r   rD   r   rD   rC   )	r   r   r   r   rw   r   r   r   r   r=   r=   r~   r>   r      s    
-
r   c                      s`   e Zd ZdZ											d)d* fddZd+d d!Zd+d"d#Zd$d% Zd,d'd(Z  Z	S )-ColumnParallelLinearaC  Linear layer with column parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its second dimension as A = [A_1, ..., A_p].

    Args:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias.
        gather_output: If true, call all-gather on output and make Y available
                       to all GPUs, otherwise, every GPU will have its output
                       which is Y_i = XA_i
        skip_bias_add: This was added to enable performance optimizations where
                       bias can be fused with other element-wise operations. we
                       skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
        output_sizes: list of output sizes packed into one output, like for QKV
                       the list would be size 3.
        prefix: The name of the layer in the state dict, including all parents
                        (e.g. model.layers.0.qkv_proj)
    TFNrj   rk   rW   rl   r   rn   gather_outputrm   ro   rp   rq   rr   output_sizesOptional[List[int]]rs   rC   tp_rankOptional[int]tp_sizeuse_presharded_weightsskip_block_quant_checkc              
     s  t  ||||||	 || _|| _|
d u rt }
 d u rt  |
 | _| _| jd us-J t	| j
 | _| jg| _t| drI fdd| jD | _|d u rP|g}| jj| | j| j| j| j
| j|| jjjtv ri| jn| jd |rttj| j|d| _t| jd| jd d S | dd  d S )	Nr   c                   s   g | ]}t | qS r=   )r   .0rl   r   r=   r>   
<listcomp>O  s    
z1ColumnParallelLinear.__init__.<locals>.<listcomp>layerinput_size_per_partitionoutput_partition_sizesrk   rl   ro   r   r   r   r   r   r   )rv   rw   r   r   r   r   r   r   r{   r   rl   output_size_per_partitionr   hasattrr   r   rk   ro   r   r   WEIGHT_LOADER_V2_SUPPORTEDweight_loader_v2r   r	   rx   zerosr   r!   r   )r}   rk   rl   r   r   rm   ro   rq   r   rs   r   r   r   r   r~   r   r>   rw   -  sT   


zColumnParallelLinear.__init__r:   r	   rZ   r   c                 C  s   t |dd }|j}t |dd}t |dd}|r| |_|r,t|tr,|j|j|jd t |dd}|d urc|sc|j| }| j	| }	t
rYddlm}
 |
||d|	||| j \}}n
| jsc|||	|}t|jdkro|d	}|j|jkswJ || d S )
Nr   is_gguf_weightFis_gguf_weight_typer   use_bitsandbytes_4bitr   %narrow_padded_param_and_loaded_weightrP   )r9   rG   itemweight_typerV   r
   materializerH   r   r   _is_cpu$sglang.srt.model_loader.weight_utilsr   r   narrowrY   r   r   )r}   r:   rZ   r   
param_datar   r   r   r;   	start_idxr   r=   r=   r>   r   r  s>   





z"ColumnParallelLinear.weight_loaderc                 C     t |jdkr| dksJ |d}t|tr%|j|| j| jd d S z|j|| j| jd W d S  t	yA   || Y d S w Nr   rP   )r   r   )
rY   rH   numelr   rV   r   load_column_parallel_weightr   r   	TypeErrorr   r=   r=   r>   r     s$   


	z%ColumnParallelLinear.weight_loader_v2c                 C  sZ   | j s| jnd }| jd usJ | j| ||}| jrt|}n|}| j r'| jnd }||fS r   )rm   r   r{   r   r   r   )r}   input_r   output_parallelr   r   r=   r=   r>   r     s   
zColumnParallelLinear.forwardrD   c                 C  T   d| j  }|d| j 7 }|d| jd u 7 }|d| j 7 }|d| j 7 }|S )Nr   r   r   
, tp_size=z, gather_output=)rk   r   r   r   r   r   r=   r=   r>   r        zColumnParallelLinear.extra_repr)TFFNNNrj   NNFF)rk   rW   rl   rW   r   rn   r   rn   rm   rn   ro   rp   rq   rr   r   r   rs   rC   r   r   r   r   r   rn   r   rn   r   r   
r   r   r   r   rw   r   r   r   r   r   r=   r=   r~   r>   r     s$    
E
1r   c                      sf   e Zd ZdZ									d(d) fddZ	d*d+dd Zd,d"d#Zd,d$d%Z	d*d-d&d'Z  Z	S ).MergedColumnParallelLineara(  Packed linear layers with column parallelism.

    Similar to ColumnParallelLinear, but the weight matrix is concatenated
    along the output dimension. When the weight matrix is loaded, the
    different partitions are sharded separately.

    Args:
        input_size: input dimension of the linear layer.
        output_sizes: list of output dimensions of the linear layer.
        bias: If true, add bias.
        gather_output: If true, call all-gather on output and make the output
                       available to all GPUs, otherwise, every GPU will have
                       its own output.
        skip_bias_add: This was added to enable performance optimizations where
                       bias can be fused with other element-wise operations. we
                       skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
        prefix: The name of the layer in the state dict, including all parents
                        (e.g. model.layers.0.qkv_proj)
    TFNrj   rk   rW   r   	List[int]r   rn   r   rm   ro   rp   rq   rr   rs   rC   r   r   r   r   c                   s   || _ |	d u r
t }	 d u rt  |	 | _| _t fdd|D s%J || _t j|t	||||||||	 |d || _
d S )Nc                 3  s    | ]	}|  d kV  qdS )r   Nr=   r   r   r=   r>   	<genexpr>   s    z6MergedColumnParallelLinear.__init__.<locals>.<genexpr>)rk   rl   r   r   rm   ro   rq   rs   r   r   r   )r   r   r   r   r   allr   rv   rw   sumrs   )r}   rk   r   r   r   rm   ro   rq   rs   r   r   r   r~   r   r>   rw     s,   
z#MergedColumnParallelLinear.__init__r:   r	   rZ   r   rB   c                   s  t |dd}t |dd}|r|j| | | |j|< d S |rQt |dd }||| j }| j| }||||}|j	
| t|j|j|< |j
| d S |j}	t |dd }t |dd}
t |dd}|d u r|d u r|ryt|	|d\}	}|	j|jksJ |	| d S d}g }t| jD ]\}}|
|||f ||7 }qt |dd }t |d	d}trt|||}|D ]S\}}}||kr||j }||j }t|||\}}|rttdg| j   fd
dt| jD }| jdf|d< t||t|\}}||||}| ||| qd S |t| jk sJ |d urt| jd | | j }| j| | j }t |dd }||krN||j }||j }t|||\}}t |d	d}|rc|j| }|j| | }|	|||}	| j| }trddlm} ||	|d|||| o| j  \}	}nQ|s| j s|| }||j| krt!||||}n6||||}n.|
r|jd }|| }|	d||}	n|rt|	||\}	}nt |dd}|st"#d |	j|jksJ |	| d S )Nr   Fr   r   is_metadataneeds_scalar_to_arrayr   
packed_dimr   c                   s"   i | ]\}}t | | |fqS r=   )rC   )r   ir`   indexr=   r>   
<dictcomp>Z  s    z<MergedColumnParallelLinear.weight_loader.<locals>.<dictcomp>rF   r   ignore_warningzLoading a weight without `output_dim` attribute in MergedColumnParallelLinear, assume the weight is the same for all partitions.)$r9   rG   r   r   shard_weight_typer`   r   r   r   r[   ra   rY   data_containershard_id_mapr]   rH   	enumerater   r   rh   pack_factorr?   list	itertools
accumulaterl   rO   rC   r   r   r   r   r   r   loggerwarning)r}   r:   rZ   rB   r   r   r   r;   r   r   r   r   current_shard_offsetr@   r   rl   r   r   r[   r<   orig_offsetsloaded_weight_shardr   end_idxr   r=   r   r>   r     s   	
















z(MergedColumnParallelLinear.weight_loaderr   c                 C  s   d}g }t | jD ]\}}||||f ||7 }q	|D ]*\}}}	t|ttfr7|j|jkr7|j|	|d\}	}|	|j||	}
| 
||
| qdS )a  
        Handle special case for models where MLP layers are already
        fused on disk. In this case, we have no shard id. This function
        determmines the shard id by splitting these layers and then calls
        the weight loader using the shard id.

        An example of a model with these fused layers:
        https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
        r   r;   r<   N)r   r   ra   rV   r   r   r   r    adjust_shard_indexes_for_packingr   r   )r}   r:   rZ   r   r@   r   rl   r[   r<   r;   r   r=   r=   r>   "_load_fused_module_from_checkpoint  s"   

z=MergedColumnParallelLinear._load_fused_module_from_checkpointc              
   C  s   | j jj}|d |d }}t|ddrdn|}g }g }d}| jD ]}	|	| d | }
||
 || ||
7 }q!tt||D ]'\}\}}
||j	||
}|| j
 }|
| j
 }|j||||| j| j
| jd qAdS )z
        Handle block-wise scale loading for MergedColumnParallelLinear.
        Similar to QKVParallelLinear._load_qkv_block_scale, but for merged column layers.
        r   rP   format_ue8m0F)rZ   r[   r<   r;   r   r   r   N)r{   rq   weight_block_sizer9   r   ra   r   zipr   r   r   load_merged_column_weightr   r   )r}   r:   rZ   r   block_nrI   shard_block_sizesshard_block_offsetscurrent_block_offsetrl   shard_block_sizer[   shard_block_offsetr   rank_shard_offsetrank_shard_sizer=   r=   r>   _load_merged_block_scale  s:   






z3MergedColumnParallelLinear._load_merged_block_scalec           
   	   C  sV  |d u r?t |tr|j|d| j| jd d S t |tr#| || d S t|tt	fv r7|j|| j| jd d S | 
|| d S |t| jk sHJ t |tr| jjj}|d |d }}t|ddrcdn|}t| jd | | d | | j }| j| | d | | j }	nt| jd | | j }| j| | j }	|j||||	| j| j| jd d S )Nr   )rZ   r[   r   r   )rZ   r   r   rP   r   F)rZ   r[   r<   r;   r   r   r   )rV   r   r   r   r   r   r   typer   r   r   rY   r   r{   rq   r   r9   r   r   )
r}   r:   rZ   rB   r   raw_block_nrI   r   r<   r;   r=   r=   r>   r     s\   




z+MergedColumnParallelLinear.weight_loader_v2)	TFFNNrj   NNF)rk   rW   r   r   r   rn   r   rn   rm   rn   ro   rp   rq   rr   rs   rC   r   r   r   r   r   rn   r   )r:   r	   rZ   r   rB   r   r:   r   rZ   r   )r:   r   rZ   r   rB   r   )
r   r   r   r   rw   r   r   r   r   r   r=   r=   r~   r>   r     s&    ) 
'
$1r   c                      s~   e Zd ZdZ											d/d0 fddZd1ddZd1dd Zd2d%d&Zd2d'd(Z	d3d4d*d+Z		d3d5d-d.Z
  ZS )6QKVParallelLineara1  Linear layers for the attention's QKV transformation.

    Linear layers for the linear transformation of the query, key, and value
    vectors in the attention layer. The weight matrix is concatenated along
    the output dimension. The layer is parallelized along the head dimension.
    When the number of key/value heads is smaller than the number of query
    heads (e.g., multi-query/grouped-query attention), the key/value head may
    be replicated while the query heads are partitioned.

    Args:
        hidden_size: input hidden state size of the transformer.
        head_size: size of each attention head.
        total_num_heads: total number of attention query heads.
        total_num_kv_heads: total number of attention key/value heads. If
                            None, assume total_num_kv_heads = total_num_heads.
        bias: If true, add bias.
        skip_bias_add: This was added to enable performance optimizations where
                       bias can be fused with other element-wise operations. we
                       skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
        prefix: The name of the layer in the state dict, including all parents
                        (e.g. model.layers.0.qkv_proj)
    NTFrj   hidden_sizerW   	head_sizetotal_num_headstotal_num_kv_headsr   r   rn   rm   ro   rp   rq   rr   rs   rC   r   r   load_presharded_attnv_head_sizer   c                   sf  || _ || _|d ur|n|| _|| _|d u r|}|| _|
d u r"t }
|d u r)t }|
|| _| _t	| j|| _
|| jkrGd| _t	|| j| _n
t	| j|| _d| _| j
| j | _| j| j | _| j| j | _| j }| j
| j | j| j  | j| j  | }| j
| j | | j| j | | j| j | g| _|| _trd n|}t j|||d||||	|
|| j|d d S )NrP   F)rk   rl   r   r   rm   ro   rq   rs   r   r   r   r   )r  r  r	  r  r  r   r   r   r   r   	num_headsnum_kv_headsnum_kv_head_replicasq_proj_shard_sizekv_proj_shard_sizev_proj_shard_sizer   r   _disable_hip_linear_quantrv   rw   )r}   r  r  r  r  r   rm   ro   rq   rs   r   r   r  r	  r   rk   rl   r~   r=   r>   rw   \  sb   




zQKVParallelLinear.__init__rB   c                 C  sH   d| j | j | j | j | j | j | j | j | j| j  d}||S )Nr   rS   rT   rU   rF   r
  r  r  r	  get)r}   rB   shard_offset_mappingr=   r=   r>   _get_shard_offset_mapping  s   


z+QKVParallelLinear._get_shard_offset_mappingc                 C  s.   | j | j | j| j | j| j d}||S )NrR   r  )r}   rB   shard_size_mappingr=   r=   r>   _get_shard_size_mapping  s
   



z)QKVParallelLinear._get_shard_size_mappingr:   r   rZ   r   c                 C  s   dd| j | j fd| j | j | j| j fd| j | j | j | j| j fg}|D ]-\}}}t|ttfrB|j|jkrB|j	||d\}}| j
sM||j||}| ||| q'dS )a  
        Handle special case for models where QKV layers are already
        fused on disk. In this case, we have no shard id. This function
        determmines the shard id by splitting these layers and then calls
        the weight loader using the shard id.

        An example of a model with these fused layers:
        https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
        rS   r   rT   rU   r   N)r  r  r  r	  rV   r   r   r   r   r   r   r   r   )r}   r:   rZ   r@   r[   r<   r;   r   r=   r=   r>   r     s.   



z4QKVParallelLinear._load_fused_module_from_checkpointc              
   C  s   | j jj\}}| j| j | }| j| j | }| j| j | }dd|fd||fd|| |fg}|D ]*\}	}
}||j|
|}| |	| }| 	|	| }|j
|| j|	||| j| jd q1d S )NrS   r   rT   rU   rZ   r
  r[   r<   r;   r   r   )r{   rq   r   r  r  r  r   r   r  r  load_qkv_weightr  r   r   )r}   r:   rZ   r   rI   q_sizek_sizev_sizer@   r[   r<   r;   r   r   r   r=   r=   r>   _load_qkv_block_scale  s0   z'QKVParallelLinear._load_qkv_block_scaleOptional[str]c           
   	   C  s
  |d u r7t |tr|j|dd d S t|ttfv r"|j|d d S t |tr/| || d S | || d S |dv s=J | 	|}| 
|}t |trt| jjj}|d |d }}t|ddrbdn|}	||	 d |	 }||	 d |	 }|j|| j|||| j| jd d S )	Nr   )rZ   r[   )rZ   rR   rP   r   Fr  )rV   r   r  r   r   r   r   r  r   r  r  r{   rq   r   r9   r  r   r   )
r}   r:   rZ   rB   r<   r;   r   r  rI   r   r=   r=   r>   r     s<   






z"QKVParallelLinear.weight_loader_v2r	   c                 C  s  t |dd}t |dd}|r+|d ur+dddd}|j||  | | |j|< d S |r]t |dd }||| j }| j| }	|||	|}|j	
| t|j|j|< |j
| d S |j}
t |dd }t |d	d}t |d
d}|d u r=|d u r|rt|
|d\}
}|
j|jksJ |
| d S dd| j| j fd| j| j | j| j fd| j| j | j | j| j fg}t |dd}t |dd }trt|||}|D ]k\}}}||kr||j }||j }t|||\}}|r(d| j| j f| j| j | j| j f| j| j | j | j| j f| j| j | j | j| j  dfd}t|||\}}| js3||||}| ||| qd S |dv sDJ |d ur#|dkrWd}| j| j }n&|dkri| j| j }| j| j }n|dkr}| j| j | j }| j| j }t |dd }||kr||j }||j }t|||\}}t |dd}|rd| j| j f| j| j | j| j f| j| j | j | j| j f| j| j | j | j| j  dfd}t|||\}}|
|||}
|dkr| j}n| j| j }|| }	trddlm} ||
|d|	||| o| j \}
}nB|s"| js"|||	|}n3|r<|jd }g d |}|
d|| |}
n|rHt|
||\}
}nt |dd}|sVt!"d |
j|jksiJ d|
jd|j|
| d S )Nr   Fr   r   rP   rQ   rR   r   r   r   rS   rT   rU   r   r   r  r   r   zwLoading a weight without `output_dim` attribute in QKVParallelLinear, assume the weight is the same for all partitions.param_data.shape= loaded_weight.shape=)#r9   rG   r   r   r   r`   r   r   r   r[   ra   rY   r   r   r]   rH   r  r  r  r	  r   rh   r   r?   rO   r   r   r
  r  r  r   r   r   r   r   )r}   r:   rZ   rB   r   r   idx_mapr   r;   r   r   r   r   r@   r   r   r[   r<   orig_qkv_offsetsr   r   shard_indexr   r=   r=   r>   r   '  s2  	




























zQKVParallelLinear.weight_loader)NTFNNrj   NNFNF)r  rW   r  rW   r  rW   r  r   r   rn   rm   rn   ro   rp   rq   rr   rs   rC   r   r   r   r   r  rn   r	  r   r   rn   )rB   rC   r  r   )r:   r   rZ   r   rB   r  )r:   r	   rZ   r   rB   r  )r   r   r   r   rw   r  r  r   r  r   r   r   r=   r=   r~   r>   r  B  s,    
E



-!.r  c                      sb   e Zd ZdZ											d)d* fddZd+dd Zd,d"d#Zd-d$d%Zd.d'd(Z  Z	S )/RowParallelLineara  Linear layer with row parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its first dimension and X along its second dimension as:
               -   -
              | A_1 |
              | .   |
          A = | .   |        X = [X_1, ..., X_p]
              | .   |
              | A_p |
               -   -
    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias. Note that bias is not parallelized.
        input_is_parallel: If true, we assume that the input is already
                           split across the GPUs and we do not split
                           again.
        skip_bias_add: This was added to enable performance optimization where
                       bias can be fused with other element-wise operations.
                       We skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
    TFNrj   rk   rW   rl   r   rn   input_is_parallelrm   ro   rp   reduce_resultsrq   rr   rs   rC   r   r   r   r   use_dp_attention_reducec              	     s   t rd n|}t ||||||	 || _|| _|| _|
d u r!t }
|d u r(t }|
|| _| _	t
|| j	| _| jd us=J || _| jj| | j| jg| j| j| j| jjjtv rY| jn| jd |rwttj| j|d| _t| jd| jd d S | dd  d S )N)r   r   r   rk   rl   ro   r   r   r   r   r   )r  rv   rw   r%  r&  r'  r   r   r   r   r   r   r{   r   r   rl   rk   ro   r   r   r   r   r   r	   rx   r   r   r!   r   )r}   rk   rl   r   r%  rm   ro   r&  rq   rs   r   r   r   r'  r~   r=   r>   rw     sF   zRowParallelLinear.__init__r:   r	   rZ   r   c                 C  sR  t |dd }t |dd}t |dd}t |dd}|r| |_|r@t|tr@t|j}|r6|| | j ||< |jt	||j
d |j}|d ur|s| js|j| }	| j|	 }
trjddlm} |||d|
||	\}}n|
|	 }||j| kr}t|||
|	}n|||
|	}t|jdkr|d	}|j|jksJ d
|jd|j|| d S )N	input_dimr   Fr   r   r   r   r   rP   r  r   )r9   r   r   rV   r
   r   rH   r   r   tupler   rG   r   r   r   r   r   r   r   rY   r   r   )r}   r:   rZ   r(  r   r   r   weight_shaper   r;   r   r   r   r=   r=   r>   r   W  sV   






zRowParallelLinear.weight_loaderr   c                 C  r   r   )
rY   rH   r   r   rV   r   load_row_parallel_weightr   r   r   r   r=   r=   r>   r     s$   


	z"RowParallelLinear.weight_loader_v2c           	      C  s   | j r|}nt|| jd}|| j  }| jd usJ | jdks#| jr%d n| j}tt	 t
  d | jj| ||d}W d    n1 sEw   Y  | jrc| jdkrc|sc| jr^t |}nt|}n|}| jrk| jnd }||fS )N)num_partitionsr   )disabled)r   rP   )r%  r   r   r   
contiguousr{   rm   r   r   r   r   r   r&  r'  r   
all_reducer   )	r}   r   skip_all_reduceinput_parallelsplitted_inputbias_r   r   r   r=   r=   r>   r     s(   

zRowParallelLinear.forwardrD   c                 C  r   )Nzinput_features=r   r   r   z, reduce_results=)r   rl   r   r   r&  r   r=   r=   r>   r     r   zRowParallelLinear.extra_repr)TTFNTNrj   NNFF)rk   rW   rl   rW   r   rn   r%  rn   rm   rn   ro   rp   r&  rn   rq   rr   rs   rC   r   r   r   r   r   rn   r'  rn   r   r  )Fr   r   r=   r=   r~   r>   r$     s$    
=
?
r$  c                      s@   e Zd ZdZ				dd  fddZd!ddZd"ddZ  ZS )#"MergedColumnParallelRepeatedLineara  Merged column parallel linear and repeated linear layer.

    TODO: quantization is not supported yet.
    Args:
        input_size: input dimension of the linear layer.
        column_output_sizes: output dimension of the column linear layers.
        repeated_output_sizes: output dimension of the repeated linear layers.
        skip_bias_add: If true, skip adding bias but instead return it.
        params_dtype: Data type for the parameters.
        quant_config: Quantization configure.
    FNrj   rk   rW   column_output_sizesr   repeated_output_sizesrm   rn   ro   rp   rq   rr   rs   rC   c           	   
     s   t |t | }t j||||||d t| _t  _t  _ fdd|D |  _	 j
j  j j	 j j jd jd | _d S )N)rk   rl   rm   ro   rq   rs   c                   s   g | ]}t | jqS r=   )r   r   )r   r   r}   r=   r>   r      s    z?MergedColumnParallelRepeatedLinear.__init__.<locals>.<listcomp>Tr   )r   rv   rw   rY   num_column_parallelr   r   r   r   r   r{   r   rk   rl   ro   r   rs   )	r}   rk   r5  r6  rm   ro   rq   rs   rl   r~   r7  r>   rw     s8   



z+MergedColumnParallelRepeatedLinear.__init__r   r   rD   c                 C  s   | j | |S r   )r{   r   )r}   r   r=   r=   r>   r     s   z*MergedColumnParallelRepeatedLinear.forwardr:   r	   rZ   rB   c           	      C  sb   |j }t| jd | }| j| }|j|||}|| jk r*| j| }||||}|| d S r   )r   r   r   rG   r   r8  r   r   )	r}   r:   rZ   rB   r   r<   r;   r   r   r=   r=   r>   r     s   


z0MergedColumnParallelRepeatedLinear.weight_loaderr   )rk   rW   r5  r   r6  r   rm   rn   ro   rp   rq   rr   rs   rC   )r   r   rD   r   r:   r	   rZ   r   rB   rW   rD   r   r   r   r   r   rw   r   r   r   r=   r=   r~   r>   r4    s    
'r4  c                      s6   e Zd ZdZd fdd	ZdddZdddZ  ZS )ColumnParallelBatchedLineara7  Column parallel batched linear layer.

    TODO: quantization is not supported yet.
    Args:
        batch: batch dimension of the linear layer.
        input_size: input dimension of the linear layer.
        output_size: output dimension of the linear layer.
        dtype: Data type for the parameters.
    batchrW   rk   rl   r   torch.dtypec                   sR   t    t | _t | _tjtj	||| j ||ddd| _
t| j
d| j d S )Nr   F)requires_gradr   )rv   rw   r   r   r   r   r   r	   rx   r   weightsetattrr   )r}   r<  rk   rl   r   r~   r=   r>   rw   -  s   
z$ColumnParallelBatchedLinear.__init__inputr   rD   c                 C  s   t || jddS )Nr^   r_   )rx   bmmr?  	transpose)r}   rA  r=   r=   r>   r   9  s   z#ColumnParallelBatchedLinear.forwardr:   r	   rZ   rB   c                 C  s8   | j jd }| j| }|d||}|j| | d S )Nr_   r   )r?  rH   r   r   rG   r   )r}   r:   rZ   rB   r;   r   r=   r=   r>   r   <  s   
z)ColumnParallelBatchedLinear.weight_loader)r<  rW   rk   rW   rl   rW   r   r=  )rA  r   rD   r   r9  r:  r=   r=   r~   r>   r;  "  s
    

r;  )r:   r	   r@   rA   rB   rC   rD   rE   )Er   
__future__r   r   loggingtypingr   r   r   r   r   rx   r   torch.nn.parameterr	   r
   sglang.srt.distributedr   r   r   r   r   r   r   <sglang.srt.distributed.device_communicators.pynccl_allocatorr   sglang.srt.layers.dp_attentionr   r   sglang.srt.layers.parameterr   r   r   r   r   r   r   sglang.srt.layers.utilsr   sglang.srt.utilsr   r   r   r    r!   *sglang.srt.layers.quantization.base_configr"   r#   _is_hipr  	getLoggerr   r   r   r   r   r?   rO   r]   rh   Moduleri   r   r   r   r  r$  r4  r;  r=   r=   r=   r>   <module>   sV    $	$	

*_ A  o   A ]F