o
    پiL                     @   sr  d Z ddlZddlmZ ddlmZmZmZ ddlZddl	m
Z
 ddlmZ ddlmZ g dZeeZe Zd	ejd
ee fddZdejdejfddZG dd de
ZG dd deZG dd deZG dd deeZG dd deeZG dd deZG dd deeZG dd  d eZ G d!d" d"eZ!G d#d$ d$eZ"d%ed&ed'ed
efd(d)Z#d*d+ Z$d,d- Z%dS ).zdAdapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/parameter.py    N)Fraction)CallableOptionalUnion)	Parameter)pad_or_narrow_weight)is_cpu)	BasevLLMParameterPackedvLLMParameterPerTensorScaleParameterModelWeightParameterChannelQuantScaleParameterGroupQuantScaleParameterBlockQuantScaleParameterPackedColumnParameterRowvLLMParameterdtypereturnc                 C   sP   | t jt jt jt jfv rdS | t jt jfv rdS | t jkrdS | t jkr&dS d S )Nr            )	torchfloat8_e4m3fnfloat8_e4m3fnuzfloat8_e5m2float8_e5m2fnuzfloat16bfloat16float32float64)r    r    O/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/parameter.py_dtype_rank   s   

r"   targetloaded_weightc                 C   s   | j |j ksJ d| j d|j | j|jkr| | dS t| j}t|j}|du s1|du r=td| jd|j||k rMtd| jd|j| | dS )z
    Copy `loaded_weight` into `target` while forbidding downcasts.
    bf16/fp16 share the same rank, and all fp8 variants share the same rank.
    ztarget.shape=, loaded_weight.shape=Nz.Unsupported copy between dtypes: target.dtype=z, loaded_weight.dtype=z&Downcasting not allowed: target.dtype=)shaper   copy_r"   
ValueError)r#   r$   target_rankloaded_rankr    r    r!   copy_with_check/   s"   


r+   c                       s   e Zd ZdZdejf fddZdejdefddZe	dd	 Z
d
ejfddZd
ejfddZd
ejfddZd
ejfddZd
ejfddZ  ZS )r	   z
    Base parameter for vLLM linear layers. Extends the torch.nn.parameter
    by taking in a linear weight loader. Will copy the loaded weight
    into the parameter when the provided weight loader is called.
    datac                    s   t  j| |ddS )NF)r,   requires_grad)super__new__)clsr,   kwargs	__class__r    r!   r/   S   s   zBasevLLMParameter.__new__weight_loaderc                 C   s
   || _ dS )z
        Initialize the BasevLLMParameter

        :param data: torch tensor with the parameter data
        :param weight_loader: weight loader callable

        :returns: a torch.nn.parameter
        N_weight_loader)selfr,   r4   r    r    r!   __init__W   s   

zBasevLLMParameter.__init__c                 C      | j S Nr5   r7   r    r    r!   r4   c      zBasevLLMParameter.weight_loaderr$   c                 C   s"   | j j|jks	J | j | d S r:   )r,   r&   r'   r7   r$   r    r    r!   _assert_and_loadg   s   z"BasevLLMParameter._assert_and_loadc                 C      |  | d S r:   r>   r=   r    r    r!   load_column_parallel_weightk      z-BasevLLMParameter.load_column_parallel_weightc                 C   r?   r:   r@   r=   r    r    r!   load_row_parallel_weightn   rB   z*BasevLLMParameter.load_row_parallel_weightc                 K   r?   r:   r@   r7   r$   r1   r    r    r!   load_merged_column_weightq   rB   z+BasevLLMParameter.load_merged_column_weightc                 K   r?   r:   r@   rD   r    r    r!   load_qkv_weightt   rB   z!BasevLLMParameter.load_qkv_weight)__name__
__module____qualname____doc__r   Tensorr/   r   r8   propertyr4   r>   rA   rC   rE   rF   __classcell__r    r    r2   r!   r	   L   s    
r	   c                       sz   e Zd ZdZdef fddZedd Z	ddej	d	ed
e
fddZdej	fddZ	ddej	d	ed
e
fddZ  ZS )_ColumnvLLMParametera  
    Private class defining weight loading functionality
    (load_merged_column_weight, load_qkv_weight)
    for parameters being loaded into linear layers with column
    parallelism. This includes QKV and MLP layers which are
    not already fused on disk. Requires an output dimension
    to be defined. Called within the weight loader of
    each of the column parallel linear layers.
    
output_dimc                       || _ t jdi | d S Nr    )_output_dimr.   r8   )r7   rO   r1   r2   r    r!   r8         z_ColumnvLLMParameter.__init__c                 C   r9   r:   )rR   r;   r    r    r!   rO      r<   z_ColumnvLLMParameter.output_dimFr$   tp_rankuse_presharded_weightsc                 C   s   |s9| j j| j }ddlm} tr/|| j |d|| | j|\}}|j|jks(J || d S || j|| |}t| j | d S )Nr   %narrow_padded_param_and_loaded_weight)	r,   r&   rO   $sglang.srt.model_loader.weight_utilsrW   _is_cpur'   narrowr+   )r7   r$   rT   rU   
shard_sizerW   
param_datar    r    r!   rA      s&   
z0_ColumnvLLMParameter.load_column_parallel_weightc                 K   s   | d}| d}| d}| d}t| ttfr*| j| jkr*| j||d\}}| j}|| j||}ddl	m
} trN|||d|| | j|| \}}n#|sq|| }	|	| }
|
|j| j krit|| j|	|}n|| j|	|}|j|jksyJ || d S )Nshard_offsetr[   rT   rU   r]   r[   r   rV   )get
isinstancer   r
   
packed_dimrO    adjust_shard_indexes_for_packingr,   rZ   rX   rW   rY   r&   r   r'   )r7   r$   r1   r]   r[   rT   rU   r\   rW   	start_idxend_idxr    r    r!   rE      sF   







z._ColumnvLLMParameter.load_merged_column_weightc                 K   s   | d}| d}| d}| d}t| ttfr*| j| jkr*| j||d\}}| j}	|dkr3|n|| }|	| j||}	t	rXddl
m}
 |
|	|d|| | j|| \}	}n|sd|| j|| |}|	j|jksvJ d	|	jd
|j|	| d S )Nr]   r[   shard_id	num_headsr^   qr   rV   zparam_data.shape=r%   )r_   r`   r   r
   rO   ra   rb   r,   rZ   rY   rX   rW   r&   r'   )r7   r$   rT   rU   r1   r]   r[   re   rf   r\   rW   r    r    r!   rF      s@   






z$_ColumnvLLMParameter.load_qkv_weightF)rG   rH   rI   rJ   intr8   rL   rO   r   rK   boolrA   rE   rF   rM   r    r    r2   r!   rN   x   s,    


 5rN   c                       sN   e Zd ZdZdef fddZedd Z	ddej	d	ed
e
fddZ  ZS )r   z
    Parameter class defining weight_loading functionality
    (load_row_parallel_weight) for parameters being loaded
    into linear layers with row parallel functionality.
    Requires an input_dim to be defined.
    	input_dimc                    rP   rQ   )
_input_dimr.   r8   )r7   rk   r1   r2   r    r!   r8     rS   zRowvLLMParameter.__init__c                 C   r9   r:   )rl   r;   r    r    r!   rk     r<   zRowvLLMParameter.input_dimFr$   rT   rU   c           	      C   s   |sP| j j| j }ddlm} tr/|| j |d|| | j|\}}|j|jks(J || d S || }|| }||j| j krHt|| j||}n|| j||}t	|jdkr\|
d}| j j|jkseJ | j | d S )Nr   rV   r   )r,   r&   rk   rX   rW   rY   r'   r   rZ   lenreshape)	r7   r$   rT   rU   r[   rW   r\   rc   rd   r    r    r!   rC     s8   	


z)RowvLLMParameter.load_row_parallel_weightrh   )rG   rH   rI   rJ   ri   r8   rL   rk   r   rK   rj   rC   rM   r    r    r2   r!   r     s    
r   c                   @      e Zd ZdZdS )r   z]
    Parameter class for linear layer weights. Uses both column and
    row parallelism.
    NrG   rH   rI   rJ   r    r    r    r!   r   N      r   c                   @   ro   )r   z
    Parameter class for weight scales loaded for weights with
    grouped quantization. Uses both column and row parallelism.
    Nrp   r    r    r    r!   r   W  rq   r   c                   @   ro   )r   z
    Parameter class for weight scales loaded for weights with
    channel-wise quantization. Equivalent to _ColumnvLLMParameter.
    Nrp   r    r    r    r!   r   `  rq   r   c                   @   ro   )r   z
    Parameter class for weight scales loaded for weights with
    block-wise quantization. Uses both column and row parallelism.
    Nrp   r    r    r    r!   r   i  rq   r   c                       s~   e Zd ZdZ fddZdeeef defddZ fdd	Z	d
d Z
dd Z fddZdejdeeef fddZ  ZS )r   a  
    Parameter class for scales where the number of scales is
    equivalent to the number of logical matrices in fused linear
    layers (e.g. for QKV, there are 3 scales loaded from disk).
    This is relevant to weights with per-tensor quantization.
    Adds functionality to map the scalers to a shard during
    weight loading.

    Note: additional parameter manipulation may be handled
    for each quantization config specifically, within
    process_weights_after_loading
    c                    s$   dddd| _ t jdi | d S )Nr   r   r   )rg   kvr    )qkv_idxsr.   r8   )r7   r1   r2   r    r!   r8     s   z PerTensorScaleParameter.__init__re   r   c                 C   s4   t |tr|S t |tsJ || jv sJ | j| S r:   )r`   ri   strrt   )r7   re   r    r    r!   _shard_id_as_int  s
   

z(PerTensorScaleParameter._shard_id_as_intc                    .   | dd  | dd  t j|i | d S NrT   rU   popr.   rC   r7   argsr1   r2   r    r!   rC        z0PerTensorScaleParameter.load_row_parallel_weightc                 O      | j |i | d S r:   _load_into_shard_idr{   r    r    r!   rE        z1PerTensorScaleParameter.load_merged_column_weightc                 O   r~   r:   r   r{   r    r    r!   rF     r   z'PerTensorScaleParameter.load_qkv_weightc                    rw   rx   ry   r{   r2   r    r!   rA     r}   z3PerTensorScaleParameter.load_column_parallel_weightr$   c                 K   s^   | j }| |}t|jdkr|jd dksJ |d }|| }|j|jks(J || dS )zU
        Slice the parameter data based on the shard id for
        loading.
        r   r   N)r,   rv   rm   r&   r'   )r7   r$   re   r1   r\   r    r    r!   r     s   
z+PerTensorScaleParameter._load_into_shard_id)rG   rH   rI   rJ   r8   r   ru   ri   rv   rC   rE   rF   rA   r   rK   r   rM   r    r    r2   r!   r   r  s    
r   c                       j   e Zd ZdZ	ddeeef dedee f fddZe	dd	 Z
e	d
d Ze	dd Zdd Z  ZS )r   z
    Parameter for model parameters which are packed on disk
    and support column parallelism only. See PackedvLLMParameter
    for more details on the packed properties.
    Npacked_factorra   marlin_tile_sizec                    (   || _ || _|| _t jdi | d S rQ   _packed_factor_packed_dim_marlin_tile_sizer.   r8   r7   r   ra   r   r1   r2   r    r!   r8        zPackedColumnParameter.__init__c                 C   r9   r:   r   r;   r    r    r!   ra     r<   z PackedColumnParameter.packed_dimc                 C   r9   r:   r   r;   r    r    r!   r     r<   z#PackedColumnParameter.packed_factorc                 C   r9   r:   r   r;   r    r    r!   r     r<   z&PackedColumnParameter.marlin_tile_sizec                 C      t ||| j| jdS Nr[   r]   r   r   !_adjust_shard_indexes_for_packingr   r   r7   r[   r]   r    r    r!   rb        z6PackedColumnParameter.adjust_shard_indexes_for_packingr:   rG   rH   rI   rJ   r   ri   r   r   r8   rL   ra   r   r   rb   rM   r    r    r2   r!   r     s"    




r   c                       r   )r
   a  
    Parameter for model weights which are packed on disk.
    Example: GPTQ Marlin weights are int4 or int8, packed into int32.
    Extends the ModelWeightParameter to take in the
    packed factor, the packed dimension, and optionally, marlin
    tile size for marlin kernels. Adjusts the shard_size and
    shard_offset for fused linear layers model weight loading
    by accounting for packing and optionally, marlin tile size.
    Nr   ra   r   c                    r   rQ   r   r   r2   r    r!   r8     r   zPackedvLLMParameter.__init__c                 C   r9   r:   r   r;   r    r    r!   ra     r<   zPackedvLLMParameter.packed_dimc                 C   r9   r:   r   r;   r    r    r!   r     r<   z!PackedvLLMParameter.packed_factorc                 C   r9   r:   r   r;   r    r    r!   r     r<   z$PackedvLLMParameter.marlin_tile_sizec                 C   r   r   r   r   r    r    r!   rb     r   z4PackedvLLMParameter.adjust_shard_indexes_for_packingr:   r   r    r    r2   r!   r
     s"    



r
   paramrk   rO   c                    s6  t | dd t | dd du sdu r| j dksJ d du r1dus+J dd d  du rC dus=J d d d  fdd	t| j D }||  || d
|v rtt| d
rp| j||d
  kstJ d| jj| | _t| dr|| _t| dr|| _	d
|v rt| dr|d
 | _
| S )a
  
    Permute a parameter's layout to the specified input and output dimensions,
    useful for forcing the parameter into a known layout, for example, if I need
    a packed (quantized) weight matrix to be in the layout
        {input_dim = 0, output_dim = 1, packed_dim = 0}
    then I can call:
        permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
    to ensure x is in the correct layout (permuting it to the correct layout if
    required, asserting if it cannot get it to the correct layout)
    rk   NrO   r   z`permute_param_layout_ only supports 2D parameters when either input_dim or output_dim is not setz&either input or output dim must be setr   c                    s   g | ]
}| fvr|qS r    r    ).0icurr_input_dimcurr_output_dimr    r!   
<listcomp>-  s    z)permute_param_layout_.<locals>.<listcomp>ra   z9permute_param_layout_ currently doesn't support repackingrl   rR   r   )getattrr,   dimrangeinserthasattrra   permuterl   rR   r   )r   rk   rO   r1   permr    r   r!   permute_param_layout_
  s>   


r   c                 C   s   | | || fS r:   r    r[   r]   r   r    r    r!    _adjust_shard_indexes_for_marlinD  s   r   c                 C   s.   | | } || }|d urt | ||dS | |fS )Nr   )r   r   r    r    r!   r   H  s   r   )&rJ   logging	fractionsr   typingr   r   r   r   torch.nnr   sglang.srt.layers.utilsr   sglang.srt.utilsr   __all__	getLoggerrG   loggerrY   r   ri   r"   rK   r+   r	   rN   r   r   r   r   r   r   r   r
   r   r   r   r    r    r    r!   <module>   sF    
, ?				D(,
: