o
    پirZ                     @   s  d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	m
Z
mZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZ d d
lmZ d dlm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z)m*Z* dZ+e& Z,e( Z-e) Z.e /e0Z1e+fde2de2de2fddZ3	 d-de2de2de2dee2 fddZ4	 d-de2de2de2de2dee2 f
ddZ5eG dd dZ6ej7de' e.d d!ej8d"e2d#e2d$e2d%e2d&e2deej8ej8f fd'd(Z9G d)d* d*ej:j;Z<G d+d, d,e<Z=dS ).    N)	dataclass)ListOptionalSequenceTuple)	ParameterUninitializedParameter)divideget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizeget_tp_group tensor_model_parallel_all_reduce)use_symmetric_memory)PackWeightMethod)get_attn_tp_context)attn_tp_all_reduceget_attention_tp_rankget_attention_tp_sizeis_allocation_symmetric)BasevLLMParameter)QuantizationConfigQuantizeMethodBase method_has_implemented_embedding)UnquantizedEmbeddingMethod)cpu_has_amx_supportget_compiler_backendis_cpuis_npuset_weight_attrs@   
vocab_sizepad_toreturnc                 C   s   | | d | | S )z&Pad the vocab size to the given value.    )r    r!   r$   r$   ^/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/vocab_parallel_embedding.pypad_vocab_size4   s   r&   per_partition_vocab_sizerankoffsetc                 C   s    ||  }||  }|| || fS Nr$   )r'   r(   r)   index_findex_lr$   r$   r%   )vocab_range_from_per_partition_vocab_size9   s   r-   global_vocab_size
world_sizec                 C   s   t | |}t|||dS )Nr)   )r	   r-   )r.   r(   r/   r)   r'   r$   r$   r%   "vocab_range_from_global_vocab_sizeA   s   
r1   c                   @   s   e Zd ZU dZeed< eed< eed< eed< eed< eed< eed< eed	< ed
efddZed
efddZed
efddZ	ed
efddZ
ed
efddZed
efddZed
efddZdd ZdS )"VocabParallelEmbeddingShardIndicesz2Indices for a shard of a vocab parallel embedding.padded_org_vocab_start_indexpadded_org_vocab_end_indexpadded_added_vocab_start_indexpadded_added_vocab_end_indexorg_vocab_start_indexorg_vocab_end_indexadded_vocab_start_indexadded_vocab_end_indexr"   c                 C      | j | j S r*   )r8   r7   selfr$   r$   r%   num_org_elementsX      z3VocabParallelEmbeddingShardIndices.num_org_elementsc                 C   r;   r*   )r:   r9   r<   r$   r$   r%   num_added_elements\   r?   z5VocabParallelEmbeddingShardIndices.num_added_elementsc                 C   r;   r*   )r4   r3   r<   r$   r$   r%   num_org_elements_padded`   r?   z:VocabParallelEmbeddingShardIndices.num_org_elements_paddedc                 C   r;   r*   )r6   r5   r<   r$   r$   r%   num_added_elements_paddedd   r?   z<VocabParallelEmbeddingShardIndices.num_added_elements_paddedc                 C   r;   r*   )rA   r>   r<   r$   r$   r%   num_org_vocab_paddingh   r?   z8VocabParallelEmbeddingShardIndices.num_org_vocab_paddingc                 C   r;   r*   )rB   r@   r<   r$   r$   r%   num_added_vocab_paddingl   r?   z:VocabParallelEmbeddingShardIndices.num_added_vocab_paddingc                 C   s   | j | j S r*   )rA   rB   r<   r$   r$   r%   num_elements_paddedp   r?   z6VocabParallelEmbeddingShardIndices.num_elements_paddedc                 C   s   | j | jksJ | j| jksJ | j| jksJ | j| jks J | j| j ks(J | j| jks0J | j| jks8J | j| jks@J | j| j	ksHJ | j
| jksPJ d S r*   )r3   r4   r5   r6   r7   r8   r9   r:   r>   rA   r@   rB   r<   r$   r$   r%   __post_init__t   s   z0VocabParallelEmbeddingShardIndices.__post_init__N)__name__
__module____qualname____doc__int__annotations__propertyr>   r@   rA   rB   rC   rD   rE   rF   r$   r$   r$   r%   r2   J   s2   
 r2   T)dynamicbackenddisableinput_r7   r8   rC   r9   r:   c                 C   s^   | |k| |k @ }| |k| |k @ }|||  | }|| ||  }	||B }
|
| |	  } | |
 fS r*   r$   )rQ   r7   r8   rC   r9   r:   org_vocab_maskadded_vocab_maskadded_offsetvalid_offset
vocab_maskr$   r$   r%   get_masked_input_and_mask   s    
rW   c                       s   e Zd ZdZddedddddddeded	eej d
ee dedee	 de
dededef fddZedededededededefddZdeee  fddZdedejfd d!Zd"d# Zde
fd$d%Z  ZS )&VocabParallelEmbeddinga  Embedding parallelized in the vocabulary dimension.

    Adapted from torch.nn.Embedding, note that we pad the vocabulary size to
    make sure it is divisible by the number of model parallel GPUs.

    In order to support various loading methods, we ensure that LoRA-added
    embeddings are always at the end of TP-sharded tensors. In other words,
    we shard base embeddings and LoRA embeddings separately (both padded),
    and place them in the same tensor.
    In this example, we will have the original vocab size = 1010,
    added vocab size = 16 and padding to 64. Therefore, the total
    vocab size with padding will be 1088 (because we first pad 1010 to
    1024, add 16, and then pad to 1088).
    Therefore, the tensor format looks like the following:
    TP1, rank 0 (no sharding):
                            |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >|
    corresponding token_id: |  0  |  1  | ... | 1009 |  -1  | ... |  -1  | 1010 | ... | 1015 |  -1  | ... |  -1  |
                     index: |  0  |  1  | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 |

    TP2, rank 0:
                            |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >|
    corresponding token_id: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 1000 | ... | 1015 |  -1  | ... |  -1 |
                     index: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 512  | ... | 527  |  520 | ... | 543 |
    TP2, rank 1:
                            |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >|
    corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1  | ...  | -1  |  -1  | ... |  -1  | -1  | ... |   -1 |
                     index: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 512  | ... | 519  | 520 | ... |  543 |

    Args:
        num_embeddings: vocabulary size.
        embedding_dim: size of hidden state.
        params_dtype: type of the parameters.
        org_num_embeddings: original vocabulary size (without LoRA).
        padding_size: padding size for the vocabulary.
        quant_config: quant config for the layer
        prefix: full name of the layer in the state dict
    N TF)params_dtypeorg_num_embeddingspadding_sizequant_configprefix	enable_tpuse_attn_tp_groupuse_presharded_weightsnum_embeddingsembedding_dimrZ   r[   r\   r]   r^   r_   r`   ra   c             	      s  t    || _|| _|	| _| jr#|	rt }t | _nt }t	 | _n|	du s)J d}d| _|| _
|p4|| _trHt| j|| j dkrH|| j9 }|| _|| j }|
| _|
r]|dks]J dt| j| j| _t| j| | j| _| j| jkswJ | | j| j| j
| j|| j| _|| _d }|d ur|j| |d}|d u rt }t| jtu }tt|}|r|stdt|j d|| _|d u rt }| j
| j | _ t!| j| j| _"| jj#| j"ksJ | jj$| jj% | _&| jj'| jj( | _)| jj*| | j| j"g| j| j|| j+d d S )	NFr   r#   z.Lora is not supported with presharded weights.)r^   z
The class zG must implement the 'embedding' method, see UnquantizedEmbeddingMethod.)rZ   weight_loader),super__init__r]   r_   r`   r   r   tp_sizer
   r   rb   org_vocab_size_is_cpur&   r\   ra   org_vocab_size_paddednum_embeddings_padded_get_indicesshard_indicesrc   get_quant_methodr   type	__class__rX   r   NotImplementedErrorrG   quant_methodtorchget_default_dtypenum_added_embeddingsr	   num_embeddings_per_partitionrE   r8   r7    num_org_embeddings_per_partitionr:   r9   "num_added_embeddings_per_partitioncreate_weightsrd   )r=   rb   rc   rZ   r[   r\   r]   r^   r_   r`   ra   tp_rankru   rr   is_embedding_layer!quant_method_implements_embeddingrp   r$   r%   rf      s   







zVocabParallelEmbedding.__init__vocab_size_paddedrj   r    rh   rz   rg   r"   c              	   C   sj   || }t |||\}}	t ||||d\}
}t||}t|	|}t|
|}t||}t||	|
|||||S )zGet start and end indices for vocab parallel embedding, following the
        layout outlined in the class docstring, based on the given tp_rank and
        tp_size.r0   )r1   minr2   )clsr~   rj   r    rh   rz   rg   num_added_embeddings_paddedr3   r4   r5   r6   r7   r8   r9   r:   r$   r$   r%   rl   =  s*   




z#VocabParallelEmbedding._get_indicesc           	   	   C   s  | j dk rdS g }g }g }t| j D ]g}| | j| j| j| j|| j }| j| }| j|d  }|t|||j	  |t||j	 ||j
  |t||j
 ||j
 |j  |t||j
 |j ||j
 |j  ||j
 |j |ksyJ q|| | }t|| jksJ |S )a   Get a mapping that can be used to reindex the gathered
        logits for sampling.

        During sampling, we gather logits from all ranks. The relationship
        of index->token_id will follow the same format as outlined in the class
        docstring. However, after the gather, we want to reindex the final
        logits tensor to map index->token_id one-to-one (the index is always
        equal the token_id it corresponds to). The indices returned by this
        method allow us to do that.
           Nr#   )rg   rangerl   rk   rj   rb   rh   rv   extendr>   rA   r@   rB   len)	r=   base_embeddingsadded_embeddingspaddingrz   rm   range_start	range_endretr$   r$   r%   get_sharded_to_full_mappingc  sv   


z2VocabParallelEmbedding.get_sharded_to_full_mappingparamloaded_weightc           	      C   s  t |dd }t |dd }t |dd r|j| | |_d S t|tr@t|j}|d ur6|| | j	 ||< |j
t||jd |d u rU|jj|jksMJ |j| d S | jj}| jj| }|d ur||krt|tro|jn|j}|j| | j|j ksJ || }|| }n"|j| | j| jr| j	nd ksJ d| jd| jd|j| | js||||}|d |jd	  j| ||jd	 d  jd	 d S )
N
output_dim
packed_dimis_gguf_weight_typedtyper#   zself.org_vocab_size=z self.use_presharded_weights=z! loaded_weight.shape[output_dim]=r   )getattrdatacopy_itemweight_type
isinstancer   listshaperg   materializetupler   rm   r7   r8   r   packed_factorrh   ra   narrowfill_)	r=   r   r   r   r   r   	start_idx
shard_sizer   r$   r$   r%   rd     sL   




z$VocabParallelEmbedding.weight_loaderc                 C   s   | j dkrt|| jj| jj| jj| jj| jj\}}n|}tt	 t
  d | j| | }W d    n1 s:w   Y  | j dkr^||dd t js^| jrZt|}|S t|}|S )Nr#   )disabledr   )rg   rW   rm   r7   r8   rC   r9   r:   r   r   r   rr   	embeddinglongmasked_fill_	unsqueezer   input_scatteredr`   r   r   )r=   rQ   masked_input
input_maskoutput_parallelr$   r$   r%   forward  s.   

	

zVocabParallelEmbedding.forwardc                 C   sV   d| j  }|d| j 7 }|d| j 7 }|d| j 7 }| jr)|d| j 7 }|S )Nznum_embeddings=z, embedding_dim=z, org_vocab_size=z, num_embeddings_padded=z
, tp_size=)rv   rc   rh   rk   r_   rg   )r=   sr$   r$   r%   
extra_repr  s   z!VocabParallelEmbedding.extra_repr)rG   rH   rI   rJ   DEFAULT_VOCAB_PADDING_SIZErK   r   rs   r   r   strboolrf   classmethodr2   rl   r   r   r   Tensorrd   r   r   __classcell__r$   r$   r}   r%   rX      sf    +	
u%A3rX   c                       s   e Zd ZdZdddeddddddededed	eej	 d
ee dedee
 dededef fddZdefddZdd Z  ZS )ParallelLMHeada  Parallelized LM head.

    Output logits weight matrices used in the Sampler. The weight and bias
    tensors are padded to make sure they are divisible by the number of
    model parallel GPUs.

    Args:
        num_embeddings: vocabulary size.
        embedding_dim: size of hidden state.
        bias: whether to use bias.
        params_dtype: type of the parameters.
        org_num_embeddings: original vocabulary size (without LoRA).
        padding_size: padding size for the vocabulary.
    FNrY   )biasrZ   r[   r\   r]   r^   r`   ra   rb   rc   r   rZ   r[   r\   r]   r^   r`   ra   c                   s   t  j||||||||	|
d	 || _tr,tr,t| dr,| jjtj	tj
fv r,tdgd| _|rEttj| j|d| _t| jd| jd d S | dd  d S )N)rZ   r[   r\   r]   r^   r`   ra   weight)weight_namesr   r   )r   rd   r   )re   rf   r]   ri   _is_cpu_amx_availablehasattrr   r   rs   bfloat16float16r   rr   r   emptyrv   r   r   rd   register_parameter)r=   rb   rc   r   rZ   r[   r\   r]   r^   r`   ra   r}   r$   r%   rf     s:   zParallelLMHead.__init__embed_tokensc                 C   s$   | j r| j  dkr|S |j| _| S )z%Tie the weights with word embeddings.gguf)r]   get_namer   )r=   r   r$   r$   r%   tie_weightsA  s   zParallelLMHead.tie_weightsc                 C   s
   ~t d)Nz/LMHead's weights should be used in the sampler.)RuntimeError)r=   rQ   r$   r$   r%   r   J  s   zParallelLMHead.forward)rG   rH   rI   rJ   r   rK   r   r   rs   r   r   r   rf   rX   r   r   r   r$   r$   r}   r%   r      sB    	
1	r   )r   )>loggingdataclassesr   typingr   r   r   r   rs   torch.nn.parameterr   r   sglang.srt.distributedr	   r
   r   r   r   <sglang.srt.distributed.device_communicators.pynccl_allocatorr   sglang.srt.layers.amx_utilsr   sglang.srt.layers.communicatorr   sglang.srt.layers.dp_attentionr   r   r   r   sglang.srt.layers.parameterr   *sglang.srt.layers.quantization.base_configr   r   r   &sglang.srt.layers.quantization.unquantr   sglang.srt.utilsr   r   r   r   r   r   r   ri   _is_npu	getLoggerrG   loggerrK   r&   r-   r1   r2   compiler   rW   nnModulerX   r   r$   r$   r$   r%   <module>   s   

	
	:  a