"""Wrapper around `transformers` models"""

import logging
import re
from typing import Iterable, Literal, Optional, Tuple, Union

import torch
from torch import nn
from transformers import AutoModel, PretrainedConfig, PreTrainedModel
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

from sglang.srt.distributed import divide, get_tensor_model_parallel_world_size
from sglang.srt.layers.linear import (
    ColumnParallelLinear,
    ReplicatedLinear,
    RowParallelLinear,
)
from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
)
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader

logger = logging.getLogger(__name__)


def maybe_prefix(prefix: str, name: str) -> str:
    """Add a prefix to a name if the prefix is non-empty.

    Args:
        prefix: The prefix to add. If empty, no prefix will be added.
        name: The name to potentially prefix.

    Returns:
        The string "prefix.name" if prefix was non-empty, otherwise just "name".
    """
    return name if not prefix else f"{prefix}.{name}"


def sglang_flash_attention_forward(
    # Transformers args
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor,
    # sglang kwargs
    forward_batch: ForwardBatch,
    scaling: float = None,
    attention_instances: list[RadixAttention] = None,
    **kwargs,
):
    self_attn: RadixAttention = attention_instances[module.layer_idx]
    if scaling is not None:
        self_attn.scaling = float(scaling)
    # Transformers hands in [batch, heads, tokens, head_dim]; RadixAttention
    # expects the flattened [tokens, heads * head_dim] layout (batch is 1 here).
    hidden = query.shape[-2]
    query, key, value = (x.transpose(1, 2) for x in (query, key, value))
    query, key, value = (x.reshape(hidden, -1) for x in (query, key, value))
    return self_attn.forward(query, key, value, forward_batch=forward_batch), None


# Route HF models built with attn_implementation="sglang" through RadixAttention.
ALL_ATTENTION_FUNCTIONS["sglang"] = sglang_flash_attention_forward


class HFColumnParallelLinear(ColumnParallelLinear):
    """Wrapper that drops the `output_bias` from the returned tuple."""

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return super().forward(input)[0]


class HFRowParallelLinear(RowParallelLinear):
    """Wrapper that drops the `output_bias` from the returned tuple."""

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return super().forward(input)[0]


def replace_linear_class(
    linear: nn.Linear,
    style: Literal["colwise", "rowwise"],
    quant_config: QuantizationConfig,
) -> Union[ColumnParallelLinear, RowParallelLinear]:
    """
    Replace nn.Linear with one of sglang's tensor parallel linear classes.

    Args:
        linear (nn.Linear): `nn.Linear` to be replaced.
        style (str): Tensor parallel style of the new linear, e.g. "colwise".
        quant_config (QuantizationConfig): Quantization config for the new linear.
    Returns:
        Union[ColumnParallelLinear, RowParallelLinear]: The new linear.
    """

    if not isinstance(style, str):
        raise ValueError(f"Unsupported parallel style type {type(style)}, expected str")

    sglang_linear_cls = {
        "colwise": ColumnParallelLinear,
        "rowwise": RowParallelLinear,
    }.get(style, ReplicatedLinear)

    class HFCompatibleLinear(sglang_linear_cls):
        """
        Wrapper class that removes `output_bias` from returned output.
        """

        @property
        def parent_cls(self) -> type:
            return sglang_linear_cls

        def forward(self, input: torch.Tensor) -> torch.Tensor:
            return super().forward(input)[0]

    return HFCompatibleLinear(
        input_size=linear.in_features,
        output_size=linear.out_features,
        bias=linear.bias is not None,
        quant_config=quant_config,
    )


class TransformersForCausalLM(nn.Module):

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        logger.info("Using Transformers backend.")

        self.quant_config = quant_config
        self.config = config
        self.vocab_size = config.vocab_size
        self.unpadded_vocab_size = config.vocab_size

        # Build the Hugging Face model; its attention layers dispatch to the
        # "sglang" backend registered above.
        self.model: PreTrainedModel = AutoModel.from_config(
            self.config,
            torch_dtype=torch.get_default_dtype(),
            attn_implementation="sglang",
            trust_remote_code=True,
        )

        # Apply the model's tensor parallel plan to its linear layers.
        tp_size = get_tensor_model_parallel_world_size()
        self.tensor_parallel(tp_size)

        # One RadixAttention instance per decoder layer.
        head_dim = (
            config.head_dim
            if hasattr(config, "head_dim")
            else config.hidden_size // config.num_attention_heads
        )
        self.attention_instances = [
            RadixAttention(
                num_heads=divide(config.num_attention_heads, tp_size),
                head_dim=head_dim,
                # Llama-style scale by default; if Transformers passes its own
                # scaling, it is applied in sglang_flash_attention_forward.
                scaling=head_dim**-0.5,
                num_kv_heads=divide(config.num_key_value_heads, tp_size),
                layer_id=i,
                quant_config=self.quant_config,
                prefix=f"{i}.attn",
            )
            for i in range(config.num_hidden_layers)
        ]

        # Replace the input embedding with a vocab-parallel one.
        self.replace_vocab_embed_class(self.model)

        # ForCausalLM modifications.
        self.lm_head = ParallelLMHead(
            config.vocab_size,
            config.hidden_size,
            quant_config=quant_config,
            prefix=maybe_prefix(prefix, "lm_head"),
        )
        if config.tie_word_embeddings:
            self.lm_head.weight = self.model.get_input_embeddings().weight

        self.logits_processor = LogitsProcessor(config)

    def log_replacement(self, name: str, old_module: nn.Module, new_module: nn.Module):
        logger.debug("%s: %s -> %s", name, old_module, new_module)

    def tensor_parallel(self, tp_size: int):
        """
        Apply the model's tensor parallelization plan.
        Currently only supports linear layers.
        """
        tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {}
        if not tp_plan and tp_size > 1:
            raise ValueError(
                f"{type(self.model)} does not support tensor parallel yet!"
            )

        def _tensor_parallel(module: nn.Module, prefix: str = ""):
            for child_name, child_module in module.named_children():
                qual_name = maybe_prefix(prefix, child_name)
                for pattern, style in tp_plan.items():
                    if re.match(pattern, qual_name) and isinstance(
                        child_module, nn.Linear
                    ):
                        new_module = replace_linear_class(
                            child_module, style, self.quant_config
                        )
                        setattr(module, child_name, new_module)
                        self.log_replacement(qual_name, child_module, new_module)
                else:
                    _tensor_parallel(child_module, prefix=qual_name)

        _tensor_parallel(self.model)

    def replace_vocab_embed_class(self, module: nn.Module):
        # Use sglang's vocab-parallel embedding in place of the HF one.
        new_module = VocabParallelEmbedding(
            self.vocab_size,
            self.config.hidden_size,
            org_num_embeddings=self.config.vocab_size,
            quant_config=None,
        )
        self.log_replacement(
            "input embedding", self.model.get_input_embeddings(), new_module
        )
        self.model.set_input_embeddings(new_module)

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        input_embeds: torch.Tensor = None,
        get_embedding: bool = False,
    ) -> LogitsProcessorOutput:
        assert get_embedding is False, "embedding is not supported yet"
        aux_hidden_states = None
        # HF models expect a batch dimension; sglang runs one flattened batch,
        # so add the dimension for the call and strip it from the output.
        hidden_states = self.model(
            input_ids[None, ...],
            use_cache=False,
            position_ids=positions[None, ...],
            forward_batch=forward_batch,
            attention_instances=self.attention_instances,
            return_dict=False,
        )[0][0, ...]

        return self.logits_processor(
            input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
        )

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in weights:
            # Checkpoint names usually lack the base-model prefix (e.g.
            # "model."), so retry with it prepended before skipping.
            if name not in params_dict:
                name = f"{self.model.base_model_prefix}.{name}"
            if name in params_dict:
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)


EntryClass = [TransformersForCausalLM]
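# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; not executed at import time). Everything
# except `maybe_prefix` requires an initialized sglang tensor-parallel runtime
# and a Hugging Face config, so the flow is shown as hedged comments rather
# than runnable module-level code.
#
#     >>> maybe_prefix("", "lm_head")
#     'lm_head'
#     >>> maybe_prefix("model.layers.0", "attn")
#     'model.layers.0.attn'
#
#     # Inside the sglang model loader, this module is discovered through
#     # `EntryClass`. The constructor then builds the HF model with
#     # attn_implementation="sglang", so every HF attention layer routes
#     # through sglang_flash_attention_forward -> RadixAttention.forward,
#     # while matching nn.Linear layers are swapped for tensor-parallel ones
#     # via replace_linear_class.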