# sglang/srt/models/torch_native_llama.py
"""
Inference-only LLaMA model compatible with HuggingFace weights.

This model supports tensor parallelism (TP) using the PyTorch tensor parallel package.
Reference: https://pytorch.org/docs/stable/distributed.tensor.parallel.html

Here is a quick example to enable TP:
```python
from sglang.srt.layers.model_parallel import tensor_parallel

device_mesh = torch.distributed.init_device_mesh("cuda", (tp_size,))
tensor_parallel(model, device_mesh)
```
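
Under TP, the fused projections in this file are sharded at weight-loading time:
each fused linear layer installs a custom `weight_loader` that narrows the
checkpoint tensor down to the slice owned by the current rank. Roughly (a
simplified sketch of the loaders defined below, not their exact bodies):
```python
# shard_offset/shard_size describe one logical shard (gate, up, q, k, or v).
loaded_weight = loaded_weight.narrow(0, tp_rank * shard_size, shard_size)
param.data.narrow(0, shard_offset, shard_size).copy_(loaded_weight)
```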

An end-to-end example can be found in `python/sglang/bench_one_batch.py`.
You can run it with the following command:
```bash
$ python3 -m sglang.bench_one_batch --correct \
    --model meta-llama/Meta-Llama-3-8B \
    --json-model-override-args '{"architectures": ["TorchNativeLlamaForCausalLM"]}' \
    --tensor-parallel-size 2 \
    --disable-cuda-graph
```
We will enable CUDA Graph support soon.
    N)AnyDictIterableOptionalTuple)nn)	Parameter)LlamaConfig)get_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)
SiluAndMul)RMSNorm)LogitsProcessorLogitsProcessorOutput)QuantizationConfig)RadixAttention)get_rope)ParallelLMHeadVocabParallelEmbedding)ForwardBatch)default_weight_loader)
add_prefixtp_sizetp_rankparamloaded_weightloaded_shard_idc                 C   s   i }d}t | jD ]\}}|t }||f||< ||7 }q	||jd kr.|jdd| |_|t| jk s7J |j}|| \}	}
|d|	|
}|dt|
 |
}|j|jksXJ |	| d S )Nr   )
	enumerateoutput_sizesr   shapedatanarrowclonelenr   copy_)selfr   r   r   gate_up_offsetscurrent_shard_offsetioutput_size
param_datashard_offset
shard_size r-   X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/torch_native_llama.pygate_up_proj_weight_loaderI   s   
r/   c                       sR   e Zd ZdddZ		ddededed	ee d
eddf fddZdd Z	  Z
S )LlamaMLPColwise_ShardedRowwise)gate_up_proj	down_projN hidden_sizeintermediate_size
hidden_actquant_configprefixreturnc                    s   t    tjj||d dd| _|gd | j_tt	| j| j_
| jj
| jj_
tjj||dd| _|dkr>td| dt | _d S )N   FbiassiluzUnsupported activation: z!. Only silu is supported for now.)super__init__torchr   Linearr3   r   types
MethodTyper/   weight_loaderweightr4   
ValueErrorr   act_fn)r%   r6   r7   r8   r9   r:   	__class__r-   r.   rA   l   s"   

zLlamaMLP.__init__c                 C   s"   |  |}| |}| |}|S N)r3   rI   r4   )r%   xgate_upr-   r-   r.   forward   s   


zLlamaMLP.forward)Nr5   )__name__
__module____qualname___tp_planintstrr   r   rA   rO   __classcell__r-   r-   rJ   r.   r0   f   s(    
r0   c                 C   s   | j t }| jt }d|| j f|| j || j f|| | j || j fd}|d d |d d  }||jd krE|jdd| |_|| \}}	|j}
|
d||	}
|dt|	 |	}|
j|jksfJ |
	| d S )Nr   )qkvrY      )
	num_headsr   num_kv_heads	head_sizer   r    r!   r"   r   r$   )r%   r   r   r   r[   r\   qkv_offsets
total_sizer+   r,   r*   r-   r-   r.   qkv_proj_weight_loader   s"   

r`   c                       s   e Zd ZdddZ								dd
ededededededeee	e
f  dededee de	ddf fddZdejdejdedejfddZ  ZS )LlamaAttentionr1   r2   )qkv_projo_projr   '  NT    r5   configr6   r[   r\   layer_id
rope_thetarope_scalingrope_is_neox_stylemax_position_embeddingsr9   r:   r;   c                    s  t    || _|| _| jt dksJ | jt | _|| _| jtkr,| jt dks+J n	t| j dks5J td| jt | _t	|d| j| j | _
| j| j
 | _| j| j
 | _| j
d | _|| _|	| _tjj|| jd| j  | j
 dd| _| j
| j_| j| j_| j| j_tt| j| j_| jj| jj_d| jj_tjj| j| j
 |dd| _t| j
| j
|	|||d| _t| j| j
| j| j|d	| _d S )
Nr   rZ   head_dimg      r<   Fr=   )
rotary_dimmax_positionbaseri   is_neox_style)r\   rg   ) r@   rA   r6   total_num_headsr   r[   total_num_kv_headsmaxr\   getattrrl   q_sizekv_sizescalingrh   rk   rB   r   rC   rb   r]   rD   rE   r`   rF   rG   
output_dimrc   r   
rotary_embr   attn)r%   rf   r6   r[   r\   rg   rh   ri   rj   rk   r9   r:   rJ   r-   r.   rA      sf   






zLlamaAttention.__init__	positionshidden_statesforward_batchc           
      C   sZ   |  |}|j| j| j| jgdd\}}}| |||\}}| ||||}| |}	|	S )N)dim)rb   splitru   rv   ry   rz   rc   )
r%   r{   r|   r}   qkvrW   rX   rY   attn_outputoutputr-   r-   r.   rO      s   
 
zLlamaAttention.forward)r   rd   NTre   Nr5   )rP   rQ   rR   rS   r	   rT   floatr   r   rU   r   boolr   rA   rB   Tensorr   rO   rV   r-   r-   rJ   r.   ra      sZ    	
Ira   c                       sr   e Zd Z			ddededee deddf
 fd	d
Zde	j
de	j
dedee	j
 dee	j
e	j
f f
ddZ  ZS )LlamaDecoderLayerr   Nr5   rf   rg   r9   r:   r;   c           	         s   t    |j| _t|dd}t|dd }|d ur$t|dd r$|j|d< t|dd}t|dd}t|| j|j|j||||||td	|d
| _	t
| j|j|j|td|d| _t|j|jd| _t|j|jd| _d S )Nrh   rd   ri    original_max_position_embeddingsrj   Trk   re   	self_attn)rf   r6   r[   r\   rg   rh   ri   rj   rk   r9   r:   mlp)r6   r7   r8   r9   r:   eps)r@   rA   r6   rt   r   ra   num_attention_headsnum_key_value_headsr   r   r0   r7   r8   r   r   rms_norm_epsinput_layernormpost_attention_layernorm)	r%   rf   rg   r9   r:   rh   ri   rj   rk   rJ   r-   r.   rA     sF   

zLlamaDecoderLayer.__init__r{   r|   r}   residualc                 C   sZ   |d u r|}|  |}n|  ||\}}| j|||d}| ||\}}| |}||fS )N)r{   r|   r}   )r   r   r   r   )r%   r{   r|   r}   r   r-   r-   r.   rO   8  s   
zLlamaDecoderLayer.forward)r   Nr5   )rP   rQ   rR   r	   rT   r   r   rU   rA   rB   r   r   r   rO   rV   r-   r-   rJ   r.   r     s4    ,r   c                       s\   e Zd Z	ddedee ddf fddZ	ddejdejd	e	d
ejdejf
ddZ
  ZS )
LlamaModelNrf   r9   r;   c                    s   t    td u rt atd u rt a | _ j| _ j	| _	t
 j	 j| _t fddt jD | _t j jd| _d S )Nc                    s"   g | ]}t  |d | dqS )zmodel.layers.)r9   r:   )r   ).0r(   rf   r9   r-   r.   
<listcomp>g  s    z'LlamaModel.__init__.<locals>.<listcomp>r   )r@   rA   r   r   r   r
   rf   pad_token_idpadding_idx
vocab_sizer   r6   embed_tokensr   
ModuleListrangenum_hidden_layerslayersr   r   normr%   rf   r9   rJ   r   r.   rA   R  s$   
zLlamaModel.__init__	input_idsr{   r}   input_embedsc           
      C   s`   |d u r
|  |}n|}d }tt| jD ]}| j| }|||||\}}q| ||\}}	|S rL   )r   r   r#   r   r   )
r%   r   r{   r}   r   r|   r   r(   layer_r-   r-   r.   rO   p  s   

zLlamaModel.forwardrL   )rP   rQ   rR   r	   r   r   rA   rB   r   r   rO   rV   r-   r-   rJ   r.   r   Q  s*    #r   c                       s   e Zd Z	ddedee ddf fddZe 	ddej	dej	d	e
d
ej	def
ddZdd Zdd Zdedeeeej	f  fddZdeeeej	f  fddZ  ZS )TorchNativeLlamaForCausalLMNrf   r9   r;   c                    sf   t    || _|| _d| _t||d| _| jjr| jj| _	nt
|j|j| _	t|| _dtjj_d S )NT)r9   ATEN)r@   rA   rf   r9   supports_torch_tpr   modeltie_word_embeddingsr   lm_headr   r   r6   r   logits_processorrB   	_inductormax_autotune_gemm_backendsr   rJ   r-   r.   rA     s   

z$TorchNativeLlamaForCausalLM.__init__r   r{   r}   r   c                 C   s"   |  ||||}| ||| j|S rL   )r   r   r   )r%   r   r{   r}   r   r|   r-   r-   r.   rO     s   
z#TorchNativeLlamaForCausalLM.forwardc                 C   sZ   g d}|D ]\}}}}||v r!| ||d td  |f  S q|d td  dfS )N))rb   q_projrW      )rb   k_projrX   r   )rb   v_projrY   r   )r3   	gate_projr   r<   )r3   up_projrZ   r<   z.weightrZ   )replacer#   )r%   namestacked_params_mapping
param_nameweight_nameshard_id	num_shardr-   r-   r.    get_module_name_from_weight_name  s   z<TorchNativeLlamaForCausalLM.get_module_name_from_weight_namec                 C   s   t |  }t|S rL   )dictnamed_parametersr#   )r%   params_dictr-   r-   r.   get_num_params  s   z*TorchNativeLlamaForCausalLM.get_num_paramsfqnweightsc                 C   s  g d}|  |}t|j|dd}|D ]m\}}d|v s d|v r!qd|v s)d|v r*q|dr4||vr4q| jjr=d	|v r=q|D ](\}}	}
|	|vrIq?||	|}|d
sX||vrYq?|| }|j}||||
  n|d
sq||vrrq|| }t	|dt
}||| qdS )z2Load weights onto submodule pointed by path `fqn`.))	.qkv_projz.q_projrW   )r   z.k_projrX   )r   z.v_projrY   ).gate_up_projz
.gate_projr   )r   z.up_projrZ   F)r:   recursezrotary_emb.inv_freq	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedzmodel.vision_towerzlm_head.weightz.biasrF   N)get_submoduler   r   
startswithrf   r   r   endswithrF   rt   r   )r%   r   r   r   moduler   r   r   r   r   r   r   rF   r-   r-   r.   load_weights_to_module  s:   

z2TorchNativeLlamaForCausalLM.load_weights_to_modulec                 C   s   |  d| dS )z!Load weights onto the full model.r5   N)r   )r%   r   r-   r-   r.   load_weights  s   z(TorchNativeLlamaForCausalLM.load_weightsrL   )rP   rQ   rR   r	   r   r   rA   rB   no_gradr   r   r   rO   r   r   rU   r   r   r   r   rV   r-   r-   rJ   r.   r     s@    
0r   c                   @   s   e Zd ZdS )TorchNativePhi3ForCausalLMN)rP   rQ   rR   r-   r-   r-   r.   r     s    r   )7__doc__rD   typingr   r   r   r   r   rB   r   torch.nn.parameterr   transformersr	   sglang.srt.distributedr
   r   sglang.srt.layers.activationr   sglang.srt.layers.layernormr   "sglang.srt.layers.logits_processorr   r   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.utilsr   r   rT   __annotations__r   r   r/   Moduler0   rU   r`   ra   r   r   r   r   
EntryClassr-   r-   r-   r.   <module>   sR   
(
 ]F7o