o
    پiC                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddlm Z  G dd dej!Z"G dd dej!Z#G dd dej!Z$G dd de Z%e%Z&dS )z?Inference-only LLaMA model compatible with HuggingFace weights.    )Iterable)AnyN)nn)BaseEncoderOutputLlamaConfig)get_tp_world_size)
SiluAndMul)LocalAttention)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)get_rope)VocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)TextEncoderc                       sN   e Zd Z			ddededededB ded	ed
df fddZdd Z  Z	S )LlamaMLPNF hidden_sizeintermediate_size
hidden_actquant_configbiasprefixreturnc                    sh   t    t||gd ||| dd| _t||||| dd| _|dkr.td| dt | _d S )	N   z.gate_up_proj)
input_sizeoutput_sizesr   r   r   z
.down_projr   output_sizer   r   r   siluzUnsupported activation: z!. Only silu is supported for now.)	super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fn)selfr   r   r   r   r   r   	__class__ g/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/models/encoders/llama.pyr$   >   s(   
	
zLlamaMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r%   r(   r&   )r)   x_r,   r,   r-   forward^   s   
zLlamaMLP.forward)NFr   )
__name__
__module____qualname__intstrr   boolr$   r1   __classcell__r,   r,   r*   r-   r   <   s(     r   c                       s   e Zd Z							ddededed	ed
edeeef dB dede	dB de
de
deddf fddZdejdejdejfddZ  ZS )LlamaAttention'  N    Fr   configr   	num_headsnum_kv_heads
rope_thetarope_scalingmax_position_embeddingsr   r   bias_o_projr   r   c              	      s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _t	|d| j| j | _
t	|dd}t|| j
 | _| j| j
 | _| j| j
 | _| j
d | _|| _|| _t|| j
| j| j|	|| dd| _t| j| j
 ||
|| dd	| _d
}|ot|do| dk}|r|jdkrd}t| j
| j|t|||d| _t| j| j
| j| jd
|jd| _d S )Nr      head_dimpartial_rotary_factorg      z	.qkv_proj)r   	head_sizetotal_num_headstotal_num_kv_headsr   r   r   z.o_projr    Tget_nameggufllamaF)
rotary_dimmax_positionbaser@   is_neox_style)softmax_scalecausalsupported_attention_backends)r#   r$   r   r   rG   r=   rH   maxr>   getattrrD   r5   rL   q_sizekv_sizescalingr?   rA   r   qkv_projr   o_projhasattrrI   
model_typer   
rotary_embr	   _supported_attention_backendsattn)r)   r<   r   r=   r>   r?   r@   rA   r   r   rB   r   tp_sizerE   rO   is_ggufr*   r,   r-   r$   g   sz   




	zLlamaAttention.__init__	positionshidden_statesc                 C   s   |  |\}}|j| j| j| jgdd\}}}| |||\}}|jd }|jd }	|||	| j| j}|||	| j	| j}|||	| j	| j}| 
|||}
|
||	| j| j }
| |
\}}|S )N)dimr   rC   )rX   splitrU   rV   r\   shapereshaper=   rD   r>   r^   rY   )r)   ra   rb   qkvr0   qkv
batch_sizeseq_lenattn_outputoutputr,   r,   r-   r1      s    

zLlamaAttention.forward)r:   Nr;   NFFr   )r2   r3   r4   r   r5   floatdictr6   r   r   r7   r$   torchTensorr1   r8   r,   r,   r*   r-   r9   e   sP    	
Yr9   c                
       sh   e Zd Z		ddededB deddf fddZd	ejd
ejdejdB de	ejejf fddZ
  ZS )LlamaDecoderLayerNr   r<   r   r   r   c           	         s  t    |j| _t|dd}t|dd }|d ur$t|dd r$|j|d< t|dd}t|ddp5t|d	d}|}t|d
r@|j}t|| j|jt|d|j||||||| dd| _	t
| j|j|j|t|dd| dd| _t|j|jd| _t|j|jd| _d S )Nr?   r:   r@    original_max_position_embeddingsrA   r;   attention_biasFr   qkv_biasnum_key_value_headsz
.self_attn)r<   r   r=   r>   r?   r@   rA   r   r   rB   r   mlp_biasz.mlp)r   r   r   r   r   r   eps)r#   r$   r   rT   ru   rZ   rw   r9   num_attention_heads	self_attnr   r   r   mlpr
   rms_norm_epsinput_layernormpost_attention_layernorm)	r)   r<   r   r   r?   r@   rA   rv   rB   r*   r,   r-   r$      sV   



zLlamaDecoderLayer.__init__ra   rb   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)ra   rb   )r   r}   r   r~   )r)   ra   rb   r   r,   r,   r-   r1     s   
zLlamaDecoderLayer.forward)Nr   )r2   r3   r4   r   r   r6   r$   rr   rs   tupler1   r8   r,   r,   r*   r-   rt      s*    7rt   c                       s   e Zd Zdef fddZdejdejfddZ				ddejdB d	ejdB d
ejdB dejdB dedB de	fddZ
deeeejf  dee fddZ  ZS )
LlamaModelr<   c                    s   t     | _| jj| _ jd ur0d}d}t jdr! jj}t jdr+ jj}|| }nd} j| | _ j| _	t
| j j j jd| _t fddt jD | _t j jd| _d S )	NrC   	max_loraslora_extra_vocab_sizer   )org_num_embeddingsr   c                    s(   g | ]}t   j j d | dqS )z.layers.)r<   r   r   )rt   r   r   ).0ir<   r,   r-   
<listcomp>I  s    z'LlamaModel.__init__.<locals>.<listcomp>rz   )r#   r$   r<   r   lora_configrZ   r   r   
vocab_sizeorg_vocab_sizer   r   embed_tokensr   
ModuleListrangenum_hidden_layerslayersr
   r   norm)r)   r<   r   lora_vocab_size
lora_vocabr*   r   r-   r$   ,  s4   



zLlamaModel.__init__	input_idsr   c                 C   s
   |  |S r.   )r   )r)   r   r,   r,   r-   get_input_embeddingsU  s   
zLlamaModel.get_input_embeddingsNposition_idsattention_maskinputs_embedsoutput_hidden_statesc                 K   s   |d ur|n| j j}|d ur|}n| |}d }|d u r+tjd|jd |jdd}|r/dnd }	| jD ]}
|	d urH|	|d u rB|fn|| f7 }	|
|||\}}q4| 	||\}}|	d urb|	|f7 }	t
||	d}|S )Nr   rC   )devicer,   )last_hidden_staterb   )r<   r   r   rr   arangerf   r   	unsqueezer   r   r   )r)   r   r   r   r   r   kwargsrb   r   all_hidden_stateslayerr0   ro   r,   r,   r-   r1   X  s>   


zLlamaModel.forwardweightsc                 C   s  t |  }t }|D ]x\}}d|v rqd|v sd|v rqd|v r-t||}|d u r+q|}| jjjD ]-\}}}	||vr<q2|||}|drL||vrLq2||vrQq2|| }
|
j	}||
||	  n|drj||vrjq||vroq|| }
t
|
dt}||
| || q|S )Nzrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedscalez.biasweight_loader)rq   named_parameterssetr   r<   arch_configstacked_params_mappingreplaceendswithr   rT   r   add)r)   r   params_dictloaded_paramsnameloaded_weightkv_scale_name
param_nameweight_nameshard_idparamr   r,   r,   r-   load_weights  sJ   

zLlamaModel.load_weights)NNNN)r2   r3   r4   r   r$   rr   rs   r   r7   r   r1   r   r   r6   r   r   r8   r,   r,   r*   r-   r   *  s.    )
,5r   )'__doc__collections.abcr   typingr   rr   r   -sglang.multimodal_gen.configs.models.encodersr   r   )sglang.multimodal_gen.runtime.distributedr   /sglang.multimodal_gen.runtime.layers.activationr   .sglang.multimodal_gen.runtime.layers.attentionr	   .sglang.multimodal_gen.runtime.layers.layernormr
   +sglang.multimodal_gen.runtime.layers.linearr   r   r   1sglang.multimodal_gen.runtime.layers.quantizationr   5sglang.multimodal_gen.runtime.layers.rotary_embeddingr   =sglang.multimodal_gen.runtime.layers.vocab_parallel_embeddingr   1sglang.multimodal_gen.runtime.loader.weight_utilsr   r   2sglang.multimodal_gen.runtime.models.encoders.baser   Moduler   r9   rt   r   
EntryClassr,   r,   r,   r-   <module>   s,   )wN #