from collections.abc import Iterable
from typing import Any

import torch
from torch import nn

from sglang.multimodal_gen.configs.models.encoders import BaseEncoderOutput
from sglang.multimodal_gen.configs.models.encoders.qwen3 import Qwen3TextConfig
from sglang.multimodal_gen.runtime.distributed import get_tp_world_size
from sglang.multimodal_gen.runtime.layers.activation import SiluAndMul
from sglang.multimodal_gen.runtime.layers.attention import LocalAttention
from sglang.multimodal_gen.runtime.layers.layernorm import RMSNorm
from sglang.multimodal_gen.runtime.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
    RowParallelLinear,
)
from sglang.multimodal_gen.runtime.layers.quantization import QuantizationConfig
from sglang.multimodal_gen.runtime.layers.rotary_embedding import get_rope
from sglang.multimodal_gen.runtime.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding,
)
from sglang.multimodal_gen.runtime.loader.weight_utils import (
    default_weight_loader,
    maybe_remap_kv_scale_name,
)
from sglang.multimodal_gen.runtime.models.encoders.base import TextEncoder

class Qwen3MLP(nn.Module):
    """Qwen3 MLP with SwiGLU activation and tensor parallelism."""

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        quant_config: QuantizationConfig | None = None,
        bias: bool = False,
        prefix: str = "",
    ) -> None:
        super().__init__()
        # Gate and up projections are fused into one column-parallel matmul.
        self.gate_up_proj = MergedColumnParallelLinear(
            input_size=hidden_size,
            output_sizes=[intermediate_size] * 2,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.gate_up_proj",
        )
        self.down_proj = RowParallelLinear(
            input_size=intermediate_size,
            output_size=hidden_size,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.down_proj",
        )
        if hidden_act != "silu":
            raise ValueError(
                f"Unsupported activation: {hidden_act}. Only silu is supported."
            )
        self.act_fn = SiluAndMul()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, _ = self.gate_up_proj(x)
        x = self.act_fn(x)
        x, _ = self.down_proj(x)
        return x
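
# For reference, the SwiGLU block above is numerically equivalent to this
# plain-PyTorch sketch (hedged: the fused parallel layers return
# (output, bias) tuples and split the merged projection internally):
#
#     gate, up = gate_up_proj(x).chunk(2, dim=-1)
#     y = torch.nn.functional.silu(gate) * up   # what SiluAndMul computes
#     out = down_proj(y)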

class Qwen3Attention(nn.Module):
    """Qwen3 attention with QK-Norm and tensor parallelism.

    Key difference from LLaMA: RMSNorm is applied to Q and K before attention.
    """

    def __init__(
        self,
        config: Qwen3TextConfig,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        rope_theta: float = 1000000,
        rope_scaling: dict[str, Any] | None = None,
        max_position_embeddings: int = 40960,
        quant_config: QuantizationConfig | None = None,
        bias: bool = False,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        tp_size = get_tp_world_size()
        self.total_num_heads = num_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.total_num_kv_heads = num_kv_heads
        if self.total_num_kv_heads >= tp_size:
            # Each TP rank owns a subset of the KV heads.
            assert self.total_num_kv_heads % tp_size == 0
        else:
            # Fewer KV heads than ranks: replicate them across ranks.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        self.head_dim = getattr(config, "head_dim", hidden_size // self.total_num_heads)
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        self.qkv_proj = QKVParallelLinear(
            hidden_size=hidden_size,
            head_size=self.head_dim,
            total_num_heads=self.total_num_heads,
            total_num_kv_heads=self.total_num_kv_heads,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )
        self.o_proj = RowParallelLinear(
            input_size=self.total_num_heads * self.head_dim,
            output_size=hidden_size,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.o_proj",
        )

        rms_norm_eps = getattr(config, "rms_norm_eps", 1e-6)
        # QK-Norm: per-head RMSNorm over head_dim, applied before RoPE.
        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)

        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=max_position_embeddings,
            base=rope_theta,
            rope_scaling=rope_scaling,
            is_neox_style=True,
        )
        self.attn = LocalAttention(
            num_heads=self.num_heads,
            head_size=self.head_dim,
            num_kv_heads=self.num_kv_heads,
            softmax_scale=self.scaling,
            causal=True,
            supported_attention_backends=config._supported_attention_backends,
        )

    def forward(
        self, positions: torch.Tensor, hidden_states: torch.Tensor
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)

        batch_size, seq_len = q.shape[0], q.shape[1]
        q = q.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
        k = k.reshape(batch_size, seq_len, self.num_kv_heads, self.head_dim)
        v = v.reshape(batch_size, seq_len, self.num_kv_heads, self.head_dim)

        # QK-Norm over the head dimension, then flatten for the rotary embedding.
        q = self.q_norm(q)
        k = self.k_norm(k)
        q = q.reshape(batch_size, seq_len, -1)
        k = k.reshape(batch_size, seq_len, -1)
        q, k = self.rotary_emb(positions, q, k)

        q = q.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
        k = k.reshape(batch_size, seq_len, self.num_kv_heads, self.head_dim)
        attn_output = self.attn(q, k, v)
        attn_output = attn_output.reshape(batch_size, seq_len, -1)

        output, _ = self.o_proj(attn_output)
        return output
Qr<   c                
       sl   e Zd ZdZ		ddededB deddf fdd	Zd
ej	dej	dej	dB de
ej	ej	f fddZ  ZS )Qwen3DecoderLayerz Qwen3 transformer decoder layer.Nr   r?   r   r   r   c                    s   t    |j| _t|dd}t|dd }t|dd}t|dd}t|| j|jt|d|j|||||| d	d

| _t| j|j|j	|t|dd| dd| _
t|j|jd| _t|j|jd| _d S )NrB   r=   rC   rD   r>   attention_biasFnum_key_value_headsz
.self_attn)
r?   r   r@   rA   rB   rC   rD   r   r   r   mlp_biasz.mlp)r   r   r   r   r   r   rK   )r#   r$   r   rU   r<   num_attention_heads	self_attnr   r   r   mlpr
   rJ   input_layernormpost_attention_layernorm)r)   r?   r   r   rB   rC   rD   rs   r*   r,   r-   r$      s@   

zQwen3DecoderLayer.__init__ra   rb   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)ra   rb   )ry   rw   rz   rx   )r)   ra   rb   r{   r,   r,   r-   r1      s   
zQwen3DecoderLayer.forward)Nr   )r2   r3   r4   r5   r   r   r7   r$   r9   r:   tupler1   r;   r,   r,   r*   r-   rr      s,    (rr   c                       s   e Zd ZdZdeddf fddZdejdejfdd	Z					ddejdB d
ejdB dejdB dejdB de	dB de
fddZdeeeejf  dee fddZ  ZS )Qwen3ForCausalLMa  Qwen3 causal language model for text encoding in diffusion models.

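
# The two-argument RMSNorm calls in Qwen3DecoderLayer.forward follow the fused
# add-then-normalize convention (assumed here from the call shape, as in
# vLLM-style layers): norm(x, residual) folds the residual stream into x and
# returns (normalized(x + residual), x + residual), so the layer itself never
# performs an explicit `hidden_states + residual` addition.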

class Qwen3ForCausalLM(TextEncoder):
    """Qwen3 causal language model for text encoding in diffusion models.

    Features:
    - Tensor parallelism support
    - FlashAttention/SageAttn/SDPA support via LocalAttention
    - QK-Norm for better training stability
    - FSDP sharding for CPU offload
    """

    def __init__(self, config: Qwen3TextConfig) -> None:
        super().__init__(config)
        self.config = config
        self.lora_config = config.lora_config
        if self.lora_config is not None:
            max_loras = getattr(self.lora_config, "max_loras", 1)
            lora_extra_vocab_size = getattr(
                self.lora_config, "lora_extra_vocab_size", 0
            )
            lora_vocab = max_loras * lora_extra_vocab_size
        else:
            lora_vocab = 0
        self.vocab_size = config.vocab_size + lora_vocab
        self.org_vocab_size = config.vocab_size

        self.embed_tokens = VocabParallelEmbedding(
            self.vocab_size,
            config.hidden_size,
            org_num_embeddings=config.vocab_size,
        )
        self.layers = nn.ModuleList(
            [
                Qwen3DecoderLayer(config=config, prefix=f"layers.{i}")
                for i in range(config.num_hidden_layers)
            ]
        )
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        output_hidden_states: bool | None = None,
        **kwargs: Any,
    ) -> BaseEncoderOutput:
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )

        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.embed_tokens(input_ids)

        residual = None
        if position_ids is None:
            position_ids = torch.arange(
                0, hidden_states.shape[1], device=hidden_states.device
            ).unsqueeze(0)

        all_hidden_states = () if output_hidden_states else None
        for layer in self.layers:
            if all_hidden_states is not None:
                # Record the pre-layer state, folding in the pending residual.
                all_hidden_states += (
                    (hidden_states,)
                    if residual is None
                    else (hidden_states + residual,)
                )
            hidden_states, residual = layer(position_ids, hidden_states, residual)

        hidden_states, _ = self.norm(hidden_states, residual)
        if all_hidden_states is not None:
            all_hidden_states += (hidden_states,)

        return BaseEncoderOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states
        )

    def load_weights(
        self, weights: Iterable[tuple[str, torch.Tensor]]
    ) -> set[str]:
        """Load weights with support for tensor parallelism and weight remapping."""
        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            if name.startswith("model."):
                name = name[len("model."):]
            # Rotary caches are recomputed at runtime, never loaded.
            if "rotary_emb.inv_freq" in name:
                continue
            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
                continue
            if "scale" in name:
                # Remap legacy KV-cache scale names; skip scales with no target.
                kv_scale_name = maybe_remap_kv_scale_name(name, params_dict)
                if kv_scale_name is None:
                    continue
                name = kv_scale_name
            for (
                param_name,
                weight_name,
                shard_id,
            ) in self.config.arch_config.stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip bias terms the fused layers do not materialize.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params
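
# For orientation, a stacked-params mapping of the shape load_weights consumes
# typically looks like the following (assumed example; the real tuples live on
# config.arch_config and may differ):
#
#     stacked_params_mapping = [
#         # (param_name, weight_name, shard_id)
#         ("qkv_proj", "q_proj", "q"),
#         ("qkv_proj", "k_proj", "k"),
#         ("qkv_proj", "v_proj", "v"),
#         ("gate_up_proj", "gate_proj", 0),
#         ("gate_up_proj", "up_proj", 1),
#     ]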

EntryClass = Qwen3ForCausalLM
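
# Minimal usage sketch (hedged: config construction and checkpoint iteration
# are shown with hypothetical helpers; only the model-facing calls below are
# defined in this module):
#
#     config = Qwen3TextConfig(...)              # built or loaded elsewhere
#     model = Qwen3ForCausalLM(config)
#     model.load_weights(state_dict.items())     # Iterable[tuple[str, Tensor]]
#     out = model(input_ids=token_ids)           # BaseEncoderOutput
#     prompt_embeds = out.last_hidden_state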