o
    پic                     @   s  d Z ddlZddlmZmZmZmZ ddlZddl	Z	ddl
mZ ddlm  mZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1 G dd dej2Z3G dd dej2Z4dd Z5G dd deZ6G dd dej2Z7G dd de)Z8G dd de,Z9G d d! d!eZ:e:Z;dS )"zBInference-only Qwen3-VL model compatible with HuggingFace weights.    N)IterableListOptionalTuple)PreTrainedModel)ACT2FN)BaseModelOutput)Qwen3OmniMoeAudioEncoderConfigQwen3OmniMoeThinkerConfigQwen3OmniMoeVisionEncoderConfig)Qwen3VLMoeConfig)VisionAttention)ColumnParallelLinearRowParallelLinear)FusedMoE)QuantizationConfig)MultimodalDataItem)default_weight_loader)Qwen3VLMoeVisionModel)Qwen3MoeLLMModel"Qwen3VLMoeForConditionalGenerationload_fused_expert_weights)
add_prefixis_npuloggerc                       sP   e Zd Z		ddedee def fddZdej	d	ej	d
ej	fddZ
  ZS )Qwen3OmniMoeAudioEncoderLayerN configquant_configprefixc                    s   t    |j}|j| _t||j|ddd|td|d| _t	| j| _
|j| _t|j | _|j| _t| j|j| _t|j| j| _t	| j| _d S )NTattn)	embed_dim	num_headsprojection_sizeuse_qkv_parallel	proj_biasflatten_batchr   r   )super__init__d_modelr!   r   encoder_attention_headsr   	self_attnnn	LayerNormself_attn_layer_normdropoutr   activation_functionactivation_fnactivation_dropoutLinearencoder_ffn_dimfc1fc2final_layer_norm)selfr   r   r   r!   	__class__ T/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/qwen3_omni_moe.pyr(   2   s(   

z&Qwen3OmniMoeAudioEncoderLayer.__init__hidden_states
cu_seqlensreturnc                 K   s   |}|  |}| j||d}|| }|}| |}| |}| |}| |}|| }|jtjkrDt	|jj
d }tj|| |d}|f}|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )xr>   i  )minmax)r.   r+   r7   r5   r1   r6   dtypetorchfloat16finforB   clamp)r8   r=   r>   kwargsresidualclamp_valueoutputsr;   r;   r<   forwardM   s(   




z%Qwen3OmniMoeAudioEncoderLayer.forwardNr   )__name__
__module____qualname__r	   r   r   strr(   rD   TensorrL   __classcell__r;   r;   r9   r<   r   1   s"    r   c                       s,   e Zd Zd fdd	ZdefddZ  ZS )SinusoidsPositionEmbedding'  c                    s   t    |d dkrtdt||d d  }t| t|d   }t|d d tj	f |tj	d d f  }| j
dtjt|t|gdddd d S )	N   r   z4SinusoidsPositionEmbedding needs even channels input   positional_embeddingdimF)
persistent)r'   r(   
ValueErrornplogrD   exparangefloatnewaxisregister_buffercatsincos)r8   lengthchannelsmax_timescalelog_timescale_incrementinv_timescalesscaled_timer9   r;   r<   r(   v   s   
(
z#SinusoidsPositionEmbedding.__init__seqlenc                 C   s   | j d |d d f S N)rX   )r8   rm   r;   r;   r<   rL      s   z"SinusoidsPositionEmbedding.forward)rU   )rN   rO   rP   r(   intrL   rS   r;   r;   r9   r<   rT   u   s    rT   c                 C   sD   | d }|d d d }|d d d d d d | d d  }|S )zk
    Computes the output length of the convolutional layers and the output length of the audio encoder
    d   rW   rV      r;   )input_lengthsinput_lengths_leavefeat_lengthsoutput_lengthsr;   r;   r<    _get_feat_extract_output_lengths   s
   &rv   c                       sr   e Zd ZU eed< def fddZdd ZdejfddZ	d	ejfd
dZ
		dddZdejfddZ  ZS )Qwen3OmniMoeAudioEncoderr   c                    sR  t     j| _ j} j| _ j| _ jrt|nd| _	 j
| _
t| j|| _t fddt jD | _t j| _d| _tjd jdddd| _tj j jdddd| _tj j jdddd| _tj j jd d d d d d   jdd	| _t j j| _t j | _t j j | _!| j"j#| _#| j"j$| _$d S )
Ng      ?c                    s   g | ]}t  qS r;   )r   .0_r   r;   r<   
<listcomp>   s    z5Qwen3OmniMoeAudioEncoder.__init__.<locals>.<listcomp>FrW      rV   )padding)bias)%r'   r(   r/   r)   num_mel_binsmax_source_positionsscale_embeddingmathsqrtembed_scalen_windowrT   rX   r,   
ModuleListrangeencoder_layerslayersr-   ln_postgradient_checkpointingConv2ddownsample_hidden_sizeconv2d1conv2d2conv2d3r3   conv_outproj1r   r0   act
output_dimproj2r   n_window_inferconv_chunksize)r8   r   r!   r9   r{   r<   r(      sX   

z!Qwen3OmniMoeAudioEncoder.__init__c                 C   s   |   D ]}d|_qd| _d S )NF)
parametersrequires_grad_requires_grad)r8   paramr;   r;   r<   _freeze_parameters   s   
z+Qwen3OmniMoeAudioEncoder._freeze_parametersr?   c                 C      | j S rn   conv1r8   r;   r;   r<   get_input_embeddings   s   z-Qwen3OmniMoeAudioEncoder.get_input_embeddingsvaluec                 C   s
   || _ d S rn   r   )r8   r   r;   r;   r<   set_input_embeddings   s   
z-Qwen3OmniMoeAudioEncoder.set_input_embeddingsNc                    s  t |}t|| jd   }tj| jd g|  tj|jd}tj	|ddd
ddd }|| jd  ||< | jd ||dk< |jj| dd	}tjjj|d
ddd t |}tjjj fdd|D d
d}	 d g }
 j| jdd	D ]}t| |}t| |}t| |}|
| q}tj|
dd	}| \}}}}| |dddd |||| }| j j d|j!d ddf d"|j#}|| }||	 }dg}|	j!d | j$| jd   }|D ]}||g||  7 }|| }|dkr	||g7 }qtj||jdj
dtj%d}t& r!|"d}| j'D ]}|||}|d }q$| (|}| )|}| *|}| +|}t,|dS )z
        feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
            mel length
        aftercnn_lens (`torch.LongTensor` of shape `(batch_size,)`):
            mel length after cnn
        rV   rC   device)rW   r   )r   r   rW   NrY   T)batch_firstc                    s    g | ]}t j|t j jd qS )r   )rD   onesboolr   )ry   rg   padded_featurer;   r<   r|      s    z4Qwen3OmniMoeAudioEncoder.forward.<locals>.<listcomp>r}   )r   )rC   cpu)last_hidden_state)-rv   rD   ceilr   longtensorsumr   FpadcumsumTsplittolistr,   utilsrnnpad_sequence	transpose	unsqueezer   gelur   r   r   appendrd   sizer   permute
contiguousviewrX   shapetorC   r   int32r   r   r   r   r   r   r   )r8   input_featuresfeature_lensaftercnn_lens	chunk_numchunk_lengthstail_chunk_index
chunk_listfeature_lens_after_cnnpadded_mask_after_cnnpadded_embedschunkpadded_embedbcftrX   r=   cu_chunk_lenswindow_aftercnncnn_len	remainderr>   encoder_layerlayer_outputsr;   r   r<   rL      s   

 








z Qwen3OmniMoeAudioEncoder.forwardrr   c                 C   s(   |d d d }|d d d }||fS )zs
        Computes the output length of the convolutional layers and the output length of the audio encoder
        rW   rV   r;   )r8   rr   ru   r;   r;   r<   rv   -  s   z9Qwen3OmniMoeAudioEncoder._get_feat_extract_output_lengthsNN)rN   rO   rP   r	   __annotations__r(   r   r,   Moduler   r   rL   rD   
LongTensorrv   rS   r;   r;   r9   r<   rw      s   
 /
Xrw   c                       sZ   e Zd Z				ddedededee d	ed
df fddZdej	d
ej	fddZ
  ZS )Qwen3OmniMoeVisionPatchMergerrV   Nr   FrZ   context_dimspatial_merge_sizer   r   r?   c                    s   t    ||d  | _|| _tj|r| jn|dd| _tt| j| jd|t	d|dt
 t| j|d|t	d|dg| _d S )NrV   ư>)epsTzmlp.0)r   r   r   zmlp.2)r'   r(   hidden_sizeuse_postshuffle_normr,   r-   ln_qr   r   r   GELUr   mlp)r8   rZ   r   r   r   r   r   r9   r;   r<   r(   8  s0   
	
z&Qwen3OmniMoeVisionPatchMerger.__init__r@   c                 C   sv   | j r
|d| jn|d|jd }| |d| j}| jD ]}t|tr+|d }||}q t|tr9|d }|S )Nr   r   )r   r   r   r   r   r   
isinstancetuple)r8   r@   hiddenlayerr;   r;   r<   rL   [  s   



z%Qwen3OmniMoeVisionPatchMerger.forward)rV   Nr   F)rN   rO   rP   ro   r   r   rQ   r(   rD   rR   rL   rS   r;   r;   r9   r<   r   6  s&    #r   c                       sr   e Zd ZU eed< 		ddedee def fddZe	dd Z
e	d	ejfd
dZe	d	ejfddZ  ZS )Qwen3OmniMoeVisionEncoderr   Nr   r   c              	      sl   t  j t ddd t j j jdtdd| _t	
 fddtt jD | _| `d S )	Nrms_norm_epsr   )vision_configr   norm_epsFmerger)rZ   r   r   r   r   r   c                    s,   g | ]}t  j j jd tddqS )Tmerger_list)rZ   r   r   r   r   r   )r   out_hidden_sizer   r   r   rx   r   r   r   r;   r<   r|     s    	z6Qwen3OmniMoeVisionEncoder.__init__.<locals>.<listcomp>)r'   r(   getattrr   r   r   r   r   r   r,   r   r   lendeepstack_visual_indexesr   deepstack_merger_list)r8   r   r   r   rH   r9   r   r<   r(   p  s&   
	z"Qwen3OmniMoeVisionEncoder.__init__c                 C   r   rn   )r   r   r;   r;   r<   r     s   z/Qwen3OmniMoeVisionEncoder.deepstack_merger_listr?   c                 C      | j jjjS rn   )patch_embedprojweightrC   r   r;   r;   r<   rC        zQwen3OmniMoeVisionEncoder.dtypec                 C   r   rn   )r   r   r   r   r   r;   r;   r<   r     r   z Qwen3OmniMoeVisionEncoder.devicer   )rN   rO   rP   r   r   r   r   rQ   r(   propertyr   rD   rC   r   rS   r;   r;   r9   r<   r   m  s"   
 $
r   c                       sP   e Zd ZU eed< 		ddedee def fddZde	e
 fd	d
Z  ZS )+Qwen3OmniMoeThinkerForConditionalGenerationr   Nr   r   r   c                    sf   t  j|||td t|j| _t|j|t|ddt	d|d| _
| jjd ur.| jj| _d S d| _d S )N)language_model_clsr   r   visual)r   r   r   r   )r'   r(   r   rw   audio_configaudio_towerr   r   r   r   r  r   pad_token_idr8   r   r   r   r9   r;   r<   r(     s   
z4Qwen3OmniMoeThinkerForConditionalGeneration.__init__itemsc                 C   s   t jdd |D ddt j}t dd |D | jjt| j j	}|d urCt j
|dd}|ddd|  dd}nd }|d urK|n|
d}| j||d	}|j}|S )
Nc                 S      g | ]}|j qS r;   )feature_attention_maskry   itemr;   r;   r<   r|         zQQwen3OmniMoeThinkerForConditionalGeneration.get_audio_feature.<locals>.<listcomp>r   rY   c                 S   r	  r;   )featurer  r;   r;   r<   r|     r  rW   rV   r   )r   )rD   rd   typer   r  rC   r   nextr   r   r   r   r   r   )r8   r  r
  r   audio_feature_lengthsr   audio_outputsaudio_featuresr;   r;   r<   get_audio_feature  s6   
z=Qwen3OmniMoeThinkerForConditionalGeneration.get_audio_featurerM   )rN   rO   rP   r
   r   r   r   rQ   r(   r   r   r  rS   r;   r;   r9   r<   r    s   
 r  c                       sP   e Zd Z		ddedee def fddZdee	ee
jf  fd	d
Z  ZS )$Qwen3OmniMoeForConditionalGenerationNr   r   r   r   c                    sB   t  | || _t|j||d| _d| _| jj| _| jj| _d S )N)r   r   F)	r'   r(   r   r  thinker_configthinkerenable_talkerpad_input_idsrL   r  r9   r;   r<   r(     s   
z-Qwen3OmniMoeForConditionalGeneration.__init__weightsc              	   C   s  g d}t jddd| jjd}d}d}dd	g}| jj}t| d
s't|  | _| j}|D ]'\}	}
|	dd}	d|	v s?d|	v rC| j	sCq,|	dd}	|D ]C\}}}d|	v sXd|	v r\d}|}||	vraqKd|	v rfqKd|	v rkqK|	||}	|	
|r{|	|vr{qK|	|vrqK||	 }|j}|||
|  nd}|D ]t}|\}}}}||	vrqd|	v sd|	v rqd}|	||}|r|
dd}
d|	v r|
jddd}
t|||
d d| t|||
d d| n+t|||
|| n"|
|r||vrq|| v r|| }nq|j}|||
|||d |}	 nL|rq,d|	v sd|	v r(|	d d!}	|	d"d#}	|	d$d%}	|	
|r4|	|vr4q,|	| v rK||	 }t|d&t}|||
 q,td'|	d( q,d S ))N))	.qkv_projz.q_projq)r  z.k_projk)r  z.v_projv)gate_up_projup_projrW   )r  	gate_projr   r!  	down_projr   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namenum_experts)
z.bias_biasz.k_scale_k_scalez.v_scale_v_scalez.weight_scale_weight_scalez.input_scale_input_scaleF)zexperts.w13_weightexperts.gate_up_projr   w1)zexperts.w2_weightexperts.down_projr   w2_cached_params_dictzmodel.language_model.zmodel.talkercode2wavz.self_attn.out_projz.self_attn.projr,  r.  Tr  zmlp.expertsr  r   rV   rY   r   r-  rW   w3)shard_id	expert_idz	attn.qkv.zattn.qkv_proj.zmodel.visual.zvisual.zattn.out_proj.z
attn.proj.weight_loaderzLoaded weight with name=z not found in params_dict)r   make_expert_params_mappingr   r&  hasattrdictnamed_parametersr0  replacer  endswithr7  r   r   r   keysr   r   r   warning)r8   r  stacked_params_mappingexpert_params_mappingignore_suffixesis_fused_expertfused_expert_params_mappingr&  params_dictnameloaded_weight
param_nameweight_namer5  r   r7  is_expert_weightmappingr6  name_mappedr;   r;   r<   load_weights  s   	



z1Qwen3OmniMoeForConditionalGeneration.load_weightsrM   )rN   rO   rP   r   r   r   rQ   r(   r   r   rD   rR   rM  rS   r;   r;   r9   r<   r    s    $r  )<__doc__r   typingr   r   r   r   numpyr]   rD   torch.nnr,   torch.nn.functional
functionalr   transformersr   transformers.activationsr   transformers.modeling_outputsr   sglang.srt.configs.qwen3_omnir	   r
   r   sglang.srt.configs.qwen3_vlr   "sglang.srt.layers.attention.visionr   sglang.srt.layers.linearr   r   ,sglang.srt.layers.moe.fused_moe_triton.layerr   *sglang.srt.layers.quantization.base_configr   "sglang.srt.managers.schedule_batchr   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.qwen3_vlr   sglang.srt.models.qwen3_vl_moer   r   r   sglang.srt.utilsr   r   r   r   r   rT   rv   rw   r   r   r  r  
EntryClassr;   r;   r;   r<   <module>   s@   D 746 B