o
    پiV                     @   s   d dl mZ d dlmZ d dlmZ G dd deZG dd deZG dd	 d	eZG d
d deZ	G dd deZ
G dd deZG dd deZG dd deZG dd deZdS )    )PretrainedConfig)layer_type_validation)loggerc                       sD   e Zd ZdZ												
						d fdd	Z  ZS )Qwen3OmniMoeAudioEncoderConfigqwen3_omni_moe_audio_encoder                r   geluF{Gz?  d            c                    s   t  jdi | || _|| _|| _|| _|| _|| _|| _|| _	|	| _
|| _|| _|
| _|| _|| _|| _|| _|| _|| _d S N )super__init__num_mel_binsd_modelencoder_layersencoder_attention_headsencoder_ffn_dimdropoutattention_dropoutactivation_functionactivation_dropoutnum_hidden_layersinitializer_rangescale_embeddingmax_source_positionsn_window
output_dimn_window_inferconv_chunksizedownsample_hidden_size)selfr   r   r   r   r   r   r   r   r    r#   r"   r$   r%   r&   r'   r(   r)   kwargs	__class__r   Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/configs/qwen3_omni.pyr   
   s(   
z'Qwen3OmniMoeAudioEncoderConfig.__init__)r   r   r	   r
   r   r   r   r   r   Fr   r   r   r   r   r   r   )__name__
__module____qualname__
model_typer   __classcell__r   r   r,   r.   r      s(    r   c                       sD   e Zd ZdZdZdddddddd	d	d
dg ddf fdd	Z  ZS )Qwen3OmniMoeVisionEncoderConfigqwen3_omni_moe_vision_encodervision_config   i  gelu_pytorch_tanhi           r   i 	  )   r9      r   c                    sd   t  jdi | || _|| _|| _|| _|| _|| _|| _|| _	|	| _
|
| _|| _|| _|| _d S r   )r   r   depthhidden_size
hidden_actintermediate_size	num_headsin_channels
patch_sizespatial_merge_sizetemporal_patch_sizeout_hidden_sizenum_position_embeddingsr"   deepstack_visual_indexes)r*   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   r"   r+   r,   r   r.   r   ;   s   
z(Qwen3OmniMoeVisionEncoderConfig.__init__)r/   r0   r1   r2   base_config_keyr   r3   r   r   r,   r.   r4   7   s"    r4   c                       s   e Zd ZdZdgZddddddddddd
Zdgdgfdd	gdgfdgdgfd
Z																									d  fdd	Z  ZS )!Qwen3OmniMoeTextConfigqwen3_omni_moe_textpast_key_valuescolwiserowwise
layers.*.self_attn.q_projlayers.*.self_attn.k_projlayers.*.self_attn.v_projlayers.*.self_attn.o_projz layers.*.mlp.experts.*.gate_projzlayers.*.mlp.experts.*.up_projz layers.*.mlp.experts.*.down_projlayers.*.mlp.gate_projlayers.*.mlp.up_projlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_maskembed_tokenslayersnormr       J        silu   r   ư>TF    .ANr         r<   r   MbP?c                       t  jdd|i| || _|| _|| _|| _|| _|| _|| _|| _	|| _
|	| _|
| _|| _|| _|| _|| _|| _| jd urMd| jv rM| jd | jd< || _|| _|| _|| _|| _|| _|| _|d u rkg | _d S || _d S Ntie_word_embeddingstype	rope_typer   r   r   
vocab_sizemax_position_embeddingsr?   rA   r!   num_attention_headssliding_windownum_key_value_headsr@   r"   rms_norm_eps	use_cache
rope_thetarope_scalingattention_biasr   decoder_sparse_stepmoe_intermediate_sizenum_experts_per_toknum_expertsnorm_topk_proboutput_router_logitsrouter_aux_loss_coefmlp_only_layersr*   rq   r?   rA   r!   rs   ru   r@   rr   r"   rv   rw   rm   rx   ry   rz   rt   r   r{   r|   r}   r~   r   r   r   r   r+   r,   r   r.   r   t   >   
zQwen3OmniMoeTextConfig.__init__)r   r`   ra   rb   rb   rc   rd   re   r   rf   TFrg   NFNr   rh   ri   r<   r   TFrj   N	r/   r0   r1   r2   keys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr   r3   r   r   r,   r.   rK   ]   sX    

rK   c                       sN   e Zd ZdZddddZeeedZ							
				d fdd	Z	  Z
S )Qwen3OmniMoeThinkerConfigqwen3_omni_moe_thinkerimage_token_indexvideo_token_indexaudio_token_index)image_token_idvideo_token_idaudio_token_id)audio_configr6   text_configN^P gP hP    _P h  r   c                    s   t  jdi | |	| _|| _|| _|
| _t|tr"tdi |}n|d u r)t }|| _	t|tr9t
di |}n|d u r@t
 }|| _t|trPtdi |}n|d u rWt }|| _|| _|| _|| _d S r   )r   r   user_token_idposition_id_per_secondsaudio_start_token_idr"   
isinstancedictr4   r6   r   r   rK   r   r   r   r   )r*   r   r6   r   r   r   r   r   r   r   r"   r+   r,   r   r.   r      s.   



z"Qwen3OmniMoeThinkerConfig.__init__)
NNNr   r   r   r   r   r   r   )r/   r0   r1   r2   attribute_mapr   r4   rK   sub_configsr   r3   r   r   r,   r.   r      s*    r   c                       s   e Zd ZdZdgZddddddddZdgdgfdd	gdgfdgdgfd
Z																				d fdd	Z  ZS )%Qwen3OmniMoeTalkerCodePredictorConfig$qwen3_omni_moe_talker_code_predictorrM   rN   rO   )rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r`            r9   r<   r   rd   re   r   rf   TF'  Nr   r   c                    s   t  jdd|i| | _|	 _| _| _| _| _| _|d u r&|}| _	| _
| _|
 _| _| _| _| _| _| _ jd urVd jv rV jd  jd< | _ jd u rk fddt jD  _t j j | _d S )Nrm   rn   ro   c                    s(   g | ]} j d ur| jkrdndqS )Nsliding_attentionfull_attention)rt   max_window_layers).0ir*   r   r.   
<listcomp>;  s    zBQwen3OmniMoeTalkerCodePredictorConfig.__init__.<locals>.<listcomp>r   )r   r   rq   rr   r?   rA   r!   rs   rt   ru   head_dimr@   r"   rv   rw   rx   ry   rz   r   layer_typesranger   num_code_groups)r*   rq   r?   rA   r!   rs   ru   r   r@   rr   r"   rv   rw   rm   rx   ry   rz   rt   r   r   r   r+   r,   r   r.   r     sB   



z.Qwen3OmniMoeTalkerCodePredictorConfig.__init__)r`   r   r   r   r9   r<   r   rd   re   r   rf   TFr   NFNNr   r   r   r   r   r,   r.   r      sH    


r   c                       s   e Zd ZdZdgZddddddddddd
Zdgdgfdd	gdgfdgdgfd
Z																									d! fdd 	Z  ZS )"Qwen3OmniMoeTalkerTextConfigqwen3_omni_moe_talker_textrM   rN   rO   rP   rX   rY   rZ   r[   r\   r   r   r`   r	   r9   r;   rd   re   r   rf   TFr   Nr   rh     r<   r   rj   c                    rk   rl   rp   r   r,   r   r.   r   _  r   z%Qwen3OmniMoeTalkerTextConfig.__init__)r   r   r`   r	   r9   r;   rd   re   r   rf   TFr   NFNr   rh   r   r<   r   FFrj   Nr   r   r   r,   r.   r   G  sX    

r   c                       sL   e Zd ZeedZ											
								d fdd	Z  ZS )Qwen3OmniMoeTalkerConfig)code_predictor_configr   Nr   r`   f     k  l  m  d  e  r   r   r   dP r   uP c                    s   t  jdi | |d u ri }t | _td nt|tr"|| _ntdi || _|d u r:i }t | _td nt|trC|| _ntdi || _|| _	|| _
|| _|| _|| _|| _|	| _|
| _|| _|| _|| _|| _|| _|| _|| _|| _d S )Nz[code_predictor_config is None. Initializing code_predictor_config model with default valueszNtalker text_config is None. Initializing talker text model with default valuesr   )r   r   r   r   r   infor   r   r   r   thinker_hidden_sizecodec_eos_token_idaccept_hidden_layercodec_nothink_idcodec_think_bos_idcodec_think_eos_idcodec_pad_idcodec_bos_idr   r   r   r   r   vision_start_token_id
speaker_id)r*   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r+   r,   r   r.   r     sJ   


z!Qwen3OmniMoeTalkerConfig.__init__)NNr   r`   r   r   r   r   r   r   r   r   r   r   r   r   r   N)r/   r0   r1   r   r   r   r   r3   r   r   r,   r.   r     s.    r   c                       sN   e Zd Z												
							d fdd	Zedd Z  ZS )Qwen3OmniMoeCode2WavConfigr`   r   @  r   r9   FH   r   rd   {Gz?h㈵>r<   r<   r   rc   r:   r;   r;              c                    s   t  jdi | || _|| _|| _|| _|| _|| _|| _|| _	|	| _
|
| _|| _|| _|| _|| _|| _|| _|| _|| _d S r   )r   r   codebook_sizer?   rr   rx   rs   ru   rz   rt   rA   r@   layer_scale_initial_scalerv   r!   num_quantizersupsample_ratesupsampling_ratiosdecoder_dimr   )r*   r   r?   rr   rx   rs   ru   rz   rt   rA   r@   r   rv   r!   r   r   r   r   r   r+   r,   r   r.   r     s&   
z#Qwen3OmniMoeCode2WavConfig.__init__c                 C   s   dg| j  S )zC
        All layer in code2wav should be sliding attention
        r   )r!   r   r   r   r.   r     s   z&Qwen3OmniMoeCode2WavConfig.layer_types)r`   r   r   r   r9   r9   Fr   r   rd   r   r   r<   r9   r   r   r   r   )r/   r0   r1   r   propertyr   r3   r   r   r,   r.   r     s,    *r   c                       sR   e Zd ZdZeeedZ											
		d fdd	ZddddZ	  Z
S )Qwen3OmniMoeConfigqwen3_omni_moe)thinker_configtalker_configcode2wav_configNT\P ]P wP xP yP "  r   #- c                    s   t  jdi | |d u ri }td |d u ri }td |d u r*i }td tdi || _tdi || _tdi || _	|| _
|| _|| _|| _|| _|	| _|
| _|| _|| _d S )NzFthinker_config is None. Initializing thinker model with default valueszDtalker_config is None. Initializing talker model with default valueszHcode2wav_config is None. Initializing code2wav model with default valuesr   )r   r   r   r   r   r   r   r   r   r   enable_audio_outputim_start_token_idim_end_token_idtts_pad_token_idtts_bos_token_idtts_eos_token_idsystem_token_idr   assistant_token_id)r*   r   r   r   r   r   r   r   r   r   r   r   r   r+   r,   r   r.   r   %  s8   
zQwen3OmniMoeConfig.__init__Freturnr   c                 C   s
   | j  S )as  
        Returns the config that is meant to be used with text IO. On most models, it is the original config instance
        itself. On specific composite models, it is under a set of valid names.

        Args:
            decoder (`Optional[bool]`, *optional*, defaults to `False`):
                If set to `True`, then only search for decoder config names.
        )r   get_text_config)r*   decoderr   r   r.   r   U  s   
z"Qwen3OmniMoeConfig.get_text_config)NNNTr   r   r   r   r   r   r   r   )F)r   r   )r/   r0   r1   r2   r   r   r   r   r   r   r3   r   r   r,   r.   r     s(    0r   N)transformersr    transformers.configuration_utilsr   sglang.utilsr   r   r4   rK   r   r   r   r   r   r   r   r   r   r.   <module>   s    0&Y7ZZG4