o
    پie                     @   s  U d dl Z d dlZ d dlZd dlmZmZ d dlmZmZmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZ d d	lmZm Z m!Z! d d
l"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) e*e+Z,e-e.e-e-f B ee- B Z/e
e0d< de/de.e-e-f fddZ1					d1de-de-de-de-de-de-fddZ2G dd  d ej3Z4G d!d" d"ej3Z5G d#d$ d$ej3Z6G d%d& d&ej3Z7G d'd( d(ej3Z8G d)d* d*ej3Z9G d+d, d,ej3Z:G d-d. d.ej3Z;G d/d0 d0ej3Z<e<gZ=dS )2    N)CallableSequence)IterableListOptionalTuple	TypeAliascast)PretrainedConfig)VisionAttention)ColumnParallelLinearRowParallelLinear)QuantizationConfig)/MultiModalityDataPaddingPatternMultimodalTokensgeneral_mm_embed_routine)ModalityMultimodalDataItemMultimodalInputs)ForwardBatch)default_weight_loader)Qwen2ForCausalLM)
add_prefix_Tuple2xreturnc                 C   sP   t | tjjr$t| dksJ d|  dt|  ttttf t| S | | fS )N   z%Expected a sequence of length 2, got z with length )
isinstancecollectionsabcr   lenr	   tupleint)r    r"   Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/midashenglm.py_resolve_tuple2!   s   r$            T   audio_length_samplesn_ffthop_sizedasheng_subsamplingmodel_subsamplingc                 C   s(   |r| | } t d| | |  | | S )z/Calculate the number of Mel-spectrogram frames.   )r!   )r)   r*   r+   r,   centerr-   r"   r"   r#   calculate_mel_frames_dasheng*   s   	r0   c                       sd   e Zd Z							ddeded	ed
edededB def fddZdej	dej	fddZ
  ZS )AudioPatchEmbed@      r.      NF
input_size
patch_sizepatch_stridein_chans	embed_dim
norm_layerflattenc                    s   t    t|| _t|| _t|| _| jd | jd  | jd | jd  f| _| jd | jd  | _|| _t	j
||| j| jd| _|rL||| _d S t	 | _d S )Nr   r.   )kernel_sizestride)super__init__r$   r5   r6   r7   	grid_sizenum_patchesr;   nnConv2dprojIdentitynorm)selfr5   r6   r7   r8   r9   r:   r;   	__class__r"   r#   r?   >   s    




 zAudioPatchEmbed.__init__r   r   c                 C   s4   |  |}| jrtt|ddd}| |}|S )Nr      r   r   r.   )rD   r;   torchpermuterF   rG   r   r"   r"   r#   forwardZ   s
   

zAudioPatchEmbed.forward)r2   r3   r3   r.   r4   NF)__name__
__module____qualname__r   r!   r   boolr?   rL   TensorrO   __classcell__r"   r"   rH   r#   r1   =   s0    r1   c                       s4   e Zd Zd	 fdd	ZdejdejfddZ  ZS )

LayerScaleh㈵>Fc                    s*   t    || _t|t| | _d S N)r>   r?   inplacerB   	ParameterrL   onesgamma)rG   diminit_valuesrY   rH   r"   r#   r?   c   s   
zLayerScale.__init__r   r   c                 C   s   | j r	|| jS || j S rX   )rY   mul_r\   rN   r"   r"   r#   rO   h   s   zLayerScale.forward)rW   F)rP   rQ   rR   r?   rL   rT   rO   rU   r"   r"   rH   r#   rV   b   s    rV   c                       s^   e Zd Z				ddededB dedB dee def
 fdd	Zd
ej	dej	fddZ
  ZS )
DashengMlpN in_featureshidden_featuresout_featuresquant_configprefixc                    s\   t    |p|}|p|}t||d|td|d| _t | _t||d|td|d| _	d S )NTfc1r5   output_sizebiasre   rf   fc2)
r>   r?   r   r   rg   rB   GELUactr   rk   )rG   rb   rc   rd   re   rf   rH   r"   r#   r?   m   s$   

zDashengMlp.__init__r   r   c                 C   s*   |  |\}}| |}| |\}}|S rX   )rg   rm   rk   )rG   r   _r"   r"   r#   rO      s   
zDashengMlp.forward)NNNra   )rP   rQ   rR   r!   r   r   strr?   rL   rT   rO   rU   r"   r"   rH   r#   r`   l   s"    r`   c                       s`   e Zd ZdZ				ddededed	ee d
ef
 fddZ	dde
jde
jdB fddZ  ZS )DashengAttentionz@Audio encoder attention using VisionAttention for compatibility.   FNra   r]   	num_headsqkv_biasre   rf   c                    sh   t    || dksJ d|| _|| _| j| j | _| jd | _t|||dd|ddd||d| _d S )Nr   z$dim should be divisible by num_headsg      TsdpaF)r9   rr   projection_sizeuse_qkv_parallel	proj_biasrs   qkv_backendsoftmax_in_single_precisionflatten_batchre   rf   )r>   r?   r9   rr   head_dimscaler   attn)rG   r]   rr   rs   re   rf   rH   r"   r#   r?      s&   
zDashengAttention.__init__r   maskc                 C   sB   d}|dur| d d}| }d| d }| j||d}|S )z]
        Args:
            x: [B, N, C] tensor
            mask: [B, N] boolean mask
        Nr.   r   g      ?g     )	attn_mask)	unsqueezefloatr}   )rG   r   r~   r   r"   r"   r#   rO      s   zDashengAttention.forward)rq   FNra   rX   )rP   rQ   rR   __doc__r!   rS   r   r   ro   r?   rL   rT   rO   rU   r"   r"   rH   r#   rp      s$    $rp   c                       sr   e Zd Z					ddedededed	edB d
ee def fddZ		dde
jde
jdB de
jfddZ  ZS )DashengBlock      @FNra   r]   rr   	mlp_ratiors   r^   re   rf   c                    s   t    tj|dd| _t||||td|d| _|r"t||dnt	 | _
tj|dd| _t|t|| |td|d| _|rJt||d| _d S t	 | _d S )Nư>epsr}   )rr   rs   re   rf   )r^   mlp)rb   rc   re   rf   )r>   r?   rB   	LayerNormnorm1rp   r   r}   rV   rE   ls1norm2r`   r!   r   ls2)rG   r]   rr   r   rs   r^   re   rf   rH   r"   r#   r?      s,   


zDashengBlock.__init__r   r~   r   c                 C   s:   ||  | | || }|| | | | }|S rX   )r   r}   r   r   r   r   )rG   r   r~   r"   r"   r#   rO      s   zDashengBlock.forward)r   FNNra   rX   )rP   rQ   rR   r!   r   rS   r   r   ro   r?   rL   rT   rO   rU   r"   r"   rH   r#   r      s:    $r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	DashengFrontendz?Audio frontend that converts waveforms to log mel-spectrograms.configc                    s   t    |j| _|j| _|j| _|j| _t|j}| jd|dd |  t	j
|jd d |j|j|j|jd}| jd|dd |  d S )Nspectrogram_windowF)
persistentr   r.   )n_freqsf_minf_maxn_melssample_ratemelscale_fbanks)r>   r?   r*   
hop_length
win_lengthr/   rL   hann_windowregister_bufferFr   r   r   r   r   )rG   r   r   r   rH   r"   r#   r?      s*   
zDashengFrontend.__init__waveformr   c                 C   sp   t j|tjd| j| j| j| jdd| j	d	}|j
| jtj j
}t j|dddddd	d}||jS )
zConvert waveform to log mel-spectrogram.

        Args:
            waveform: [B, T] tensor of audio samples

        Returns:
            log_mel_spectrogram: [B, n_mels, time] tensor
        r   r   F)	r   padwindowr*   r   r   power
normalizedr/   r.   
   g|=x   )
multiplieramindb_multipliertop_db)r   spectrogramtorL   float32r   r*   r   r   r/   mTr   amplitude_to_DBr   squeezedtype)rG   r   r   mel_spectrogramlog_mel_spectrogramr"   r"   r#   rO     s,   	
zDashengFrontend.forward)
rP   rQ   rR   r   r
   r?   rL   rT   rO   rU   r"   r"   rH   r#   r      s    r   c                
       s   e Zd ZdZ		ddedee def fddZ	dd	e	j
d
e	j
dB de	j
fddZde	j
dede	j
fddZ	dd	e	j
de	j
dB dee	j
e	j
dB f fddZ  ZS )DashengAudioTransformerzAudio encoder transformer.Nra   r   re   rf   c              	      s   t     j| _ j| _t | _tj jdd| _	t
 j jf j j jd jd| _ttd jd| jjd | _ttd j| jjd d| _t fddt jD | _tj jd	d
| _d S )Ng{Gz?)momentumF)r5   r9   r8   r6   r;   r7   r.   r   c                 3   s<    | ]}t  j j j j jtd | dV  qdS )zblocks.)r]   rr   r   rs   r^   re   rf   N)r   r9   rr   r   rs   r^   r   ).0ir   rf   re   r"   r#   	<genexpr>@  s    

z3DashengAudioTransformer.__init__.<locals>.<genexpr>r   r   )r>   r?   target_lengthr   r   	front_endrB   BatchNorm2dr   init_bnr1   r9   input_channelsr6   r7   patch_embedrZ   rL   emptyr@   time_pos_embedfreq_pos_embed
ModuleListrangedepthblocksr   rF   )rG   r   re   rf   rH   r   r#   r?   '  s.   




z DashengAudioTransformer.__init__r   r~   r   c                 C   s   |j d }|| jd d d d d d d |f  }|| jd d d d d d d d f  }tt|ddd}| jD ]}|||}q9| |}|S )Nr   rJ   rK   )shaper   r   rL   rM   r;   r   rF   )rG   r   r~   tblockr"   r"   r#   forward_featuresN  s   
&&

z(DashengAudioTransformer.forward_featureslengths
max_lengthc                 C   s@   t |}tj||jd}||||}||dk  }|S )Ndevicer   )r   rL   aranger   repeatviewr   rS   )rG   r   r   
batch_sizeidxr~   r"   r"   r#   _to_mask\  s
   z DashengAudioTransformer._to_maskx_lengthc                 C   s:  |  |}|| jj}| jd }|d}t|d}| |}t|d}| 	|}|j
d }|j|dd}|durit|t|ksHJ d|jdksQJ d|| jd   }| j||d	}|j|dd}n	d}dgt| }g }	t||D ]\}
}i }||d
< | j|
fi |}
|	|
 qytj|	dd}||fS )z
        Args:
            x: [B, T] audio waveform tensor
            x_length: [B] tensor of audio lengths

        Returns:
            x: [B, seq_len, embed_dim] encoded features
            mask: [B, seq_len] mask tensor
        r'   r.   )r   r   r.   rJ   r   r]   Nz2batchsizes of input x and x_length need to be samezLengths are of size (B,))r   r   r~   )r   r   r   r   r   r   rL   rM   r   r   r   splitr   ndimr   longr   zipr   appendcat)rG   r   r   target_length_in_patchesr   input_splitsscaled_lengthsr~   split_masksoutputssplit_x
split_maskforward_kwargsr"   r"   r#   rO   c  s<   





zDashengAudioTransformer.forwardNra   rX   )rP   rQ   rR   r   r
   r   r   ro   r?   rL   rT   r   r!   r   r    rO   rU   r"   r"   rH   r#   r   $  s8    *

r   c                       sT   e Zd ZdZ				ddededejdB dee d	e	f
 fd
dZ
dddZ  ZS )AudioProjectorSubsamplez!Audio projector with subsampling.r(   Nra   in_dimout_dimr   re   rf   c                    sX   t    || _t|| j |d|td|d| _t | _t	||d|td|d| _
d S )NFznet.0rh   znet.2)r>   r?   kr   r   rg   rB   rl   rm   r   rk   )rG   r   r   downsample_rater   re   rf   rH   r"   r#   r?     s"   
	
z AudioProjectorSubsample.__init__c                 C   s   |j \}}}|| j }|dkr,|d d d | d d f }|d ur,|d d d | f }|d u r?tj|j d d tj|jd}||d| j| }| |\}}| |}| 	|\}}||d| j}|j
dd }||fS )Nr   r   )r   r   r   )r   r   rL   r[   r   r   reshaperg   rm   rk   any)rG   r   r~   r   seq_lenr]   num_frames_to_discardrn   r"   r"   r#   rO     s   

zAudioProjectorSubsample.forward)r(   NNra   rX   )rP   rQ   rR   r   r!   rL   r   r   r   ro   r?   rO   rU   r"   r"   rH   r#   r     s$    r   c                	       s   e Zd ZdZg dZddddddZ			
d#dedee de	dd	f fddZ
dee defddZdee dejfddZdd Ze dejdejdefddZdeee	ejf  fdd Zd!d" Z  ZS )$MiDashengLMModelz0MiDashengLM model for audio-language processing.).fc1..fc2.z.gate_up_proj.z.down_proj.z.q_proj.z.k_proj.z.v_proj.z.o_proj.)qkv_projr   )r   r.   )r   r   )gate_up_projr   )r   r.   )q_projk_projv_proj	gate_projup_projNra   r   re   rf   r   c                    s   t    || _t|jdr+|jjr+d|jjv r+dd |jj D }|r'|nd |j_t|j|t	d|d| _
t|jj|jj|j|t	d|d| _t|j|t	d	|d| _| jj| _|| _d S )
Nrope_scalingmrope_sectionc                 S   s   i | ]\}}|d kr||qS )r   r"   )r   r   vr"   r"   r#   
<dictcomp>  s
    z-MiDashengLMModel.__init__.<locals>.<dictcomp>audio_encoder)re   rf   audio_projector)r   r   r   re   rf   decoder)r>   r?   r   hasattrtext_configr   itemsr   audio_encoder_configr   r   r   r9   hidden_sizesubsample_factorr   r   language_modellogits_processorre   )rG   r   re   rf   new_rope_scalingrH   r"   r#   r?     s>   





zMiDashengLMModel.__init__	input_ids	mm_inputsc                 C   s   t  }|||S )z%Pad input IDs with multimodal tokens.)r   pad_input_tokens)rG   r  r	  patternr"   r"   r#   pad_input_ids   s   zMiDashengLMModel.pad_input_idsr  c              
   C   s:  t d t dt| d t d t|D ]>\}}t d| d|jj  t d| dt|dd  t d| d	t|d
d  t d| dt|dd  qtjdd |D dd}t d|j  g }|D ]}t	|dr|j
dur||j
 qq||jjd  qqtj||jd}t d|  | ||\}}t d|j  | ||\}	}
|	|j}	t d|	j  |	j\}}}t d| d |	d|}t d|j  t d|  dd|  d t d|j d|j  t d |ddd!f    t d |S )"zProcess audio inputs and return embeddings.

        Args:
            items: List of multimodal data items containing audio features

        Returns:
            audio_embeddings: Concatenated audio embeddings
        P================================================================================zget_audio_feature called with z itemszItem z feature shape: z audio_length: audio_lengthzNOT SETz pad_value: 	pad_valuez hash: hashc                 S   s   g | ]}|j qS r"   )feature)r   itemr"   r"   r#   
<listcomp>  s    z6MiDashengLMModel.get_audio_feature.<locals>.<listcomp>r   r   z!Concatenated input_values shape: Nr   r   zaudio_length: zEncoder output shape: zProjector output shape: z
Using all z# audio tokens from projector outputzFinal output shape: zStats: min=z.4fz, max=zAudio embeddings dtype: z
, device: z%First 5 values of first audio token: r(   )loggerdebugr   	enumerater  r   getattrrL   r   r   r  r   tensorr   r   r   r   r   r   minr  maxtolist)rG   r  r   r  input_valuesaudio_lengthsr  encoder_outencoder_attsaudio_embeddingsrn   r   max_audio_tokensr9   masked_audio_featuresr"   r"   r#   get_audio_feature  sN   
	
 "
z"MiDashengLMModel.get_audio_featurec                 C   s
   | j jjS rX   )r  modelembed_tokensrG   r"   r"   r#   get_input_embeddings7  s   
z%MiDashengLMModel.get_input_embeddings	positionsforward_batchc                 K   s(  |  rtd td|j  td|dd    tdtt|  |jrt|jdkr|jd }|rt|j	dkr|j	d j
}td|  td	||k    t|d
r|jrtd|j  td||jk    td t||| j|tj| jidS )a4  Run forward pass for MiDashengLM.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a batch.
            positions: Flattened (concatenated) position ids corresponding to a batch.
            forward_batch: Forward batch information including multimodal data.
        r  zinput_ids shape: zinput_ids first 20: N   zinput_ids unique values count: r   zExpected pad_value: z!Count of pad_value in input_ids: audio_token_idzaudio_token_id: z&Count of audio_token_id in input_ids: )r  r)  r  r(  data_embedding_funcs)contains_mm_inputsr  r  r   r  r   rL   uniquer	  mm_itemsr  sumr  r   r+  r   r  r   AUDIOr#  )rG   r  r(  r)  kwargsmm_inputr  r"   r"   r#   rO   :  s8   



zMiDashengLMModel.forwardweightsc                 C   s  t | jdd}t |  }g }g }g }g }|D ]\}}	d|v r!qd|v s)d|v r*q|dr7|||	f q|}
d|v rRd|v rH|dd	}n
d
|v rR|d
d}d|v r`d|v r`|dd}d|v rnd|v rn|dd}d|v r~|dd}|dd}|dr||vr||vr||
 d q||v r|| }t|dt}|||	 n%||v r|| 	|	 nd|
v r||
 d| d n||
 d qd|
v r||
 qd|
v r||
 q|rt
dt| d dd |D }| j| t
d  t
d!t|  t
d"t|  t
d#t|  t
d$t|  d%d |D }d&d |D }|rLt
d' |D ]}t
d(|  q@|rzt
d)t|  d*d |D }|rzt
d+ |d,d- D ]}t
d.|  qnt
d  d,S )/zLoad model weights.F)remove_duplicatezrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedr   zaudio_encoder.front_endz.mel_scale.fbz.melscale_fbanksz.spectrogram.windowz.spectrogram_windowr   z
.attn.qkv.z.attn.attn.qkv_proj.z.attn.proj.z.attn.attn.proj.r   z.net.0.r   z.net.2.r   z.biasz (bias not in params/buffers)weight_loaderz -> z (NOT IN MODEL)z (not in model)zPassing z1 decoder weights to language_model.load_weights()c                 S   s"   g | ]\}}| d dd|fqS )zdecoder.ra   r.   )replace)r   nameweightr"   r"   r#   r    s    z1MiDashengLMModel.load_weights.<locals>.<listcomp>r  zAudio encoder weights loaded: z Audio projector weights loaded: z*Decoder weights passed to language_model: zSkipped weights: c                 S      g | ]}d |v r|qS )r   r"   r   sr"   r"   r#   r        c                 S   r:  )r   r"   r;  r"   r"   r#   r    r=  z Skipped audio_projector weights:z  zSkipped audio_encoder weights: c                 S   s   g | ]}d |vr|qS )rj   r"   r;  r"   r"   r#   r    r=  z  First 10 non-bias skipped:Nr   z    )dictnamed_parametersnamed_buffers
startswithr   r7  endswithr  r   copy_r  r  r   r  load_weights)rG   r4  params_dictbuffers_dictaudio_encoder_loadedaudio_projector_loadedskipped_weightsdecoder_weightsr8  loaded_weightoriginal_nameparamr6  decoder_weights_strippedencoder_skippedprojector_skippedr<  non_bias_skippedr"   r"   r#   rD  g  s   




zMiDashengLMModel.load_weightsc                 C   s   | j jjj| j jjfS rX   )r  r$  r%  r9  lm_headr&  r"   r"   r#   get_embed_and_head  s   
z#MiDashengLMModel.get_embed_and_headr   )rP   rQ   rR   r   #default_bitsandbytes_target_modules#bitsandbytes_stacked_params_mappingr
   r   r   ro   r?   r   r!   r   r  r   rL   rT   r#  r'  no_gradr   rO   r   r   rD  rS  rU   r"   r"   rH   r#   r     sB    *2,Tr   )r%   r&   r'   Tr(   )>r   collections.abcloggingr   r   typingr   r   r   r   r   r	   rL   torch.nnrB   torchaudio.functional
functionalr   transformersr
   "sglang.srt.layers.attention.visionr   sglang.srt.layers.linearr   r   *sglang.srt.layers.quantization.base_configr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.qwen2r   sglang.srt.utilsr   	getLoggerrP   r  r!   r    r   __annotations__r$   r0   Moduler1   rV   r`   rp   r   r   r   r   r   
EntryClassr"   r"   r"   r#   <module>   sf   
  
 
%
#0,9l0  
