o
    
۾il                     @   s  U d Z ddlZddlZddlmZmZmZmZ ddlmZm	Z	m
Z
mZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z=m>Z> ddl?m@Z@mAZAmBZB ddlCmDZDmEZEmFZF eGeHeGeGf B eeG B ZIe
eJd< deIdeHeGeGf fddZK				 	!dDd"eGd#eGd$eGd%eGd&eGdeGfd'd(ZLG d)d* d*ejMZNG d+d, d,ejMZOG d-d. d.ejMZPG d/d0 d0ejMZQG d1d2 d2ejMZRG d3d4 d4ejMZSG d5d6 d6ejMZTG d7d8 d8ejMZUG d9d: d:e=ZVG d;d< d<e4ZWG d=d> d>e2eW ZXG d?d@ d@e3eW ZYe)jZeYeWeXdAG dBdC dCejMeAeBZ[dS )EzEInference-only MiDashengLM model compatible with HuggingFace weights.    N)CallableIterableMappingSequence)	AnnotatedAny	TypeAliascast)scaled_dot_product_attention)BatchFeature)
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)
get_act_fn)Conv2dLayer)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)DashengConfig)TensorSchemaTensorShape   )MultiModalEmbeddingsSupportsMultiModal
SupportsPP)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefix_Tuple2xreturnc                 C   sP   t | tjjr$t| dksJ d|  dt|  ttttf t| S | | fS )N   z%Expected a sequence of length 2, got z with length )
isinstancecollectionsabcr   lenr	   tupleint)r-    r6   Z/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/midashenglm.py_resolve_tuple2J   s   r8            T   audio_length_samplesn_ffthop_sizedasheng_subsamplingmodel_subsamplingc                 C   s(   |r| | } t d| | |  | | S )z/Calculate the number of Mel-spectrogram frames.r%   )r5   )r=   r>   r?   r@   centerrA   r6   r6   r7   calculate_mel_frames_dashengS   s   	rC   c                       sd   e Zd Z							ddeded	ed
edededB def fddZdej	dej	fddZ
  ZS )AudioPatchEmbed@      r%      NF
input_size
patch_sizepatch_stridein_chans	embed_dim
norm_layerflattenc                    s   t    t|| _t|| _t|| _| jd | jd  | jd | jd  f| _| jd | jd  | _|| _t	||| j| jd| _
|rK||| _d S t | _d S )Nr   r%   )kernel_sizestride)super__init__r8   rH   rI   rJ   	grid_sizenum_patchesrN   r   projnnIdentitynorm)selfrH   rI   rJ   rK   rL   rM   rN   	__class__r6   r7   rR   g   s    




 zAudioPatchEmbed.__init__r-   r.   c                 C   s4   |  |}| jrtt|ddd}| |}|S )Nr/      r   r/   r%   )rU   rN   torchpermuterX   rY   r-   r6   r6   r7   forward   s   

zAudioPatchEmbed.forward)rE   rF   rF   r%   rG   NF)__name__
__module____qualname__r,   r5   r   boolrR   r^   Tensorra   __classcell__r6   r6   rZ   r7   rD   f   s0    rD   c                       s4   e Zd Zd	 fdd	ZdejdejfddZ  ZS )

LayerScaleh㈵>Fc                    s*   t    || _t|t| | _d S N)rQ   rR   inplacerV   	Parameterr^   onesgamma)rY   diminit_valuesrk   rZ   r6   r7   rR      s   
zLayerScale.__init__r-   r.   c                 C   s   | j r	|| jS || j S rj   )rk   mul_rn   r`   r6   r6   r7   ra      s   zLayerScale.forward)ri   F)rb   rc   rd   rR   r^   rf   ra   rg   r6   r6   rZ   r7   rh      s    rh   c                       s^   e Zd Z				ddededB dedB dedB def
 fdd	Zd
ejdejfddZ	  Z
S )
DashengMlpN in_featureshidden_featuresout_featuresquant_configprefixc                    sX   t    |p|}|p|}t|||| dd| _td| _t|||| dd| _d S )Nz.fc1rH   output_sizerw   rx   geluz.fc2)rQ   rR   r   fc1r   actr   fc2)rY   rt   ru   rv   rw   rx   rZ   r6   r7   rR      s    

zDashengMlp.__init__r-   r.   c                 C   s*   |  |\}}| |}| |\}}|S rj   )r|   r}   r~   )rY   r-   _r6   r6   r7   ra      s   
zDashengMlp.forward)NNNrs   )rb   rc   rd   r5   r   strrR   r^   rf   ra   rg   r6   r6   rZ   r7   rr      s"    rr   c                       s\   e Zd Z				ddededededB d	ef
 fd
dZddej	dej	dB fddZ
  ZS )DashengAttention   FNrs   ro   	num_headsqkv_biasrw   rx   c              	      s  t    || dksJ d|| _t }|| _| j| dks!J | j| | _| j|kr6| j| dks5J n	|| j dks?J td| j| | _| j| j | _| j| j | _	| j| j | _
| jd | _t| j| j| j| j||| dd| _t|||| dd| _d S )	Nr   z$dim should be divisible by num_headsr%   g      z.qkv)hidden_size	head_sizetotal_num_headstotal_num_kv_headsbiasrw   rx   z.projry   )rQ   rR   rL   r   r   r   maxnum_kv_headshead_dimq_sizekv_sizescaler   qkvr   rU   )rY   ro   r   r   rw   rx   tp_sizerZ   r6   r7   rR      s<   

	zDashengAttention.__init__r-   maskc              
   C   s   |j \}}}| |\}}|||d| j|| j }|ddddd}|d\}}	}
t||	|
|d ur?|d d d d d d f nd d}|dd|||}| |\}}|S )Nr\   r/   r   r%   r;   )	attn_mask)	shaper   reshaper   r_   unbindr
   	transposerU   )rY   r-   r   BNCr   r   qkvr6   r6   r7   ra      s   "zDashengAttention.forward)r   FNrs   rj   )rb   rc   rd   r5   re   r   r   rR   r^   rf   ra   rg   r6   r6   rZ   r7   r      s"    $-r   c                       sr   e Zd Z					ddedededed	edB d
edB def fddZ	dde	j
de	j
dB de	j
fddZ  ZS )DashengBlock      @FNrs   ro   r   	mlp_ratior   rp   rw   rx   c                    s   t    tj|dd| _t||||| dd| _|r"t||dnt | _	tj|dd| _
t|t|| || dd| _|rJt||d| _d S t | _d S )Nư>epsz.attn)r   r   rw   rx   )rp   z.mlp)rt   ru   rw   rx   )rQ   rR   rV   	LayerNormnorm1r   attnrh   rW   ls1norm2rr   r5   mlpls2)rY   ro   r   r   r   rp   rw   rx   rZ   r6   r7   rR      s,   


zDashengBlock.__init__r-   r   r.   c                 C   s:   ||  | | || }|| | | | }|S rj   )r   r   r   r   r   r   )rY   r-   r   r6   r6   r7   ra     s   zDashengBlock.forward)r   FNNrs   rj   )rb   rc   rd   r5   floatre   r   r   rR   r^   rf   ra   rg   r6   r6   rZ   r7   r      s:    &r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )DashengFrontendconfigc                    sz   t    || _t| jj}| jd|dd |  tj| jj	d d | jj
| jj| jj| jjd}| jd|dd |  d S )Nspectrogram_windowF)
persistentr/   r%   )n_freqsf_minf_maxn_melssample_ratemelscale_fbanks)rQ   rR   r   r^   hann_window
win_lengthregister_bufferFr   r>   r   r   r   r   )rY   r   r   r   rZ   r6   r7   rR   *  s$   
zDashengFrontend.__init__waveformr.   c                 C   sx   t j|tjd| j| jj| jj| jj	dd| jj
d	}|j| jtj j}t j|dddddd	d}||jS )
Nr   r/   F)	r   padwindowr>   
hop_lengthr   power
normalizedrB   r%   
   g|=x   )
multiplieramindb_multipliertop_db)r   spectrogramtor^   float32r   r   r>   r   r   rB   mTr   amplitude_to_DB	unsqueezesqueezedtype)rY   r   r   mel_spectrogramlog_mel_spectrogramr6   r6   r7   ra   @  s,   
zDashengFrontend.forward)	rb   rc   rd   r"   rR   r^   rf   ra   rg   r6   r6   rZ   r7   r   )  s    r   c                
       s   e Zd Z		ddededB def fddZ	ddejd	ejdB d
ejfddZ	dejde
d
ejfddZ	ddejdejdB d
eejejdB f fddZ  ZS )DashengAudioTransformerNrs   r   rw   rx   c              	      s   t     j| _ j| _t | _tj jdd| _	t
 j jf j j jd jd| _ttd jd| jjd | _ttd j| jjd d| _t fddt jD | _tj jd	d
| _d S )Ng{Gz?)momentumF)rH   rL   rK   rI   rN   rJ   r%   r   c                 3   s:    | ]}t  j j j j j d | dV  qdS )z.blocks.)ro   r   r   r   rp   rw   rx   N)r   rL   r   r   r   rp   ).0ir   rx   rw   r6   r7   	<genexpr>}  s    

z3DashengAudioTransformer.__init__.<locals>.<genexpr>r   r   )rQ   rR   target_lengthr   r   	front_endrV   BatchNorm2dr   init_bnrD   rL   input_channelsrI   rJ   patch_embedrl   r^   emptyrS   time_pos_embedfreq_pos_embed
ModuleListrangedepthblocksr   rX   )rY   r   rw   rx   rZ   r   r7   rR   _  s.   


	

z DashengAudioTransformer.__init__r-   r   r.   c                 C   s   |j d }|| jd d d d d d d |f  }|| jd d d d d d d d f  }tt|ddd}| jD ]}|||}q9| |}|S )Nr/   r\   r]   )r   r   r   r^   r_   rN   r   rX   )rY   r-   r   tblockr6   r6   r7   forward_features  s   
&$

z(DashengAudioTransformer.forward_featureslengths
max_lengthc                 C   s@   t |}tj||jd}||||}||dk  }|S )Ndevicer   )r3   r^   aranger   repeatviewr   re   )rY   r   r   
batch_sizeidxr   r6   r6   r7   _to_mask  s
   z DashengAudioTransformer._to_maskx_lengthc                 C   s:  |  |}|| jj}| jd }|d}t|d}| |}t|d}| 	|}|j
d }|j|dd}|d urit|t|ksHJ d|jdksQJ d|| jd   }| j||d}|j|dd}n	d }d gt| }g }	t||D ]\}
}i }||d	< | j|
fi |}
|	|
 qytj|	dd}||fS )
Nr;   r%   )r   r/   r%   r\   r   ro   z2batchsizes of input x and x_length need to be samezLengths are of size (B,))r   r   r   )r   r   r   r   r   r   r^   r_   r   r   r   splitr3   ndimr   longr   zipr   appendcat)rY   r-   r   target_length_in_patchesr   input_splitsscaled_lengthsr   split_masksoutputssplit_x
split_maskforward_kwargsr6   r6   r7   ra     s8   





zDashengAudioTransformer.forward)Nrs   rj   )rb   rc   rd   r"   r   r   rR   r^   rf   r   r5   r   r4   ra   rg   r6   r6   rZ   r7   r   ^  s6    /

r   c                       sP   e Zd Z				ddededejdB dedB def
 fd	d
ZdddZ	  Z
S )AudioProjectorSubsampler<   Nrs   in_dimout_dimr   rw   rx   c                    sV   t    || _tt|| j ||| dddtdt|||| ddd| _d S )Nz.net.0F)rH   rz   rw   rx   return_biasr{   z.net.2)	rQ   rR   r   rV   
Sequentialr   r   r   net)rY   r  r  downsample_rater   rw   rx   rZ   r6   r7   rR     s&   
	
z AudioProjectorSubsample.__init__c                 C   s   |j \}}}|| j }|dkr,|d d d | d d f }|d ur,|d d d | f }|d u r?tj|j d d tj|jd}||d| j| }| jD ]}||}qL||d| j}|jdd }||fS )Nr   r   )r   r   r   )	r   r   r^   rm   r   r   r   r  any)rY   r-   r   r   seq_lenro   num_frames_to_discardlayerr6   r6   r7   ra     s$   


zAudioProjectorSubsample.forward)r<   NNrs   rj   )rb   rc   rd   r5   r^   r   r   r   rR   ra   rg   r6   r6   rZ   r7   r    s"    r  c                   @   s@   e Zd ZU dZeejeddf ed< eejedf ed< dS )MiDashengLMAudioInputszi

    Dimensions:
        - bn: Batch size * number of audios
        - p: Number of sampling points
    npinput_valuesaudio_lengthN)	rb   rc   rd   __doc__r   r^   rf   r$   __annotations__r6   r6   r6   r7   r    s   
 r  c                   @   sN   e Zd Zdd Zdd Zdd ZdeeedB f fd	d
Z	dd Z
dd ZdS )MiDashengLMProcessingInfoc                 C   s
   | j  S rj   )ctxget_hf_configrY   r6   r6   r7   r    s   
z'MiDashengLMProcessingInfo.get_hf_configc                 C   s   |   }|j}|S rj   )get_hf_processorfeature_extractor)rY   hf_processorr  r6   r6   r7   get_feature_extractor  s   z/MiDashengLMProcessingInfo.get_feature_extractorc                 C   s   |   }t|j|  dS )N)	target_srexpected_hidden_size)r  r   sampling_rate_get_expected_hidden_size)rY   r  r6   r6   r7   get_data_parser  s
   z)MiDashengLMProcessingInfo.get_data_parserr.   Nc                 C   s   dd iS )Naudior6   r  r6   r6   r7   get_supported_mm_limits  s   z1MiDashengLMProcessingInfo.get_supported_mm_limitsc                 C      dS )Ni  r6   r  r6   r6   r7   get_min_audio_len!     z+MiDashengLMProcessingInfo.get_min_audio_lenc                 C   r"  )Ni q r6   r  r6   r6   r7   get_max_audio_len$  r$  z+MiDashengLMProcessingInfo.get_max_audio_len)rb   rc   rd   r  r  r  r   r   r5   r!  r#  r%  r6   r6   r6   r7   r    s    r  c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )MiDashengLMDummyInputsBuilder	mm_countsr.   c                 C   s@   | dd}| j }|j}|j}|j}| | | }|| S )Nr   r   )getinfor  audio_tokenaudio_bos_tokenaudio_eos_token)rY   r'  
num_audiosr  r*  r+  r,  single_audio_textr6   r6   r7   get_dummy_text)  s   
z,MiDashengLMDummyInputsBuilder.get_dummy_textNr	  
mm_optionsc                 C   s8   | dd}|r| dnd }d| j| j ||diS )Nr   r   )lengthr-  	overrides)r(  _get_dummy_audiosr)  r%  )rY   r	  r'  r0  r-  audio_overridesr6   r6   r7   get_dummy_mm_data4  s   z/MiDashengLMDummyInputsBuilder.get_dummy_mm_datarj   )
rb   rc   rd   r   r   r5   r/  r   r   r5  r6   r6   r6   r7   r&  (  s    
r&  c                
       s   e Zd Zdedeeef deeef deeef def
 fddZded	eeef deee	f fd
dZ
ded	eeef dedee fddZ  ZS )MiDashengLMMultiModalProcessorpromptmm_data	mm_kwargs
tok_kwargsr.   c                    s   | dg }| j   fdd|D }|r||d< |dg s7| j |}| |}tt|gdddS td	i |}t	 j
||||dS )
Naudiosc                    sJ   g | ]!}t |tjr!|jd   k r!tj|d |jd   fdddn|qS )r   r   constant)modeconstant_values)r0   npndarrayr   r   )r   r   min_audio_lenr6   r7   
<listcomp>U  s    	
zEMiDashengLMMultiModalProcessor._call_hf_processor.<locals>.<listcomp>r   )	input_idspt)tensor_type)r7  r8  r9  r:  r6   )popr)  r#  r(  get_tokenizerencode_apply_hf_processor_tokens_onlyr   dictrQ   _call_hf_processor)rY   r7  r8  r9  r:  r;  processed_audios
prompt_idsrZ   rA  r7   rL  J  s(   

	
z1MiDashengLMMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s   t tdtddS )Nr   r  r  )rK  r   batched)rY   rO  rP  r6   r6   r7   _get_mm_fields_configt  s   z4MiDashengLMMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s   | j jdi |}| j  }| }t|dd}|| | }|d}	|	d u r,g  nt|	tj	r8|	
  n|	}
dd |
D  dtf fdd}td	||d
gS )Nr*  z	<|AUDIO|>r  c                 S      g | ]}t d tt|qS r%   r   rC   r5   r   r1  r6   r6   r7   rC        zFMiDashengLMMultiModalProcessor._get_prompt_updates.<locals>.<listcomp>item_idxc                    s     |  }g| }t j|dS )N)embed_token_id)r    select_token_id)r[  num_featuresaudio_tokensaudio_output_lengthsaudio_token_idr6   r7   get_replacement_midashenglm  s   
zWMiDashengLMMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_midashenglmr   )modalitytargetreplacementr6   )r)  r  rH  	get_vocabgetattrget_datar(  r0   r^   rf   cpunumpyr5   r   )rY   rT  rP  rU  	processor	tokenizervocabr*  out_mm_datar  audio_length_nprc  r6   r`  r7   _get_prompt_updates~  s.   



z2MiDashengLMMultiModalProcessor._get_prompt_updates)rb   rc   rd   r   r   objectr   r   rL  r   rS  r   r   r   r   rq  rg   r6   r6   rZ   r7   r6  G  s8    


*




r6  )r)  dummy_inputsc                       s  e Zd Zg dddgdZededededB fd	d
Zdddedef fddZ	de
dedB fddZdedeejdf fddZde
defddZ		d&dejdB dejdedB dejdB de
dejeB fddZd ejdejdB fd!d"Zd#eeeejf  dee fd$d%Z  ZS )'MiDashengLMModel)q_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projrd  r   r.   Nc                 C   s   | drdS td)Nr   z#<|audio_bos|><|AUDIO|><|audio_eos|>z Only audio modality is supported)
startswith
ValueError)clsrd  r   r6   r6   r7   get_placeholder_str  s   
z$MiDashengLMModel.get_placeholder_strrs   )rx   vllm_configrx   c             	      s   t    |jj}|j}|| _|| _| |d& t|j|t	|dd| _
t|jj|jj|j|t	|dd| _W d    n1 sAw   Y  | | t||jt	|ddgd| _W d    n1 sdw   Y  | jj| _d S )	Nr   audio_encoder)rw   rx   audio_projector)r  r  r  rw   rx   decoderQwen2ForCausalLM)r  	hf_configrx   architectures)rQ   rR   model_configr  rw   r   _mark_tower_modelr   audio_encoder_configr+   r  r  rL   text_configr   subsample_factorr  _mark_language_modelr*   r  make_empty_intermediate_tensors)rY   r  rx   r   rw   rZ   r6   r7   rR     s:   


	zMiDashengLMModel.__init__kwargsc                 K   sN   | dd }| dd }|d u rd S t|tr!tjjjj|dd}t||dS )Nr  r  T)batch_firstrQ  )	rG  r0   listr^   rV   utilsrnnpad_sequencer  )rY   r  r  r  r6   r6   r7   _parse_and_validate_audio_input  s   

z0MiDashengLMModel._parse_and_validate_audio_inputaudio_input.c                 C   s   |d }|d }|  ||\}}| ||\}}||d j}|j\}}	}
dd | D }tj||jd}tj	|	|jd
d||	|
dk }|| d|
}t|| S )	Nr  r  c                 S   rV  rW  rX  rY  r6   r6   r7   rC    rZ  z9MiDashengLMModel._process_audio_input.<locals>.<listcomp>r   r   r%   r   )r  r  r   r   r   tolistr^   tensorr   r   r   expandr   r   )rY   r  r  r  encoder_outencoder_attsaudio_embeddingsr   r   max_audio_tokensrL   ra  audio_feature_maskmasked_audio_featuresr6   r6   r7   _process_audio_input  s.   z%MiDashengLMModel._process_audio_inputc                 K   s&   | j di |}|d u rg S | |S )Nr6   )r  r  )rY   r  r  r6   r6   r7   embed_multimodal  s   
z!MiDashengLMModel.embed_multimodalrD  	positionsintermediate_tensorsinputs_embedsc                 K   s    |d urd }| j j||||dS )N)r  )r  model)rY   rD  r  r  r  r  r6   r6   r7   ra   !  s   zMiDashengLMModel.forwardhidden_statesc                 C   s   | j |S rj   )r  compute_logits)rY   r  r6   r6   r7   r  3  s   zMiDashengLMModel.compute_logitsweightsc                 C   s   t | }||S rj   )r)   load_weights)rY   r  loaderr6   r6   r7   r  9  s   
zMiDashengLMModel.load_weights)NN)rb   rc   rd   packed_modules_mappingclassmethodr   r5   r  r   rR   rr  r  r  r4   r^   rf   r  r&   r  r!   ra   r  r   setr  rg   r6   r6   rZ   r7   rt    sR    !

 

,rt  )r9   r:   r;   Tr<   )\r  r1   collections.abcr   r   r   r   typingr   r   r   r	   rk  r?  r^   torch.nnrV   torchaudio.functional
functionalr   torch.nn.functionalr
   transformersr   vllm.configr   vllm.config.multimodalr   vllm.distributedr   %vllm.model_executor.layers.activationr   vllm.model_executor.layers.convr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser   r   vllm.multimodal.processingr   r   r   r   r   r    vllm.sequencer!   +vllm.transformers_utils.configs.midashenglmr"   vllm.utils.tensor_schemar#   r$   
interfacesr&   r'   r(   r  r)   r*   r+   r5   r4   r,   r  r8   rC   ModulerD   rh   rr   r   r   r   r   r  r  r  r&  r6  register_processorrt  r6   r6   r6   r7   <module>   s     
(
!B.5o4
e