o
    -i                  	   @   s2  U d dl mZmZmZ d dlmZmZmZmZm	Z	 d dl
Zd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3 d dl4m5Z5m6Z6m7Z7m8Z8m9Z9 d dl:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ d dlAmBZB d dlCmDZD d dlEmFZF d dlGmHZHmIZI ddlJmKZKmLZLmMZMmNZNmOZOmPZPmQZQ ddlRmSZSmTZTmUZUmVZVmWZW ddlXmYZYmZZZm[Z[ ddl\m]Z] G dd dej^Z_G d d! d!ej^Z`G d"d# d#ej^ZaG d$d% d%ej^ZbG d&d' d'ZcG d(d) d)ej^ZdG d*d+ d+eHZeG d,d- d-eHZfeeefB Zgeehd.< G d/d0 d0ej^ZiG d1d2 d2e=ZjG d3d4 d4e;ej Zkd5eelejmf d6enele2f fd7d8ZoG d9d: d:e9ZpG d;d< d<e<d2 Zqe/jreqejekd=G d>d? d?ej^eUeVeTeWZsdS )@    )IterableMappingSequence)	AnnotatedAnyLiteral	TypeAliascastN)BatchFeature)GlmAsrConfigGlmAsrProcessor)WhisperFeatureExtractor)ModelConfigSpeechToTextConfig
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)
PromptType)
get_act_fn)MMEncoderAttention)ColumnParallelLinearQKVParallelLinearRowParallelLinear)QuantizationConfig)ApplyRotaryEmb)MultiModelKeys)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)DictEmbeddingItemsModalityDataModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseMultiModalProcessorBaseProcessingInfoPromptReplacementPromptUpdatePromptUpdateDetails)IntermediateTensors)cached_tokenizer_from_config)cached_processor_from_config)TensorSchemaTensorShape   )DEFAULT_CONV_PARAMSDEFAULT_MAX_AUDIO_LEN_SDEFAULT_MERGE_FACTOR!_flatten_audio_features_by_length#_get_audio_output_lengths_for_tower_group_audio_embeddings_normalize_chunk_counts)MultiModalEmbeddingsSupportsLoRASupportsMultiModal
SupportsPPSupportsTranscription)AutoWeightsLoaderinit_vllm_registered_modelmaybe_prefix)ISO639_1_SUPPORTED_LANGSc                       s6   e Zd ZdZd	 fddZdedejfddZ  Z	S )
GlmAsrEncoderRotaryEmbeddinga$  
    Rotary Position Embedding for GLM-ASR encoder.

    Computes rotary position embeddings on-demand for efficiency.
    Only caches inv_freq as a buffer; cos/sin are computed during forward
    to avoid wasted computation during initialization and ensure correct
    device placement.
    returnNc                    s   t    t|d|j|j }t|dr4|jr4|jdd}|jdd}t|| }|jdd| _	nt|dd}|}d| _	|| _
|| _d|tjd|d	tjd
|   }| jd|dd d S )Nhead_dimrope_parameters
rope_thetag     @partial_rotary_factorg      ?attention_scalingr      dtypeinv_freqF)
persistent)super__init__getattrhidden_sizenum_attention_headshasattrrD   getintrG   dimrC   torcharangefloatregister_buffer)selfconfigrC   baserF   rU   rK   	__class__ ^/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/glmasr.pyrN   T   s(   
 z%GlmAsrEncoderRotaryEmbedding.__init__seq_lenc                 C   s0   t j|| jj| jjd}t || j}|| j S )a9  
        Compute rotary position frequencies for given sequence length.

        Args:
            seq_len: The sequence length to compute embeddings for.

        Returns:
            Frequency tensor with shape [seq_len, dim/2]. Use .cos() and
            .sin() to get the rotary embedding components.
        )devicerJ   )rV   rW   rK   rb   rJ   outerrG   )rZ   ra   seqfreqsr_   r_   r`   forwardr   s
   
z$GlmAsrEncoderRotaryEmbedding.forward)rB   N)
__name__
__module____qualname____doc__rN   rT   rV   Tensorrf   __classcell__r_   r_   r]   r`   rA   J   s    	rA   c                       V   e Zd ZdZ		ddedB def fddZdejd	ejd
ejdejfddZ	  Z
S )GlmAsrEncoderAttentiona$  
    Optimized Multi-headed Grouped Query Attention for GLM-ASR encoder.

    Uses vLLM's QKVParallelLinear for fused projections, ApplyRotaryEmb for
    rotary position embeddings, and MMEncoderAttention for hardware-optimized
    attention computation with automatic backend selection.
    N quant_configprefixc              	      s  t    || _|j| _|j| _t|d|j| _| j| j | _t	 | _
| j| j
 | _td| j| j
 | _t| j| j| j| jd|| dd| _t| j| jd|| dd| _t|dd }|re|dd	}nt|dd	}t| j| | _tdd
| _t| j| j| jd | j| dd| _d S )Nnum_key_value_headsr0   Tz	.qkv_projbiasrp   rq   z.o_projrD   rF   g      ?)enforce_enableg      z.attn)	num_heads	head_sizescalenum_kv_headsrq   )rM   rN   r[   rP   rQ   rv   rO   ry   rC   r   tp_sizenum_heads_per_rankmaxnum_kv_heads_per_rankr   qkv_projr   o_projrS   rT   
rotary_dimr   apply_rotary_embr   attn)rZ   r[   rp   rq   rope_paramsrF   r]   r_   r`   rN      sP   


zGlmAsrEncoderAttention.__init__hidden_statesrotary_pos_emb_cosrotary_pos_emb_sinrB   c                 C   s
  |j \}}}| |\}}| j| j }| j| j }	|j||	|	gdd\}
}}|
||| j| j}
|||| j| j}|||| j| j}| |
dd| jf |||
dd| jf< | |dd| jf |||dd| jf< | 	|
||}|||d}| 
|\}}|S )@  
        Args:
            hidden_states: [batch_size, seq_len, hidden_size]
            rotary_pos_emb_cos: [seq_len, rotary_dim/2] - cosine of rotary embeddings
            rotary_pos_emb_sin: [seq_len, rotary_dim/2] - sine of rotary embeddings

        Returns:
            [batch_size, seq_len, hidden_size]
        rU   .N)shaper~   r{   rC   r}   splitviewr   r   r   r   )rZ   r   r   r   
batch_sizera   _qkvq_sizekv_sizeqkvattn_outputoutputr_   r_   r`   rf      s$   zGlmAsrEncoderAttention.forwardNro   rg   rh   ri   rj   r   strrN   rV   rk   rf   rl   r_   r_   r]   r`   rn      s$    <rn   c                       sJ   e Zd ZdZ		ddedB def fddZdejd	ejfd
dZ	  Z
S )GlmAsrEncoderMLPzk
    Optimized MLP for GLM-ASR encoder.
    Uses vLLM's parallel linear layers for better performance.
    Nro   rp   rq   c                    sl   t    || _|j| _|j| _t| j| jd|| dd| _t|j| _	t
| j| jd|| dd| _d S )NTz.fc1rs   z.fc2)rM   rN   r[   rP   intermediate_sizer   fc1r   
hidden_actact_fnr   fc2rZ   r[   rp   rq   r]   r_   r`   rN     s&   
zGlmAsrEncoderMLP.__init__r   rB   c                 C   s*   |  |\}}| |}| |\}}|S N)r   r   r   )rZ   r   r   r_   r_   r`   rf   !     
zGlmAsrEncoderMLP.forwardr   r   r_   r_   r]   r`   r      s    r   c                       rm   )GlmAsrEncoderLayerz
    Optimized Transformer encoder layer for GLM-ASR.
    Combines attention and MLP with residual connections and layer norms.
    Nro   rp   rq   c                    sr   t    |j| _t||| dd| _t||| dd| _t|dd}tj	| j|d| _
tj	| j|d| _d S )Nz
.self_attnrp   rq   z.mlplayer_norm_epsh㈵>eps)rM   rN   rP   rn   	self_attnr   mlprO   nn	LayerNorminput_layernormpost_attention_layernormrZ   r[   rp   rq   r   r]   r_   r`   rN   .  s"   
zGlmAsrEncoderLayer.__init__r   r   r   rB   c                 C   sJ   |}|  |}| j|||d}|| }|}| |}| |}|| }|S )r   )r   r   r   )r   r   r   r   )rZ   r   r   r   residualr_   r_   r`   rf   I  s   


zGlmAsrEncoderLayer.forwardr   r   r_   r_   r]   r`   r   (  s$    r   c                   @   s$   e Zd ZdZdZdejfddZdS )_GlmAsrEncoderOutputa  
    Simple output container compatible with transformers' BaseModelOutput.

    This lightweight container holds the encoder output and is compatible
    with the transformers library's output format while being more efficient
    than a full dataclass.

    Attributes:
        last_hidden_state: Final layer hidden states from the encoder.
            Shape: [batch_size, seq_len, hidden_size]
    last_hidden_stater   c                 C   s
   || _ d S r   r   )rZ   r   r_   r_   r`   rN   z  s   
z_GlmAsrEncoderOutput.__init__N)rg   rh   ri   rj   	__slots__rV   rk   rN   r_   r_   r_   r`   r   k  s    r   c                       s   e Zd ZdZdg diZ		ddedB def fdd	Zd
ej	de
ej	ej	f fddZdej	defddZdee
eej	f  dee fddZ  ZS )GlmAsrEncodera  
    Optimized GLM-ASR Audio Encoder with vLLM native implementation.

    This encoder processes audio features through convolutional layers
    followed by transformer layers with rotary position embeddings.
    Optimized for performance with:
    - QKVParallelLinear for fused attention projections
    - Tensor parallelism support via ColumnParallelLinear/RowParallelLinear
    - Quantization support
    - Flash Attention (SDPA)
    r~   q_projk_projv_projNro   rp   rq   c                    s   t     | _tj j jddd| _tj j jdddd| _t	 fddt
 jD | _t dd	}tj j|d
| _t | _d S )N   r0   )kernel_sizepaddingrH   )r   strider   c                    s$   g | ]}t   d | dqS )z.layers.r   )r   ).0	layer_idxr[   rq   rp   r_   r`   
<listcomp>  s    z*GlmAsrEncoder.__init__.<locals>.<listcomp>r   r   r   )rM   rN   r[   r   Conv1dnum_mel_binsrP   conv1conv2
ModuleListrangenum_hidden_layerslayersrO   r   normrA   
rotary_embr   r]   r   r`   rN     s.   
	zGlmAsrEncoder.__init__input_lengthsrB   c                 C   s0   |d d d d }|d d d d }||fS )z
        Compute the output length after convolutions.

        Args:
            input_lengths: Input sequence lengths [batch_size]

        Returns:
            Tuple of (output after conv1, output after conv2)
        rH   r   r0   r_   )rZ   r   output_lengths_conv1output_lengths_conv2r_   r_   r`    _get_feat_extract_output_lengths  s   z.GlmAsrEncoder._get_feat_extract_output_lengthsinput_featuresc                 C   s   t jj| |}t jj| |}|dd}|jd }| |}|	 j
|jd}| j
|jd}| jD ]}||||}q9| |}t|dS )ag  
        Forward pass through the encoder.

        Args:
            input_features: [batch_size, num_mel_bins, seq_len]

        Returns:
            _GlmAsrEncoderOutput: Object with .last_hidden_state attribute                 containing [batch_size, seq_len', hidden_size] where seq_len'                 is the sequence length after convolutions
        r0   rH   rI   r   )rV   r   
functionalgelur   r   	transposer   r   costorJ   sinr   r   r   )rZ   r   r   output_seq_lenrotary_pos_embr   r   encoder_layerr_   r_   r`   rf     s   




zGlmAsrEncoder.forwardweightsc                 C   s   ddl m} g d}t|  }t }|D ]R\}}|D ](\}}	}
|	|vr%q||	|}|dr5||vr5q|| }|j}||||
  n|drN||vrNq||vrSq|| }t|d|}||| |	| q|S )zICustom weight loading to handle q_proj/k_proj/v_proj -> qkv_proj mapping.r   )default_weight_loader))r~   r   r   )r~   r   r   )r~   r   r   z.biasweight_loader)
-vllm.model_executor.model_loader.weight_utilsr   dictnamed_parameterssetreplaceendswithr   rO   add)rZ   r   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   r_   r_   r`   load_weights  s0   
zGlmAsrEncoder.load_weightsr   )rg   rh   ri   rj   packed_modules_mappingr   r   rN   rV   rk   tupler   r   rf   r   r   r   rl   r_   r_   r]   r`   r   ~  s$    +
,%r   c                   @   s   e Zd ZU dZed ed< eeje	ej B e
ddddhdf ed< eeje	ej B e
dddhdf ed	< eeje	ej B e
d
f ed< dS )GlmAsrFeatureInputsz
    Dimensions:
        - num_chunks: Number of audio chunks (flattened)
        - nmb: Number of mel bins
        - num_audios: Number of original audio files
    audio_featurestype
num_chunksnmbchunk_lengthdynamic_dimsr   feature_attention_mask
num_audioschunk_countsN)rg   rh   ri   rj   r   __annotations__r   rV   rk   listr/   r_   r_   r_   r`   r     s$   
 r   c                   @   sF   e Zd ZU dZdZed ed< eee	j
 eddddhdf ed< dS )	GlmAsrEmbeddingInputsz
    Dimensions:
        - bn: Batch size
        - naf: Number of audio features
        - hs: Hidden size (must match the hidden size of language model
          backbone)
    audio_embedsr   bnnafhsr   N)rg   rh   ri   rj   r   r   r   r   r   rV   rk   r/   r_   r_   r_   r`   r   2  s   
 r   GlmAsrInputsc                       sN   e Zd ZdZ		ddededB def fddZd	ej	d
ej	fddZ
  ZS )GlmAsrMultiModalProjectora  
    Projects audio encoder outputs to language model hidden space.

    This projector uses a two-layer MLP to map audio features from the
    encoder's intermediate size to the language model's hidden size.
    Uses vLLM's parallel linear layers for tensor parallelism support.

    Architecture:
        - Linear layer: intermediate_size -> hidden_size * 2
        - Activation function (e.g., GELU)
        - Linear layer: hidden_size * 2 -> hidden_size
    Nro   r[   rp   rq   c                    sb   t    t|jj|jjd || dd| _t|j	| _
t|jjd |jj|| dd| _d S )NrH   z	.linear_1)
input_sizeoutput_sizerp   rq   z	.linear_2)rM   rN   r   audio_configr   text_configrP   linear_1r   projector_hidden_actactr   linear_2r   r]   r_   r`   rN   S  s   


z"GlmAsrMultiModalProjector.__init__r   rB   c                 C   s*   |  |\}}| |}| |\}}|S r   )r  r
  r  )rZ   r   r   r   r_   r_   r`   rf   h  r   z!GlmAsrMultiModalProjector.forwardr   )rg   rh   ri   rj   r   r   r   rN   rV   rk   rf   rl   r_   r_   r]   r`   r  E  s    r  c                   @   s\   e Zd ZdZdefddZdedefddZdede	fdd	Z
deeed
B f fddZd
S )GlmAsrProcessingInfoz
    Processing information provider for GLM-ASR model.

    Provides access to model configuration, processor, and feature extractor
    needed for audio preprocessing and multimodal integration.
    rB   c                 C   s   | j tS r   )ctxget_hf_configr   rZ   r_   r_   r`   r  w  s   z"GlmAsrProcessingInfo.get_hf_configkwargsc                 K   s   | j jtfi |S r   )r  get_hf_processorr   rZ   r  r_   r_   r`   r  z  s   z%GlmAsrProcessingInfo.get_hf_processorc                 K   s   | j di |jS Nr_   )r  feature_extractorr  r_   r_   r`   get_feature_extractor}  s   z*GlmAsrProcessingInfo.get_feature_extractorNc                 C   s   dd iS )Naudior_   r  r_   r_   r`   get_supported_mm_limits  s   z,GlmAsrProcessingInfo.get_supported_mm_limits)rg   rh   ri   rj   r   r  objectr   r  r   r  r   r   rT   r  r_   r_   r_   r`   r  o  s    r  c                	   @   s\   e Zd ZdZdeeef defddZ	ddedeeef deeef dB de	fd	d
Z
dS )GlmAsrDummyInputsBuilderz
    Builder for dummy inputs used in profiling and testing.

    Generates dummy text prompts and audio data that match the expected
    format for GLM-ASR model inputs. Used for memory profiling and
    performance benchmarking.
    	mm_countsrB   c                 C   s    | dd}| j }|j| S )Nr  r   )rS   infor  audio_token)rZ   r  r   hf_processorr_   r_   r`   get_dummy_text  s   

z'GlmAsrDummyInputsBuilder.get_dummy_textNra   
mm_optionsc           
      C   s`   | j  }|j}|dd}|r|dnd }t| j  dt}t|| }	d| j|	||diS )Nr  r   max_audio_len)lengthr   	overrides)	r  r  sampling_raterS   rO   r  r2   rT   _get_dummy_audios)
rZ   ra   r  r  r  r#  r   audio_overridesr   	audio_lenr_   r_   r`   get_dummy_mm_data  s   
z*GlmAsrDummyInputsBuilder.get_dummy_mm_datar   )rg   rh   ri   rj   r   r   rT   r  r   r   r'  r_   r_   r_   r`   r    s    	
r  	hf_inputsrB   c                 C   sn   |  d}|dur#ttdtjd|ddtjd|ddtddS ttdtdtdtddS )a  
    Configure multimodal field batching strategy for GLM-ASR.

    Determines how to batch audio inputs based on whether chunking is used.
    When chunk_counts is present, features are flattened across chunks;
    otherwise, they are batched normally.

    Args:
        hf_inputs: Dictionary of preprocessed inputs from HuggingFace processor.

    Returns:
        Dictionary mapping field names to MultiModalFieldConfig objects             that specify batching behavior.
    r   Nr  r   r   )r   r   r   r   )rS   r   r   batchedflat_from_sizes)r(  r   r_   r_   r`   _glmasr_field_config  s$   

r+  c                       sH   e Zd ZdZdeeejf ee	 B de
e	e	f dB f fddZ  ZS )GlmAsrMultiModalDataParserz
    Custom parser for GLM-ASR multimodal data.

    Extends the base parser to handle GLM-ASR specific audio data formats,
    including both pre-computed audio embeddings and raw audio features.
    datarB   Nc                    s(   t |trt|ddhtdS t |S )Nr  r   )modalityrequired_fieldsfields_factory)
isinstancer   r    r+  rM   _parse_audio_data)rZ   r-  r]   r_   r`   r2    s   
z,GlmAsrMultiModalDataParser._parse_audio_data)rg   rh   ri   rj   r   r   rV   rk   r!   r   r"   r2  rl   r_   r_   r]   r`   r,    s    r,  c                
       s   e Zd ZdZdefddZdee dede	dee
 fdd	Zd
edeeef deeef deeef def
 fddZdedeeef deeef fddZdedeeef dedee fddZ  ZS )GlmAsrMultiModalProcessorz
    GLM-ASR processor that inherits directly from BaseMultiModalProcessor
    for better performance and cleaner implementation.
    rB   c                 C   s   | j  }t|jdS )N)	target_sr)r  r  r,  r#  )rZ   r  r_   r_   r`   _get_data_parser  s   
z*GlmAsrMultiModalProcessor._get_data_parser
audio_listr  	processorc                 C   s   |j }|j}t|dt}t|| }t|| }g }	|D ]#}
t|
tr't|
n|
jd }t	d|| d | }|	
t|| q|	S )Nr   r   r0   )r#  r   rO   r2   rT   r1  r   lenr   r|   appendmin)rZ   r6  r  r7  r#  r   r   window_sizemax_windowsr   r  	n_samplesn_chunksr_   r_   r`   _calculate_chunk_counts  s   z1GlmAsrMultiModalProcessor._calculate_chunk_countspromptmm_data	mm_kwargs
tok_kwargsc                    sF  d|v r| d|d< |dg }|rt|ts|gn|}|s6| j |}| |}tt	|gdddS | jj
di |}t	di |d|ji}t j||||d}	d|	v ra|	 d|	d	< n%d	|	vrd
|	v r|	d
 }
t|
tjrtj|
jd |
jd tjd}||	d	< | jjdi |}| ||j|}tj|tjd|	d< |	S )Naudiosr  )	input_idspt)tensor_typer#  )r@  rA  rB  rC  input_feature_maskr   r   r   r   rI   r   r_   )poprS   r1  r   r  get_tokenizerencode_apply_hf_processor_tokens_onlyr
   r   r  r#  rM   _call_hf_processorrV   rk   onesr   longr  r?  r  tensor)rZ   r@  rA  rB  rC  r  r6  
prompt_idsr  outputsr   maskr7  r   r]   r_   r`   rM    sJ   
	z,GlmAsrMultiModalProcessor._call_hf_processorr(  hf_processor_mm_kwargsc                 C   s   t |S r   )r+  )rZ   r(  rT  r_   r_   r`   _get_mm_fields_config?  s   z/GlmAsrMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s  | j jdi |}| j  }| }| j  }t|dd}||d u r)|jt|dt}	t|dt	}
|
 d}d}g  |d urddlm}m} |d urd	}||D ])}|| }||| }t|trqt|}|||	|
} t|   |}q[n/tt|D ](}|||d  }t|trt|d	}|||	|
} t|   qd
tf fdd}td||dgS )Nr  <|pad|>merge_factorconv_paramsr   r   r0   )_as_list_chunk_counts#_get_audio_output_lengths_from_maskr   item_idxc                    sh    r |  }n d}|d ur||  }|jd }ntd|dkr&tdgt| }tj|dS )Nr   r   z>Either feature_attention_mask or audio_embeds must be providedzAudio is too short)embed_token_id)rS   r   
ValueErrorrT   r*   select_token_id)r]  num_featuresr   embedaudio_tokensaudio_output_lengthsaudio_token_idout_mm_datar_   r`   get_replacement_glmasr}  s    

zMGlmAsrMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_glmasrr  )r.  targetreplacementr_   )r  r  rJ  	get_vocabr  rO   rS   rf  r3   r1   get_dataglmasr_utilsr[  r\  r1  r   rV   stackr9  rT   sumitemr   r8  rP  	unsqueezer(   )rZ   rV  rT  rW  r7  	tokenizervocabr[   r  rY  rZ  r   r   r[  r\  	start_idxcountend_idxrS  lengthsidxrh  r_   rd  r`   _get_prompt_updatesF  sX   







z-GlmAsrMultiModalProcessor._get_prompt_updates)rg   rh   ri   rj   r$   r5  r   r   r   r   rT   r?  r   r   r  r   r
   rM  r   rU  r#   r   r   r)   ry  rl   r_   r_   r]   r`   r3    sN    



>



r3  )r  dummy_inputsc                       s  e Zd ZeZg dddgdZdddedef fd	d
Ze	dede
dedB fddZdefddZdededB fddZdedejeejdf B fddZdedefddZ		d6dejdejdedB dejdB dedejeB fd d!Zd"ejdejdB fd#d$Zd%eeeejf  dee fd&d'Ze	d(edefd)d*Ze	d(ed+edefd,d-Z e	d.e!j"d(ed/ed0edB d+e#d1 d2ed3edB de$fd4d5Z%  Z&S )7GlmAsrForConditionalGenerationr   	gate_projup_proj)r~   gate_up_projro   )rq   vllm_configrq   c                   s   t    |jj}|j}|jj}|| _|| _|| _| |d t|j	|t
|dd| _t||t
|dd| _W d    n1 sAw   Y  | | t||jt
|ddgd| _W d    n1 sdw   Y  | jj| _d S )Nr  audio_towerr   multi_modal_projectorlanguage_modelLlamaForCausalLM)r  	hf_configrq   architectures)rM   rN   model_configr  rp   multimodal_configr[   _mark_tower_modelr   r  r?   r  r  r  _mark_language_modelr>   r  r  make_empty_intermediate_tensors)rZ   r  rq   r[   rp   r  r]   r_   r`   rN     s:   


	z'GlmAsrForConditionalGeneration.__init__r.  irB   Nc                 C   s   | drdS td)Nr  z)<|begin_of_audio|><|pad|><|end_of_audio|>z Only audio modality is supported)
startswithr_  )clsr.  r  r_   r_   r`   get_placeholder_str  s   
z2GlmAsrForConditionalGeneration.get_placeholder_strc                 C   s   t jddddS )Nzlanguage_model.zmulti_modal_projector.zaudio_tower.)r  	connectortower_model)r   from_string_fieldr  r_   r_   r`   get_mm_mapping  s
   z-GlmAsrForConditionalGeneration.get_mm_mappingr  c                 K   sX   | dd }|d urtd|dS | dd }|d u rd S td|| dd | dd dS )Nr   )r   r   r   r   r   r   )r   r   r   r   )rI  r   r   )rZ   r  r   r   r_   r_   r`   _parse_and_validate_audio_input  s   

z>GlmAsrForConditionalGeneration._parse_and_validate_audio_inputaudio_input.c                 C   sP  |d dkrt |d S |d }|d }t|tr'tj|dd}tj|dd}|jd }t|d|d}|j| j	j
jjd	}| 	|j}| jjj}| jjj}|| }	|jd
 }
|
|	 |	 }||
k ro|d d d |d d f }||d|}| |}t| jdt}t| jdt}t| j	|d||}t||}t||  }t||S )Nr   r   r   r   r   r   r   )r   rI   r0   r   rY  rZ  )r   r1  r   rV   catr   r7   rS   r   r  r   weightrJ   r   r[   r  rP   r   reshaper  rO   r3   r1   r5   ro  r4   r   flattentolistr6   )rZ   r  r   r   r   r   audio_hidden_statesrP   r   merge_ratiora   seq_len_truncatedr   rY  rZ  re  masked_audio_featureschunk_embeddingsr_   r_   r`   _process_audio_input  sR   







z3GlmAsrForConditionalGeneration._process_audio_inputc                 K   s*   | j di |}|d u rg S | |}|S r  )r  r  )rZ   r  r  r  r_   r_   r`   embed_multimodal+  s
   
z/GlmAsrForConditionalGeneration.embed_multimodalrE  	positionsintermediate_tensorsinputs_embedsc                 K   s$   |d urd }| j j||||d}|S )N)r  )r  model)rZ   rE  r  r  r  r  r   r_   r_   r`   rf   4  s   z&GlmAsrForConditionalGeneration.forwardr   c                 C   s   | j |S r   )r  compute_logits)rZ   r   r_   r_   r`   r  G  s   z-GlmAsrForConditionalGeneration.compute_logitsr   c                 C   s   dg}t | |d}||S )Nzaudio_tower.embed_positions)skip_prefixes)r=   r   )rZ   r   r  loaderr_   r_   r`   r   M  s   
z+GlmAsrForConditionalGeneration.load_weightsr  c                 C   s   t |}t|ddS )znGet the audio token from processor.

        Similar to get_placeholder_str but returns single token.
        r  rX  )r-   rO   )r  r  r7  r_   r_   r`   _get_audio_tokenR  s   z/GlmAsrForConditionalGeneration._get_audio_token	task_typec                 C   s(   t |}|j}t|dt}t||jdS )Nr   )max_audio_clip_ssample_rate)r-   r  rO   r2   r   r#  )r  r  r  r7  r  r  r_   r_   r`   get_speech_to_text_config[  s   z8GlmAsrForConditionalGeneration.get_speech_to_text_configr  
stt_configlanguage)
transcribe	translaterequest_promptto_languagec                 C   s   t |}| |}	|dkr| j||}
|	 d|
 }n|dkr&|	 d}ntd| d|dg}|j|dd	d
}||}|d|id}tt|S )z@Get the generation prompt to be used for transcription requests.r  ztranslate the speech to r  z4can you transcribe the speech into a written format?zUnsupported task type user)rolecontentFT)tokenizeadd_generation_promptr  )prompt_token_idsmulti_modal_data)	r,   r  supported_languagesrS   r_  apply_chat_templaterK  r	   r   )r  r  r  r  r  r  r  r  rr  r  full_lang_name_touser_contentmessagesr@  r  prompt_dictr_   r_   r`   get_generation_promptg  s$   


z4GlmAsrForConditionalGeneration.get_generation_prompt)NN)'rg   rh   ri   r@   r  r   r   r   rN   classmethodrT   r  r   r  r  r  r  rV   rk   r   r  r8   r  r+   rf   r  r   r   r   r   r  r   r  npndarrayr   r   r  rl   r_   r_   r]   r`   r{    s    !
@

$	r{  )tcollections.abcr   r   r   typingr   r   r   r   r	   numpyr  rV   torch.nnr   transformersr
   transformers.models.glmasrr   r   transformers.models.whisperr   vllm.configr   r   r   vllm.config.multimodalr   vllm.distributed.parallel_stater   vllm.inputs.datar   %vllm.model_executor.layers.activationr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr   2vllm.model_executor.layers.rotary_embedding.commonr   )vllm.model_executor.models.module_mappingr   vllm.multimodalr   vllm.multimodal.inputsr   r   r   vllm.multimodal.parser    r!   r"   r#   r$   vllm.multimodal.processingr%   r&   r'   r(   r)   r*   vllm.sequencer+   vllm.tokenizersr,   !vllm.transformers_utils.processorr-   vllm.utils.tensor_schemar.   r/   rm  r1   r2   r3   r4   r5   r6   r7   
interfacesr8   r9   r:   r;   r<   utilsr=   r>   r?   whisperr@   ModulerA   rn   r   r   r   r   r   r   r  r   r  r  r  r   rk   r   r+  r,  r3  register_processorr{  r_   r_   r_   r`   <module>   sv    $	;y*C *%

% :

