o
    eiI                     @   s  d dl mZ d dlZddlmZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ e rd dl0Z0d dl0m1Z1 e2e3Z4G dd de&Z5G dd de%Z6G dd de(Z7d0ddZ8G dd  d e*Z9G d!d" d"e1j:Z;G d#d$ d$eZ<G d%d& d&e#Z=G d'd( d(e=Z>G d)d* d*e"Z?ed+d,G d-d. d.e!Z@g d/ZAdS )1    )CallableN   )ACT2FN)
AudioInputmake_list_of_audio)Cache)BatchFeature)GradientCheckpointingLayer)BaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringis_torch_availablelogging)can_return_tuplemerge_with_config_defaults)capture_outputs   )&AudioFlamingo3ForConditionalGeneration!AudioFlamingo3MultiModalProjectorAudioFlamingo3PreTrainedModel)AudioFlamingo3ProcessorAudioFlamingo3ProcessorKwargs)GlmRotaryEmbedding)LlamaAttentioneager_attention_forwardrotate_half   )GlmAsrConfigGlmAsrEncoderConfig)nnc                   @      e Zd ZdS )GlmAsrProcessorKwargsN__name__
__module____qualname__ r)   r)   g/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/glmasr/modular_glmasr.pyr$   1       r$   c                	       sn   e Zd ZdZ				d fdd	ZdddZ	ddeee B eB deee B dB de	e
 d
efddZ  ZS )GlmAsrProcessora  
    Constructs an GlmAsr processor which wraps an GlmAsr feature extractor and an GlmAsr
    tokenizer into a single processor.

    [`GlmAsrProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
    [`Qwen2TokenizerFast`]. See the [`~GlmAsrProcessor.__call__`] for more information.

    Args:
            feature_extractor ([`WhisperFeatureExtractor`]):
                The feature extractor is a required input.
            tokenizer ([`Qwen2TokenizerFast`]):
                The tokenizer is a required input.
            chat_template (`Optional[str]`, *optional*):
                The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
                template will be used.
            audio_token (`Optional[str]`, *optional*, defaults to `"<|pad|>`"):
                Special token used to represent audio inputs in the chat template.
            default_transcription_prompt (`str`, *optional*, defaults to `"Please transcribe this audio into text"`):
                Default prompt to use for transcription tasks when applying transcription requests.
            max_audio_len (`int`, *optional*, defaults to 655):
                Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
                655 gives approximately 8192 tokens, corresponding to the maximum sequence length of the text model.
    N<|pad|>&Please transcribe this audio into text  c                    s   t  j||||||d d S )N)chat_templateaudio_tokendefault_transcription_promptmax_audio_len)super__init__)selffeature_extractor	tokenizerr0   r1   r2   r3   	__class__r)   r*   r5   M   s   	
zGlmAsrProcessor.__init__audio_lengthstorch.Tensorreturnc                 C   sH   d}dD ]\}}}|d|  |d  d | d }q|| | d }|S )N   )r   r   r   )r   r   r   r   r   r)   )r6   r;   merge_factorpaddingkernel_sizestride
num_tokensr)   r)   r*   _get_audio_token_length_   s
   "z'GlmAsrProcessor._get_audio_token_lengthaudiopromptkwargsc           	      K   sP  t |tr	|g}n't |ttfr |r tdd |D r t|}ntt|}t r0dd |D }t|}|dkr<td|du rG| j	g| }nJt |trR|g| }n?t |ttfrt||krltdt| d	| d
g }|D ]}|du r}|
| j	 qpt |tr|
| qptdntddd t||D }| j|fdddd|S )a  
        Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

        Args:
            audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
                the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
            prompt (`str` or `list[str]`, *optional*):
                Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
                each sample uses `"Transcribe the input speech."`.
            **kwargs:
                Additional keyword arguments forwarded to [`~AudioFlamingo3Processor.apply_chat_template`] (for example
                `text_kwargs`, `audio_kwargs`, ...).

        Returns:
            [`BatchFeature`]: Processor outputs ready to be passed to [`AudioFlamingo3ForConditionalGeneration.generate`].

        c                 s   s    | ]}t |tV  qd S N
isinstancestr.0elr)   r)   r*   	<genexpr>   s    z>GlmAsrProcessor.apply_transcription_request.<locals>.<genexpr>c                 S   s,   g | ]}t |tjr|   n|qS r)   )rK   torchTensordetachcpunumpyrM   r)   r)   r*   
<listcomp>   s   , z?GlmAsrProcessor.apply_transcription_request.<locals>.<listcomp>r   z)`audio` must contain at least one sample.Nz	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.c                 S   s@   g | ]\}}d t |trd|dnd|dd|dgdgqS )userrF   )typepath)rX   rF   text)rX   rZ   )rolecontentrJ   )rN   prompt_text
audio_itemr)   r)   r*   rV      s    T)tokenizeadd_generation_promptreturn_dict)rK   rL   listtupleallr   r   len
ValueErrorr2   append	TypeErrorzipapply_chat_template)	r6   rF   rG   rH   audio_items
batch_sizepromptsitemconversationsr)   r)   r*   apply_transcription_requestg   sP   
$


z+GlmAsrProcessor.apply_transcription_request)Nr-   r.   r/   )r;   r<   r=   r<   rI   )r&   r'   r(   __doc__r5   rE   rL   rb   r   r   r$   r   rp   __classcell__r)   r)   r9   r*   r,   4   s$    
r,   c                   @   r#   )GlmAsrRotaryEmbeddingNr%   r)   r)   r)   r*   rs      r+   rs   c                 C   s   | |}| |}|jd }| dd |f | d|d f }}|dd |f |d|d f }	}
|| t||  }|	| t|	|  }tj||gdd}tj||
gdd}||fS )N.)dim)	unsqueezeshaper   rQ   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embedr)   r)   r*   apply_rotary_pos_emb   s   


""r   c                       sf   e Zd Zdedef fddZ	ddejdeejejf dB de	e
 d	eejejf fd
dZ  ZS )GlmAsrAttentionconfig	layer_idxc                    s   t  || d| _tj|j|j| j dd| _tj|j|j	| j dd| _
tj|j|j	| j dd| _tj|j| j |jdd| _d S )NFT)bias)r4   r5   	is_causalr"   Linearhidden_sizenum_attention_headshead_dimq_projnum_key_value_headsk_projv_projo_projr6   r   r   r9   r)   r*   r5      s    zGlmAsrAttention.__init__Nhidden_statesposition_embeddingsrH   r=   c                 K   s   |j d d }g |d| jR }| ||dd}| ||dd}| ||dd}|\}	}
t|||	|
\}}t	| j
jt}|| |||fd | jsVdn| j| jd|\}}|jg |dR   }| |}||fS )Nrt   r   r   g        )attention_maskdropoutscaling)rw   r   r   view	transposer   r   r   r   get_interfacer   _attn_implementationr   trainingattention_dropoutr   reshape
contiguousr   )r6   r   r   rH   input_shapehidden_shapequery_states
key_statesvalue_statesr{   r|   attention_interfaceattn_outputattn_weightsr)   r)   r*   forward   s2   

zGlmAsrAttention.forwardrI   r&   r'   r(   r    intr5   rQ   rR   rc   r   r   r   rr   r)   r)   r9   r*   r      s    r   c                       s,   e Zd Z fddZdejfddZ  ZS )	GlmAsrMLPc                    s>   t    t|j|j| _t|j|j| _t|j	 | _
d S rI   )r4   r5   r"   r   r   intermediate_sizefc1fc2r   
hidden_actact_fnr6   r   r9   r)   r*   r5      s   
zGlmAsrMLP.__init__r   c                 C   s"   |  |}| |}| |}|S rI   )r   r   r   )r6   r   r)   r)   r*   r     s   


zGlmAsrMLP.forward)r&   r'   r(   r5   rQ   rR   r   rr   r)   r)   r9   r*   r      s    r   c                	       s\   e Zd Zdedef fddZ	ddejdeejejf dB de	e
 d	ejfd
dZ  ZS )GlmAsrEncoderLayerr   r   c                    sJ   t    |j| _t||d| _t|| _t|j| _	t|j| _
d S )N)r   r   )r4   r5   r   r   	self_attnr   mlpr"   	LayerNorminput_layernormpost_attention_layernormr   r9   r)   r*   r5   
  s   

zGlmAsrEncoderLayer.__init__Nr   r   rH   r=   c                 K   sT   |}|  |}| jd||d|\}}|| }|}| |}| |}|| }|S )N)r   r   r)   )r   r   r   r   )r6   r   r   rH   residual_r)   r)   r*   r     s   



zGlmAsrEncoderLayer.forwardrI   r   r)   r)   r9   r*   r   	  s    r   c                   @   r#   )GlmAsrPreTrainedModelNr%   r)   r)   r)   r*   r   ,  r+   r   c                       sb   e Zd ZU eed< dZdZdgZee	dZ
def fddZeeedee fd	d
Z  ZS )GlmAsrEncoderr   input_featuresrF   r   )r   
attentionsc                    s   t    tj j jddd| _tj j jdddd| _t fddt	 j
D | _t j| _t d| _d	| _|   d S )
Nr   r   )rB   rA   r   )rB   rC   rA   c                    s   g | ]}t  |qS r)   )r   )rN   r   r   r)   r*   rV   @  s    z*GlmAsrEncoder.__init__.<locals>.<listcomp>r   F)r4   r5   r"   Conv1dnum_mel_binsr   conv1conv2
ModuleListrangenum_hidden_layerslayersr   normrs   
rotary_embgradient_checkpointing	post_initr   r9   r   r*   r5   :  s   zGlmAsrEncoder.__init__rH   c                 K   s   t j| |}t j| |}|dd}|}| j|tj|j	d |j
dd d d f d}| jD ]}||fd|i|}q3| |}t|dS )Nr   r   device)r}   r   )last_hidden_state)r"   
functionalgelur   r   r   r   rQ   arangerw   r   r   r   r
   )r6   r   rH   inputs_embedsr   r   encoder_layerr)   r)   r*   r   G  s   "


zGlmAsrEncoder.forward)r&   r'   r(   r!   __annotations__main_input_nameinput_modalities_no_split_modulesr   r   _can_record_outputsr5   r   r   r   r   r   r   rr   r)   r)   r9   r*   r   0  s   
  r   c                       s"   e Zd Zdef fddZ  ZS )GlmAsrMultiModalProjectorr   c                    sB   t    t|jj|jjd | _t|jjd |jj| _	d S )Nr   )
r4   r5   r"   r   audio_configr   text_configr   linear_1linear_2r   r9   r)   r*   r5   \  s   
z"GlmAsrMultiModalProjector.__init__)r&   r'   r(   r    r5   rr   r)   r)   r9   r*   r   [  s    r   z~
    The GlmAsr model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Llama language model.
    custom_introc                       s   e Zd Zeedddejdejdee	 de
eB fddZ																					
ddejd	B dejd	B dejd	B dejd	B dejd	B ded	B dejd	B dejd	B ded	B dejd	B deejB dee	 def fddZ  ZS )GlmAsrForConditionalGenerationzgCompute audio embeddings from log-mel input features using the audio encoder and multi-modal projector.r   r   input_features_maskrH   r=   c                 K   s   | j |fddi|}|j}||jd d| jjj}| |}|d}dD ]\}}	}
|d|  |	d  d |
 d }q'd}|| | d }t	j
|jd |jd	d d d f |d d d f k }|||j |_|S )
Nra   Tr   rt   r?   r   r   r>   r   )audio_towerr   r   rw   r   r   r   multi_modal_projectorsumrQ   r   r   topooler_output)r6   r   r   rH   audio_outputsaudio_hidden_statesaudio_embedsr;   rA   rB   rC   r@   post_lengths
valid_maskr)   r)   r*   get_audio_featuresh  s   


"2z1GlmAsrForConditionalGeneration.get_audio_featuresNr   	input_idsr   r}   past_key_valuesr   labels	use_cachecache_positionlogits_to_keepc                    s&   t  jd|||||||	|
|d	|S )a  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import GlmAsrForConditionalGeneration, AutoProcessor

        >>> model_id = "zai-org/GLM-ASR-Nano-2512"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = GlmAsrForConditionalGeneration.from_pretrained(model_id, dtype="auto", device_map="auto")
        >>> inputs = processor.apply_transcription_request("https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3")

        >>> inputs = inputs.to(model.device, dtype=model.dtype)

        >>> outputs = model.generate(**inputs, do_sample=False, max_new_tokens=500)

        >>> decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1] :], skip_special_tokens=True)
        >>> print(decoded_outputs)
        ```)	r   r   r}   r   r   r   r   r   r   Nr)   )r4   r   )r6   r   r   r   r   r}   r   r   r   r   r   r   rH   r9   r)   r*   r     s   +
z&GlmAsrForConditionalGeneration.forward)NNNNNNNNNNr   )r&   r'   r(   r   r   rQ   FloatTensorrR   r   r   rc   r
   r   
LongTensorr   boolr   r   r   rr   r)   r)   r9   r*   r   b  sh    	
r   )r   r   r,   r   )Nr   )Bcollections.abcr   rU   npactivationsr   audio_utilsr   r   cache_utilsr   feature_extraction_utilsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   &audioflamingo3.modeling_audioflamingo3r   r   r   (audioflamingo3.processing_audioflamingo3r   r   glm.modeling_glmr   llama.modeling_llamar   r   r   configuration_glmasrr    r!   rQ   r"   
get_loggerr&   loggerr$   r,   rs   r   r   Moduler   r   r   r   r   r   __all__r)   r)   r)   r*   <module>   sL   
 
-#+V