o
    ei~8                     @   s   d dl Z d dlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZ e r5d dlZeeZG dd	 d	e	d
dZG dd de
ZdgZdS )    N   )
AudioInputmake_list_of_audio)BatchFeature)ProcessingKwargsProcessorMixinUnpack)	TextInput)is_torch_availableloggingc                   @   s,   e Zd Zddidddddddd	d
ZdS )GlmAsrProcessorKwargspaddingTi>  g      >@
max_length)sampling_ratechunk_lengthreturn_attention_maskr   ptleft)return_tensorspadding_side)text_kwargsaudio_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults r   r   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/glmasr/processing_glmasr.pyr   (   s    
r   F)totalc                       s   e Zd ZdZ				d fdd	ZdddZ		d deee B dedB de	dB de
e d
ef
ddZed
ee fddZ	d!deee B eB deee B dB de
e d
efddZddddZded
efddZ  ZS )"GlmAsrProcessora  
    Constructs an GlmAsr processor which wraps an GlmAsr feature extractor and an GlmAsr
    tokenizer into a single processor.

    [`GlmAsrProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
    [`Qwen2TokenizerFast`]. See the [`~GlmAsrProcessor.__call__`] for more information.

    Args:
            feature_extractor ([`WhisperFeatureExtractor`]):
                The feature extractor is a required input.
            tokenizer ([`Qwen2TokenizerFast`]):
                The tokenizer is a required input.
            chat_template (`Optional[str]`, *optional*):
                The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
                template will be used.
            audio_token (`Optional[str]`, *optional*, defaults to `"<|pad|>`"):
                Special token used to represent audio inputs in the chat template.
            default_transcription_prompt (`str`, *optional*, defaults to `"Please transcribe this audio into text"`):
                Default prompt to use for transcription tasks when applying transcription requests.
            max_audio_len (`int`, *optional*, defaults to 655):
                Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
                655 gives approximately 8192 tokens, corresponding to the maximum sequence length of the text model.
    N<|pad|>&Please transcribe this audio into text  c                    s4   || _ ||| _|| _|| _t j|||d d S )N)chat_template)audio_tokenconvert_tokens_to_idsaudio_token_iddefault_transcription_promptmax_audio_lensuper__init__)selffeature_extractor	tokenizerr$   r%   r(   r)   	__class__r   r   r+   S   s
   	zGlmAsrProcessor.__init__audio_lengthstorch.Tensorreturnc                 C   sH   d}dD ]\}}}|d|  |d  d | d }q|| | d }|S )N   ))   r   r5   )r5   r      r6   r5   r   )r,   r1   merge_factorr   kernel_sizestride
num_tokensr   r   r   _get_audio_token_lengthb   s
   "z'GlmAsrProcessor._get_audio_token_lengthFtextaudiooutput_labelskwargsc              
   K   s  | j tfd| jji|}|d }|d }|d}|dkr't| jj dt|t	r0|g}nt|t
tfr@tdd |D sDtd	i }	|d
urt|}t|t|krftdt| dt| dt|d |d  }
t| j|d  }g }g }|D ]Y}t|jd }td||
 d |
 }||krtd||d  dd| j d| j d |}|| t|||
 }t|D ]}||
 }t|d |
 |}||||  qq| j|fi |}	|	d}||	d< tdd t|d|D }| |}t|D ]\}}t !t "| j#| j#| || }|||< q| j|fi |}i ||	}|rJ|d $ }d||| j%k< d||| jj&k< ||d< t'||dS ) a=  
        Main method to prepare one or several text sequence(s) and audio waveform(s) for the model. This
        method expands `<sound>` placeholders in the text based on the post-pool frame counts of the
        audio windows, then tokenizes the provided strings as-is, and extracts log-mel features
        with [`WhisperFeatureExtractor`]. If `audio` is `None`, no audio processing is performed and
        the text is tokenized as-is (LM-only behavior).

        Args:
            text (`str` or `list[str]`):
                Input sequence or batch of sequences.
            audio (`np.ndarray` or `list[np.ndarray]`):
                Input audio or batch of audios as NumPy arrays. If provided, there must be as many `text` inputs as
                `audio` inputs.
            output_labels (bool, *optional*, default=False):
                Whether to return labels for training.

        Returns:
            [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
            audio features (`input_features`, `input_features_mask`).
        tokenizer_init_kwargsr   r   r   r   z% only supports `return_tensors='pt'`.c                 s       | ]}t |tV  qd S N
isinstancestr).0tr   r   r   	<genexpr>       z+GlmAsrProcessor.__call__.<locals>.<genexpr>zAInvalid input text. Please provide a string, or a list of stringsNzGot z
 text but z audios; they must match 1:1.r   r   r   r5   zAudio duration (z.1fzs) exceeds zs; truncating to first zs.attention_maskinput_features_maskc                 S   s   g | ]}|  qS r   )sum)rF   sr   r   r   
<listcomp>   s    z,GlmAsrProcessor.__call__.<locals>.<listcomp>	input_idsilabels)datatensor_type)(_merge_kwargsr   r.   init_kwargsget
ValueErrorr0   r   rD   rE   listtupleallr   lenintr)   shapemaxloggerwarningappendminranger-   poptorchstacksplitrL   r;   	enumerateresubescaper%   cloner'   pad_token_idr   )r,   r<   r=   r>   r?   call_kwargsr   r   r   audio_inputswindow_sizemax_windowsper_sample_windowsflat_chunksaudio_el	n_samplesn_wintime_capistartendpadding_maskr1   audio_tokens_lengthsaudio_lengthexpandedtext_inputsrR   rQ   r   r   r   __call__j   sp   

 
$

"
 zGlmAsrProcessor.__call__c                 C   s(   | j j}| jj}tt|| dg S )NrK   )r.   model_input_namesr-   rX   dictfromkeys)r,   	tok_names	fea_namesr   r   r   r      s   z!GlmAsrProcessor.model_input_namespromptc           	      K   sP  t |tr	|g}n't |ttfr |r tdd |D r t|}ntt|}t r0dd |D }t|}|dkr<td|du rG| j	g| }nJt |trR|g| }n?t |ttfrt||krltdt| d	| d
g }|D ]}|du r}|
| j	 qpt |tr|
| qptdntddd t||D }| j|fdddd|S )a  
        Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

        Args:
            audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
                the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
            prompt (`str` or `list[str]`, *optional*):
                Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
                each sample uses `"Transcribe the input speech."`.
            **kwargs:
                Additional keyword arguments forwarded to [`~AudioFlamingo3Processor.apply_chat_template`] (for example
                `text_kwargs`, `audio_kwargs`, ...).

        Returns:
            [`BatchFeature`]: Processor outputs ready to be passed to [`AudioFlamingo3ForConditionalGeneration.generate`].

        c                 s   rA   rB   rC   rF   elr   r   r   rH      rI   z>GlmAsrProcessor.apply_transcription_request.<locals>.<genexpr>c                 S   s,   g | ]}t |tjr|   n|qS r   )rD   re   Tensordetachcpunumpyr   r   r   r   rN      s   , z?GlmAsrProcessor.apply_transcription_request.<locals>.<listcomp>r   z)`audio` must contain at least one sample.Nz	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.c                 S   s@   g | ]\}}d t |trd|dnd|dd|dgdgqS )userr=   )typepath)r   r=   r<   )r   r<   )rolecontentrC   )rF   prompt_text
audio_itemr   r   r   rN     s    T)tokenizeadd_generation_promptreturn_dict)rD   rE   rX   rY   rZ   r   r
   r[   rW   r(   ra   	TypeErrorzipapply_chat_template)	r,   r=   r   r?   audio_items
batch_sizepromptsitemconversationsr   r   r   apply_transcription_request   sP   
$


z+GlmAsrProcessor.apply_transcription_request)strip_prefixc                   s,    j j|i |}|r fdd|D }|S )ap  
        Forward arguments to [`~PreTrainedTokenizer.batch_decode`] and optionally remove the assistant framing the model
        was trained to produce.

        AF3 transcription requests respond with sentences such as `"The spoken content of the audio is "..."."`.
        Setting `strip_prefix=True` trims the fixed prefix for just the transcription text.
        c                    s   g | ]}  |qS r   )"_strip_assistant_prefix_and_quotes)rF   r<   r,   r   r   rN   0  s    z0GlmAsrProcessor.batch_decode.<locals>.<listcomp>)r.   batch_decode)r,   r   argsr?   decodedr   r   r   r   &  s   zGlmAsrProcessor.batch_decodec                 C   s   |  }dD ]}||r|t|d   } nq|dr'|dd   }t|dkrC|d |d krC|d dv rC|dd   }|S )	zi
        Remove the assistant prefix and surrounding quotes from a decoded transcription string.
        )z"The spoken content of the audio isz!The transcription of the audio isN.rO   r6   r   >   "'r5   )strip
startswithr[   endswith)r,   r<   strippedprefixr   r   r   r   3  s   

(z2GlmAsrProcessor._strip_assistant_prefix_and_quotes)Nr!   r"   r#   )r1   r2   r3   r2   )NFrB   )r   r   r   __doc__r+   r;   r	   rX   r   boolr   r   r   r   propertyrE   r   r   r   r   __classcell__r   r   r/   r   r    :   sF    


e
Qr    )ri   r   npaudio_utilsr   r   feature_extraction_utilsr   processing_utilsr   r   r   tokenization_utils_baser	   utilsr
   r   re   
get_loggerr   r_   r   r    __all__r   r   r   r   <module>   s   
  
