o
    ei4                     @   s   d dl Z d dlZddlmZmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZ e r5d dlZeeZG dd	 d	e	d
dZG dd de
ZdgZdS )    N   )
AudioInputmake_list_of_audio)BatchFeature)ProcessingKwargsProcessorMixinUnpack)	TextInput)is_torch_availableloggingc                   @   s,   e Zd Zddidddddddd	d
ZdS )AudioFlamingo3ProcessorKwargspaddingTi>  g      >@
max_length)sampling_ratechunk_lengthreturn_attention_maskr   ptleft)return_tensorspadding_side)text_kwargsaudio_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults r   r   z/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/audioflamingo3/processing_audioflamingo3.pyr   "   s    
r   F)totalc                       s   e Zd ZdZ				d fdd	ZdddZ		d deee B dedB de	dB de
e d
ef
ddZed
ee fddZ	d!deee B eB deee B dB de
e d
efddZddddZded
efddZ  ZS )"AudioFlamingo3Processora:  
    Constructs an AudioFlamingo3 processor which wraps an AudioFlamingo3 feature extractor and an AudioFlamingo3
    tokenizer into a single processor.

    [`AudioFlamingo3Processor`] offers all the functionalities of [`WhisperFeatureExtractor`] and
    [`Qwen2TokenizerFast`]. See the [`~AudioFlamingo3Processor.__call__`] for more information.

    Args:
            feature_extractor ([`WhisperFeatureExtractor`]):
                The feature extractor is a required input.
            tokenizer ([`Qwen2TokenizerFast`]):
                The tokenizer is a required input.
            chat_template (`Optional[str]`, *optional*):
                The Jinja template to use for formatting the conversation. If not provided, the tokenizer's default chat
                template will be used.
            audio_token (`Optional[str]`, *optional*, defaults to `"<sound>"`):
                Special token used to represent audio inputs in the chat template.
            default_transcription_prompt (`str`, *optional*, defaults to `"Transcribe the input speech."`):
                Default prompt to use for transcription tasks when applying transcription requests.
            max_audio_len (`int`, *optional*, defaults to 600):
                Maximum length of audio sequences in seconds. Audio longer than this will be truncated.
    N<sound>Transcribe the input speech.X  c                    s4   || _ ||| _|| _|| _t j|||d d S )N)chat_template)audio_tokenconvert_tokens_to_idsaudio_token_iddefault_transcription_promptmax_audio_lensuper__init__)selffeature_extractor	tokenizerr$   r%   r(   r)   	__class__r   r   r+   L   s
   	z AudioFlamingo3Processor.__init__audio_lengthstorch.Tensorreturnc                 C   s$   |d d d }|d d d }|S )N      r   )r,   r1   conv_output_lengthsaudio_tokens_lengthsr   r   r   _get_audio_token_length[   s   z/AudioFlamingo3Processor._get_audio_token_lengthFtextaudiooutput_labelskwargsc              
   K   s  | j tfd| jji|}|d }|d }|d}|dkr't| jj dt|t	r0|g}nt|t
tfr@tdd |D sDtd	i }	|d
urt|}t|t|krftdt| dt| dt|d |d  }
t| j|d  }g }g }|D ]Y}t|jd }td||
 d |
 }||krtd||d  dd| j d| j d |}|| t|||
 }t|D ]}||
 }t|d |
 |}||||  qq| j|fi |}	|	d}||	d< tdd t|d|D }| |}t|D ]\}}t !t "| j#| j#| || }|||< q| j|fi |}i ||	}|rJ|d $ }d||| j%k< d||| jj&k< ||d< t'||dS ) a=  
        Main method to prepare one or several text sequence(s) and audio waveform(s) for the model. This
        method expands `<sound>` placeholders in the text based on the post-pool frame counts of the
        audio windows, then tokenizes the provided strings as-is, and extracts log-mel features
        with [`WhisperFeatureExtractor`]. If `audio` is `None`, no audio processing is performed and
        the text is tokenized as-is (LM-only behavior).

        Args:
            text (`str` or `list[str]`):
                Input sequence or batch of sequences.
            audio (`np.ndarray` or `list[np.ndarray]`):
                Input audio or batch of audios as NumPy arrays. If provided, there must be as many `text` inputs as
                `audio` inputs.
            output_labels (bool, *optional*, default=False):
                Whether to return labels for training.

        Returns:
            [`BatchFeature`]: A dictionary with tokenized text (`input_ids`, `attention_mask`) and
            audio features (`input_features`, `input_features_mask`).
        tokenizer_init_kwargsr   r   r   r   z% only supports `return_tensors='pt'`.c                 s       | ]}t |tV  qd S N
isinstancestr).0tr   r   r   	<genexpr>       z3AudioFlamingo3Processor.__call__.<locals>.<genexpr>zAInvalid input text. Please provide a string, or a list of stringsNzGot z
 text but z audios; they must match 1:1.r   r   r   r4   zAudio duration (z.1fzs) exceeds zs; truncating to first zs.attention_maskinput_features_maskc                 S   s   g | ]}|  qS r   )sum)rC   sr   r   r   
<listcomp>   s    z4AudioFlamingo3Processor.__call__.<locals>.<listcomp>	input_idsilabels)datatensor_type)(_merge_kwargsr   r.   init_kwargsget
ValueErrorr0   r   rA   rB   listtupleallr   lenintr)   shapemaxloggerwarningappendminranger-   poptorchstacksplitrI   r8   	enumerateresubescaper%   cloner'   pad_token_idr   )r,   r9   r:   r;   r<   call_kwargsr   r   r   audio_inputswindow_sizemax_windowsper_sample_windowsflat_chunksaudio_el	n_samplesn_wintime_capistartendpadding_maskr1   r7   audio_lengthexpandedtext_inputsrO   rN   r   r   r   __call__`   sp   

 
$

"
 z AudioFlamingo3Processor.__call__c                 C   s(   | j j}| jj}tt|| dg S )NrH   )r.   model_input_namesr-   rU   dictfromkeys)r,   	tok_names	fea_namesr   r   r   r}      s   z)AudioFlamingo3Processor.model_input_namespromptc           	      K   sP  t |tr	|g}n't |ttfr |r tdd |D r t|}ntt|}t r0dd |D }t|}|dkr<td|du rG| j	g| }nJt |trR|g| }n?t |ttfrt||krltdt| d	| d
g }|D ]}|du r}|
| j	 qpt |tr|
| qptdntddd t||D }| j|fdddd|S )a  
        Prepare inputs for automatic speech recognition without manually writing the default transcription prompt.

        Args:
            audio (`str`, `list[str]`, `np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                Audio to transcribe. Strings are interpreted as local paths or URLs and will be loaded automatically by
                the chat template loader; NumPy arrays and PyTorch tensors are forwarded directly.
            prompt (`str` or `list[str]`, *optional*):
                Custom prompt(s) to include in the user turn. A list must be the same length as the batch. When `None`,
                each sample uses `"Transcribe the input speech."`.
            **kwargs:
                Additional keyword arguments forwarded to [`~AudioFlamingo3Processor.apply_chat_template`] (for example
                `text_kwargs`, `audio_kwargs`, ...).

        Returns:
            [`BatchFeature`]: Processor outputs ready to be passed to [`AudioFlamingo3ForConditionalGeneration.generate`].

        c                 s   r>   r?   r@   rC   elr   r   r   rE      rF   zFAudioFlamingo3Processor.apply_transcription_request.<locals>.<genexpr>c                 S   s,   g | ]}t |tjr|   n|qS r   )rA   rb   Tensordetachcpunumpyr   r   r   r   rK      s   , zGAudioFlamingo3Processor.apply_transcription_request.<locals>.<listcomp>r   z)`audio` must contain at least one sample.Nz	Received z prompt(s) for z$ audio sample(s); counts must match.z'Each prompt must be a string or `None`.z<`prompt` must be a string, a sequence of strings, or `None`.c                 S   s@   g | ]\}}d d|dt |trd|dnd|dgdgqS )userr9   )typer9   r:   )r   path)r   r:   )rolecontentr@   )rC   prompt_text
audio_itemr   r   r   rK     s    T)tokenizeadd_generation_promptreturn_dict)rA   rB   rU   rV   rW   r   r
   rX   rT   r(   r^   	TypeErrorzipapply_chat_template)	r,   r:   r   r<   audio_items
batch_sizepromptsitemconversationsr   r   r   apply_transcription_request   sP   
$


z3AudioFlamingo3Processor.apply_transcription_request)strip_prefixc                   s,    j j|i |}|r fdd|D }|S )ap  
        Forward arguments to [`~PreTrainedTokenizer.batch_decode`] and optionally remove the assistant framing the model
        was trained to produce.

        AF3 transcription requests respond with sentences such as `"The spoken content of the audio is "..."."`.
        Setting `strip_prefix=True` trims the fixed prefix for just the transcription text.
        c                    s   g | ]}  |qS r   )"_strip_assistant_prefix_and_quotes)rC   r9   r,   r   r   rK   &  s    z8AudioFlamingo3Processor.batch_decode.<locals>.<listcomp>)r.   batch_decode)r,   r   argsr<   decodedr   r   r   r     s   z$AudioFlamingo3Processor.batch_decodec                 C   s   |  }dD ]}||r|t|d   } nq|dr'|dd   }t|dkrC|d |d krC|d dv rC|dd   }|S )	zi
        Remove the assistant prefix and surrounding quotes from a decoded transcription string.
        )z"The spoken content of the audio isz!The transcription of the audio isN.rL   r5   r   >   "'r4   )strip
startswithrX   endswith)r,   r9   strippedprefixr   r   r   r   )  s   

(z:AudioFlamingo3Processor._strip_assistant_prefix_and_quotes)Nr!   r"   r#   )r1   r2   r3   r2   )NFr?   )r   r   r   __doc__r+   r8   r	   rU   r   boolr   r   r   r|   propertyrB   r}   r   r   r   __classcell__r   r   r/   r   r    4   sF    


e
Qr    )rf   r   npaudio_utilsr   r   feature_extraction_utilsr   processing_utilsr   r   r   tokenization_utils_baser	   utilsr
   r   rb   
get_loggerr   r\   r   r    __all__r   r   r   r   <module>   s   
  
