o
    ei3                     @   sJ  d dl Z d dl mZ ddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlm Z  e!e"Z#G dd deZ$G dd deZ%G dd deZ&eddG dd deZ'G dd deZ(eddG dd  d eZ)g d!Z*dS )"    N)nn   )ACT2FN)Cache)create_bidirectional_mask)BaseModelOutputWithPoolingCausalLMOutputWithPast)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )Qwen2AudioEncoderQwen2AudioPreTrainedModel)VoxtralForConditionalGenerationVoxtralMultiModalProjector)WhisperAttentionWhisperEncoderLayer   )AudioFlamingo3Configc                   @      e Zd ZdS )AudioFlamingo3AttentionN__name__
__module____qualname__ r   r   w/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/audioflamingo3/modular_audioflamingo3.pyr   '       r   c                   @   r   )AudioFlamingo3EncoderLayerNr   r   r   r   r    r"   +   r!   r"   c                   @   r   )AudioFlamingo3PreTrainedModelNr   r   r   r   r    r#   /   r!   r#   zT
    The audio model from AudioFlamingo3 without any head or projection on top.
    custom_introc                
   @   sH   e Zd ZdZeedZee	d	de	j
de	j
dB deeB fddZdS )
AudioFlamingo3EncoderzY
    AudioFlamingo3 encoder: Whisper encoder, average pool (time/2), then LayerNorm.
    )hidden_states
attentionsNinput_featuresinput_features_maskreturnc                 K   s  |j d d d d }|d}|d d d }tj||jd|dddf k }tj| |}tj| 	|}|
ddd}|| jj }tjj|| j| jd}t| j||d}| jD ]}	| joitg | jk }
|
ss|	||d }q]|
ddd}| |
ddd}| |}t|d	S )
ap  
        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Log-Mel features extracted from raw audio. Use the processor/feature extractor to compute and pad
                these features from waveform input.
            input_features_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
        r   r   deviceNr   )ptraining)configinputs_embedsattention_mask)last_hidden_state)shapesumtorcharanger.   r   
functionalgeluconv1conv2permuteembed_positionsweightdropoutr0   r   r1   layersrand	layerdrop
avg_pooler
layer_normr   )selfr)   r*   kwargsseq_leninput_features_lengthsr2   r'   r3   layerdropr   r   r    forwardB   s2   
 

zAudioFlamingo3Encoder.forwardN)r   r   r   __doc__r"   r   _can_record_outputsr   r   r7   Tensortupler   rL   r   r   r   r    r&   3   s    r&   c                       s&   e Zd ZdZdef fddZ  ZS )!AudioFlamingo3MultiModalProjectorz
    Audio adaptor (small MLP) that projects AudioFlamingo3Encoder features
    to the LLM embedding space so they can replace `<sound>` tokens.
    r1   c                    sR   t    tj|jj|jj|jd| _t	|j
 | _tj|jj|jj|jd| _d S )N)bias)super__init__r   Linearaudio_confighidden_sizetext_configprojector_biaslinear_1r   projector_hidden_actactlinear_2rF   r1   	__class__r   r    rU      s   
z*AudioFlamingo3MultiModalProjector.__init__)r   r   r   rN   r   rU   __classcell__r   r   r`   r    rR   z   s    rR   z
    The AudioFlamingo3 model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Qwen2 language model.
    c                       s  e Zd ZdZdZdZ fddZeeddde	j
de	jdee d	eeB fd
dZee											dde	jdB de	j
dB de	jdB de	jdB de	jdB dedB de	j
dB de	jdB dedB de	jdB dee	jB dee d	efddZ fddZ  ZS )&AudioFlamingo3ForConditionalGenerationNc                    s   t  | d S rM   )rT   rU   r_   r`   r   r    rU      s   z/AudioFlamingo3ForConditionalGeneration.__init__zThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r$   r)   r*   rG   r+   c                 K   s   | j |f|dd|}| |j}|dd d d }tj|jd |jddddf |dddf k }|||j }||_	|S )a
  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padded feature indices.
        T)r*   return_dictr,   r   r   r-   N)
audio_towermulti_modal_projectorr4   r6   r7   r8   r5   r.   topooler_output)rF   r)   r*   rG   audio_outputaudio_embedspost_lengths
valid_maskr   r   r    get_audio_features   s   2z9AudioFlamingo3ForConditionalGeneration.get_audio_featuresr   	input_idsr3   position_idspast_key_valuesr2   labels	use_cachecache_positionlogits_to_keepc                 K   s   |du r
|   |}|dur2|dur2| j||ddj}|| jjkd}|||j||j}| j	d||||||	|
|d|}|S )a+  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

        >>> model_id = "nvidia/audio-flamingo-3-hf"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

        >>> conversations = [
        >>>     [
        >>>         {
        >>>             "role": "user",
        >>>             "content": [
        >>>                 {"type": "text", "text": "Transcribe the input speech."},
        >>>                 {
        >>>                     "type": "audio",
        >>>                     "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
        >>>                 },
        >>>             ],
        >>>         }
        >>>     ],
        >>>     [
        >>>         {
        >>>             "role": "user",
        >>>             "content": [
        >>>                 {
        >>>                     "type": "text",
        >>>                     "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
        >>>                 },
        >>>                 {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
        >>>             ],
        >>>         }
        >>>     ],
        >>> ]

        >>> inputs = processor.apply_chat_template(
        >>>     conversations,
        >>>     tokenize=True,
        >>>     add_generation_prompt=True,
        >>>     return_dict=True,
        >>> ).to(model.device)

        >>> outputs = model.generate(**inputs, max_new_tokens=500)

        >>> decoded_outputs = processor.batch_decode(
        >>>     outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
        >>> )
        >>> print(decoded_outputs)
        ["The spoken content of the audio is...", "The track's calming and meditative feel can be attributed to..."]
        ```NT)rd   r,   )r2   r3   ro   rp   rq   rr   rs   rt   r   )
get_input_embeddingsrm   rh   r1   audio_token_id	unsqueezemasked_scatterrg   r.   language_model)rF   rn   r)   r*   r3   ro   rp   r2   rq   rr   rs   rt   rG   rj   audio_token_maskoutputsr   r   r    rL      s*   P	z.AudioFlamingo3ForConditionalGeneration.forwardc                    sl   | dd }| dd }|d}t j|i |}|d ur4|d dkr4|d ur,||d< |d ur4||d< |S )Nr)   r*   rs   r   )popgetrT   prepare_inputs_for_generation)rF   argsrG   r)   r*   rs   model_inputsr`   r   r    r~   $  s   
zDAudioFlamingo3ForConditionalGeneration.prepare_inputs_for_generation)NNNNNNNNNNr   )r   r   r   _tp_plan_pp_plan_keep_in_fp32_modules_strictrU   r   r   r7   FloatTensorrP   r	   r
   rQ   r   rm   
LongTensorr   boolintr   rL   r~   rb   r   r   r`   r    rc      sv    	
grc   )rc   r#   r&   )+r7   r   activationsr   cache_utilsr   masking_utilsr   modeling_outputsr   r   processing_utilsr	   utilsr
   r   r   r   utils.genericr   utils.output_capturingr    qwen2_audio.modeling_qwen2_audior   r   voxtral.modeling_voxtralr   r   whisper.modeling_whisperr   r   configuration_audioflamingo3r   
get_loggerr   loggerr   r"   r#   r&   rR   rc   __all__r   r   r   r    <module>   s:   
B (