o
    eig                     @   s  d dl Z d dlmZ d dlZd dlmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( e)e*Z+		d/dej,dej-dej-dej-dej-dB de.dB de.fddZ/G dd  d ej,Z0G d!d" d"eZ1eG d#d$ d$eZ2ed%d&G d'd( d(e2Z3G d)d* d*ej,Z4ed+d&G d,d- d-e2eZ5g d.Z6dS )0    N)Callable)nn   )ACT2FN)CacheEncoderDecoderCache)GenerationMixin)create_bidirectional_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )	AutoModelAutoModelForCausalLM   )AudioFlamingo3ConfigAudioFlamingo3EncoderConfig        modulequerykeyvalueattention_maskscalingdropoutc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )N      r   r   )dimptrainingr   )
sizetorchmatmul	transposer   
functionalsoftmaxr$   r*   
contiguous)
r   r   r    r!   r"   r#   r$   kwargsattn_weightsattn_output r5   x/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/audioflamingo3/modeling_audioflamingo3.pyeager_attention_forward/   s   
r7   c                       s   e Zd ZdZ						ddededed	ed
edededB dedB f fddZ					dde	j
de	j
dB dedB de	j
dB dede	j
dB dee dee	j
e	j
dB ee	j
 dB f fddZ  ZS )AudioFlamingo3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN	embed_dim	num_headsr$   
is_decoderbias	is_causal	layer_idxconfigc	           	         s   t    || _|| _|| _|| | _|| _| j| | jkr*td| j d| d| jd | _|| _	|| _
|d u rG|rGtd| jj d || _tj||dd| _tj|||d| _tj|||d| _tj|||d| _d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r&   zInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr<   )super__init__r9   r:   r$   head_dimr?   
ValueErrorr#   r;   r=   loggerwarning_once	__class____name__r>   r   Lineark_projv_projq_projout_proj)	selfr9   r:   r$   r;   r<   r=   r>   r?   rG   r5   r6   rB   L   s0   


z AudioFlamingo3Attention.__init__hidden_stateskey_value_statespast_key_valuesr"   output_attentionscache_positionr2   returnc                 K   s  |du}|j dd \}	}
|	|
d| jf}| || j }|j| }|dd }|durHt|trH|j	
| j}|rEd|j	| j< |j}n|j}|durN|n|}|re|re|re|j| j j}|j| j j}nA| ||	d| j| j}| ||	d| j| j}|dd }|dd }|dur|s|nd}|||| jd|i\}}t| jjt}|| ||||f| jsdn| jd|d	|\}}||	|
d }| |}||fS )
z#Input shape: Batch x Time x ChannelNr%   r   r   TrT   r         ?)r$   r#   rS   )shaperC   rL   r#   viewr.   r1   
isinstancer   
is_updatedgetr>   cross_attention_cacheself_attention_cachelayerskeysvaluesrJ   r:   rK   updater   get_interfacer?   _attn_implementationr7   r*   r$   reshaperM   )rN   rP   rQ   rR   r"   rS   rT   r2   is_cross_attentionbsztgt_lenq_input_shapequery_statesrZ   current_states
key_statesvalue_statesattention_interfacer4   r3   r5   r5   r6   forwardt   sX   
	

zAudioFlamingo3Attention.forward)r   FTFNN)NNNFN)rH   
__module____qualname____doc__intfloatboolr   rB   r,   Tensorr   r   r
   tuplern   __classcell__r5   r5   rO   r6   r8   I   s^    	+
r8   c                	       sF   e Zd Zdef fddZ	ddejdejdedejfd	d
Z  Z	S )AudioFlamingo3EncoderLayerr?   c                    s   t    |j| _t| j|j|j|d| _t	| j| _
|j| _t|j | _|j| _t| j|j| _t|j| j| _t	| j| _d S )N)r9   r:   r$   r?   )rA   rB   d_modelr9   r8   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normr$   r   activation_functionactivation_fnactivation_dropoutrI   encoder_ffn_dimfc1fc2final_layer_normrN   r?   rO   r5   r6   rB      s   
z#AudioFlamingo3EncoderLayer.__init__FrP   r"   rS   rU   c                 C   s   |}|  |}| j|||d\}}tjj|| j| jd}|| }|}| |}| | |}tjj|| j	| jd}| 
|}tjj|| j| jd}|| }|jtjkrft|jjd }tj|| |d}||fS )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rP   r"   rS   r(   i  )minmax)r~   r|   r   r/   r$   r*   r   r   r   r   r   dtyper,   float16finfor   clamp)rN   rP   r"   rS   residualr3   clamp_valuer5   r5   r6   rn      s(   



z"AudioFlamingo3EncoderLayer.forward)F)
rH   ro   rp   r   rB   r,   ru   rt   rn   rw   r5   r5   rO   r6   rx      s    rx   c                   @   s4   e Zd ZU eed< dZdZdZdgZdZ	dZ
dZdS )AudioFlamingo3PreTrainedModelr?   model)audiotextTr8   rR   N)rH   ro   rp   r   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpar5   r5   r5   r6   r      s   
 r   zT
    The audio model from AudioFlamingo3 without any head or projection on top.
    custom_introc                
       s   e Zd ZU dZeed< dZdZdgZe	e
dZdef fddZd	d
 ZdejfddZdejfddZee	ddejdejdB deeB fddZdejfddZ  ZS )AudioFlamingo3EncoderzY
    AudioFlamingo3 encoder: Whisper encoder, average pool (time/2), then LayerNorm.
    r?   input_featuresr   rx   )rP   
attentionsc                    s   t     j| _ j| _ j} j| _ j| _ jr!t	
|nd| _tj| j|ddd| _tj||dddd| _t| j|| _| jd t fdd	t jD | _t j| _tjddd
| _d| _|   d S )NrV   r   r   )kernel_sizepaddingr   )r   strider   Fc                    s   g | ]}t  qS r5   )rx   ).0_r?   r5   r6   
<listcomp>1  s    z2AudioFlamingo3Encoder.__init__.<locals>.<listcomp>)r   )rA   rB   r$   encoder_layerdrop	layerdropry   num_mel_binsmax_source_positionsscale_embeddingmathsqrtembed_scaler   Conv1dconv1conv2	Embeddingembed_positionsrequires_grad_
ModuleListrangeencoder_layersr^   r}   
layer_norm	AvgPool1d
avg_poolergradient_checkpointing	post_init)rN   r?   r9   rO   r   r6   rB   !  s     zAudioFlamingo3Encoder.__init__c                 C   s   |   D ]}d|_qd| _d S )NF)
parametersrequires_grad_requires_grad)rN   paramr5   r5   r6   _freeze_parameters:  s   
z(AudioFlamingo3Encoder._freeze_parametersrU   c                 C   s   | j S Nr   rN   r5   r5   r6   get_input_embeddings?  s   z*AudioFlamingo3Encoder.get_input_embeddingsr!   c                 C   s
   || _ d S r   r   rN   r!   r5   r5   r6   set_input_embeddingsB     
z*AudioFlamingo3Encoder.set_input_embeddingsNinput_features_maskc                 K   s  |j d d d d }|d}|d d d }tj||jd|dddf k }tj| |}tj| 	|}|
ddd}|| jj }tjj|| j| jd}t| j||d}| jD ]}	| joitg | jk }
|
ss|	||d }q]|
ddd}| |
ddd}| |}t|d	S )
ap  
        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Log-Mel features extracted from raw audio. Use the processor/feature extractor to compute and pad
                these features from waveform input.
            input_features_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
        r%   r   r   deviceNr   r(   )r?   inputs_embedsr"   )last_hidden_state)rW   sumr,   aranger   r   r/   gelur   r   permuter   weightr$   r*   r	   r?   r^   randr   r   r   r   )rN   r   r   r2   seq_leninput_features_lengthsr   rP   r"   layerdropr5   r5   r6   rn   E  s2   
 

zAudioFlamingo3Encoder.forwardinput_lengthsc                 C   s(   |d d d }|d d d }||fS )zs
        Computes the output length of the convolutional layers and the output length of the audio encoder
        r   r   r5   )rN   r   output_lengthsr5   r5   r6    _get_feat_extract_output_lengths}  s   z6AudioFlamingo3Encoder._get_feat_extract_output_lengthsr   )rH   ro   rp   rq   r   r   main_input_namer   r   rx   r8   _can_record_outputsrB   r   r   Moduler   r   r   r   r,   ru   rv   r   rn   
LongTensorr   rw   r5   r5   rO   r6   r     s0   
 6r   c                       s.   e Zd ZdZdef fddZdd Z  ZS )!AudioFlamingo3MultiModalProjectorz
    Audio adaptor (small MLP) that projects AudioFlamingo3Encoder features
    to the LLM embedding space so they can replace `<sound>` tokens.
    r?   c                    sR   t    tj|jj|jj|jd| _t	|j
 | _tj|jj|jj|jd| _d S )Nr@   )rA   rB   r   rI   audio_confighidden_sizetext_configprojector_biaslinear_1r   projector_hidden_actactlinear_2r   rO   r5   r6   rB     s   
z*AudioFlamingo3MultiModalProjector.__init__c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )rN   audio_featuresrP   r5   r5   r6   rn     s   


z)AudioFlamingo3MultiModalProjector.forward)rH   ro   rp   rq   r   rB   rn   rw   r5   r5   rO   r6   r     s    
r   z
    The AudioFlamingo3 model which consists of a fine-tuned Whisper encoder, a multi-modal projector and a Qwen2 language model.
    c                       s8  e Zd ZdZdZdZ fddZdd Zdd Zdd	 Z	d
d Z
dd Zdd Zeedddejdejdee deeB fddZee											d&dejdB dejdB dejdB dejdB dejdB dedB dejdB dejdB dedB d ejdB d!eejB dee defd"d#Z fd$d%Z  ZS )'&AudioFlamingo3ForConditionalGenerationNc                    sH   t  | |jj| _t|j| _t|j| _	t
|| _|   d S r   )rA   rB   r   
vocab_sizer   from_configr   audio_towerr   language_modelr   multi_modal_projectorr   r   rO   r5   r6   rB     s   

z/AudioFlamingo3ForConditionalGeneration.__init__c                 C   
   | j  S r   )r   r   r   r5   r5   r6   r     r   z;AudioFlamingo3ForConditionalGeneration.get_input_embeddingsc                 C      | j | d S r   )r   r   r   r5   r5   r6   r        z;AudioFlamingo3ForConditionalGeneration.set_input_embeddingsc                 C   r   r   )r   get_output_embeddingsr   r5   r5   r6   r     r   z<AudioFlamingo3ForConditionalGeneration.get_output_embeddingsc                 C   r   r   )r   set_output_embeddings)rN   new_embeddingsr5   r5   r6   r     r   z<AudioFlamingo3ForConditionalGeneration.set_output_embeddingsc                 C   r   r   )r   set_decoder)rN   decoderr5   r5   r6   r     r   z2AudioFlamingo3ForConditionalGeneration.set_decoderc                 C   r   r   )r   get_decoderr   r5   r5   r6   r     r   z2AudioFlamingo3ForConditionalGeneration.get_decoderzThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r   r   r   r2   rU   c                 K   s   | j |f|dd|}| |j}|dd d d }tj|jd |jddddf |dddf k }|||j }||_	|S )a
  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padded feature indices.
        T)r   return_dictr%   r   r   r   N)
r   r   r   r   r,   r   rW   r   topooler_output)rN   r   r   r2   audio_outputaudio_embedspost_lengths
valid_maskr5   r5   r6   get_audio_features  s   2z9AudioFlamingo3ForConditionalGeneration.get_audio_featuresr   	input_idsr"   position_idsrR   r   labels	use_cacherT   logits_to_keepc                 K   s   |du r
|   |}|dur2|dur2| j||ddj}|| jjkd}|||j||j}| j	d||||||	|
|d|}|S )a+  
        input_features_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

        >>> model_id = "nvidia/audio-flamingo-3-hf"
        >>> processor = AutoProcessor.from_pretrained(model_id)
        >>> model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")

        >>> conversations = [
        >>>     [
        >>>         {
        >>>             "role": "user",
        >>>             "content": [
        >>>                 {"type": "text", "text": "Transcribe the input speech."},
        >>>                 {
        >>>                     "type": "audio",
        >>>                     "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
        >>>                 },
        >>>             ],
        >>>         }
        >>>     ],
        >>>     [
        >>>         {
        >>>             "role": "user",
        >>>             "content": [
        >>>                 {
        >>>                     "type": "text",
        >>>                     "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
        >>>                 },
        >>>                 {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
        >>>             ],
        >>>         }
        >>>     ],
        >>> ]

        >>> inputs = processor.apply_chat_template(
        >>>     conversations,
        >>>     tokenize=True,
        >>>     add_generation_prompt=True,
        >>>     return_dict=True,
        >>> ).to(model.device)

        >>> outputs = model.generate(**inputs, max_new_tokens=500)

        >>> decoded_outputs = processor.batch_decode(
        >>>     outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
        >>> )
        >>> print(decoded_outputs)
        ["The spoken content of the audio is...", "The track's calming and meditative feel can be attributed to..."]
        ```NT)r   r%   )r   r"   r  rR   r  r  rT   r  r5   )
r   r   r   r?   audio_token_id	unsqueezemasked_scatterr   r   r   )rN   r   r   r   r"   r  rR   r   r  r  rT   r  r2   r   audio_token_maskoutputsr5   r5   r6   rn     s*   P	z.AudioFlamingo3ForConditionalGeneration.forwardc                    sl   | dd }| dd }|d}t j|i |}|d ur4|d dkr4|d ur,||d< |d ur4||d< |S )Nr   r   rT   r   )popr[   rA   prepare_inputs_for_generation)rN   argsr2   r   r   rT   model_inputsrO   r5   r6   r  O  s   
zDAudioFlamingo3ForConditionalGeneration.prepare_inputs_for_generation)NNNNNNNNNNr   )rH   ro   rp   _keep_in_fp32_modules_strict_tp_plan_pp_planrB   r   r   r   r   r   r   r   r   r,   FloatTensorru   r   r   rv   r   r   r   r   rt   rr   r   rn   r  rw   r5   r5   rO   r6   r     s    
	
gr   )r   r   r   )Nr   )7r   collections.abcr   r,   r   activationsr   cache_utilsr   r   
generationr   masking_utilsr	   modeling_flash_attention_utilsr
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   autor   r   configuration_audioflamingo3r   r   
get_loggerrH   rE   r   ru   rs   r7   r8   rx   r   r   r   r   __all__r5   r5   r5   r6   <module>   sf   
	
|;u A