o
    ei(U                     @   s  d dl Z d dlmZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlm Z m!Z! ddl"m#Z#m$Z$ e%e&Z'		d-dej(dej)dej)dej)dej)dB de*dB de*fddZ+G dd dej(Z,G dd  d eZ-eG d!d" d"eZ.ed#d$G d%d& d&e.Z/G d'd( d(ej(Z0ed)d$G d*d+ d+e.e
Z1g d,Z2dS ).    N)Callable)nn   )ACT2FN)Cache)GenerationMixin)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)capture_outputs   )	AutoModelAutoModelForCausalLM   )VoxtralConfigVoxtralEncoderConfig        modulequerykeyvalueattention_maskscalingdropoutc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )N      r   r   )dimptrainingr   )
sizetorchmatmul	transposer   
functionalsoftmaxr"   r(   
contiguous)
r   r   r   r   r    r!   r"   kwargsattn_weightsattn_output r3   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/voxtral/modeling_voxtral.pyeager_attention_forward-   s   
r5   c                       s   e Zd ZdZ						ddededed	ed
edededB dedB f fddZde	j
dedefddZ		dde	j
de	j
dB dedee	j
e	j
dB ee	j
 dB f fddZ  ZS )VoxtralAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN	embed_dim	num_headsr"   
is_decoderbias	is_causal	layer_idxconfigc	           	         s   t    || _|| _|| _|| | _|| _| j| | jkr*td| j d| d| jd | _|| _	|| _
|d u rG|rGtd| jj d || _tj||dd| _tj|||d| _tj|||d| _tj|||d| _d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r$   zInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr:   )super__init__r7   r8   r"   head_dimr=   
ValueErrorr!   r9   r;   loggerwarning_once	__class____name__r<   r   Lineark_projv_projq_projout_proj)	selfr7   r8   r"   r9   r:   r;   r<   r=   rE   r3   r4   r@   J   s0   


zVoxtralAttention.__init__tensorseq_lenbszc                 C   s    | ||| j| jdd S )Nr   r   )viewr8   rA   r,   r/   )rL   rN   rO   rP   r3   r3   r4   _shaper   s    zVoxtralAttention._shapehidden_statesr    output_attentionsreturnc                 K   s   |  \}}}| | || j ||}| | |d|}	| | |d|}
t| jj	t
}|| ||	|
|f| js<dn| jd|d|\}}|||d }| |}||fS )z#Input shape: Batch x Time x Channelr#   r         ?)r"   r!   rT   )r)   rR   rJ   r!   rH   rI   r   get_interfacer=   _attn_implementationr5   r(   r"   reshaper/   rK   )rL   rS   r    rT   r0   rP   tgt_len_query_states
key_statesvalue_statesattention_interfacer2   r1   r3   r3   r4   forwardu   s.   		

zVoxtralAttention.forward)r   FTFNNNF)rF   
__module____qualname____doc__intfloatboolr   r@   r*   TensorrR   tupler`   __classcell__r3   r3   rM   r4   r6   G   sJ    	(r6   c                	       sF   e Zd Zdef fddZ	ddejdejdedejfd	d
Z  Z	S )VoxtralEncoderLayerr=   c                    s   t    |j| _t| j|j|j|d| _t	| j| _
|j| _t|j | _|j| _t| j|j| _t|j| j| _t	| j| _d S )N)r7   r8   r"   r=   )r?   r@   d_modelr7   r6   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normr"   r   activation_functionactivation_fnactivation_dropoutrG   encoder_ffn_dimfc1fc2final_layer_normrL   r=   rM   r3   r4   r@      s   
zVoxtralEncoderLayer.__init__FrS   r    rT   rU   c                 C   s   |}|  |}| j|||d\}}tjj|| j| jd}|| }|}| |}| | |}tjj|| j	| jd}| 
|}tjj|| j| jd}|| }|jtjkrft|jjd }tj|| |d}||fS )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rS   r    rT   r&   i  )minmax)rq   ro   r   r-   r"   r(   rx   rs   rv   rt   rw   dtyper*   float16finfor{   clamp)rL   rS   r    rT   residualr1   clamp_valuer3   r3   r4   r`      s(   



zVoxtralEncoderLayer.forward)F)
rF   rb   rc   r   r@   r*   rh   rg   r`   rj   r3   r3   rM   r4   rk      s    rk   c                   @   sB   e Zd ZU eed< dZdZdZdZdZ	dZ
dZdZdZdZdZdS )VoxtralPreTrainedModelr=   model)audiotextTNpast_key_values)rF   rb   rc   r   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_cache_class_supports_attention_backend_can_compile_fullgraphr3   r3   r3   r4   r      s   
 r   z:
    The Voxtral encoder, which is a Whisper encoder.
    custom_introc                       s   e Zd ZU dZeed< dZdZdgZe	e
dZdef fddZd	d
 ZdejfddZdejfddZee	ddee deeB fddZdejfddZ  ZS )VoxtralEncoderz
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`VoxtralEncoderLayer`].

    Args:
        config: VoxtralEncoderConfig
    r=   input_featuresr   rk   )
attentionsrS   c                    s   t     j| _ j| _ j} j| _ j| _ jr!t	
|nd| _tj| j|ddd| _tj||dddd| _t| j|| _| jd t fdd	t jD | _t j| _tjddd
| _d| _|   d S )NrV   r   r   )kernel_sizepaddingr   )r   strider   Fc                    s   g | ]}t  qS r3   )rk   ).0r[   r=   r3   r4   
<listcomp>  s    z+VoxtralEncoder.__init__.<locals>.<listcomp>)r   )r?   r@   r"   encoder_layerdrop	layerdroprl   num_mel_binsmax_source_positionsscale_embeddingmathsqrtembed_scaler   Conv1dconv1conv2	Embeddingembed_positionsrequires_grad_
ModuleListrangeencoder_layerslayersrp   
layer_norm	AvgPool1d
avg_poolergradient_checkpointing	post_init)rL   r=   r7   rM   r   r4   r@     s     zVoxtralEncoder.__init__c                 C   s   |   D ]}d|_qd| _d S ra   )
parametersrequires_grad_requires_grad)rL   paramr3   r3   r4   _freeze_parameters  s   
z!VoxtralEncoder._freeze_parametersrU   c                 C   s   | j S Nr   rL   r3   r3   r4   get_input_embeddings   s   z#VoxtralEncoder.get_input_embeddingsr   c                 C   s
   || _ d S r   r   rL   r   r3   r3   r4   set_input_embeddings#     
z#VoxtralEncoder.set_input_embeddingsNr0   c                 K   s  | j j| jjd  | jjd  }|jd |kr(td| d|jd  d| d|j| jjj	| jjj
d}tj| |}tj| |}|ddd	}| jj}|| |j	}tjj|| j| jd
}t| jD ]\}}	|	||d}
|
d }qj| |}t|dS )a  
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`)`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram are ignored.
        r   r#   z7Voxtral expects the mel input features to be of length z, but found z-. Make sure to pad the input mel features to .)r|   devicer   r   r&   )r    )last_hidden_state)r=   r   r   r   r   shaperB   toweightr|   r   r   r-   gelupermuter   r"   r(   	enumerater   r   r
   )rL   r   r    r0   expected_seq_lengthinputs_embeds	embed_posrS   idxencoder_layerlayer_outputsr3   r3   r4   r`   &  s,    

zVoxtralEncoder.forwardinput_lengthsc                 C   s(   |d d d }|d d d }||fS )zs
        Computes the output length of the convolutional layers and the output length of the audio encoder
        r   r   r3   )rL   r   output_lengthsr3   r3   r4    _get_feat_extract_output_lengthsW  s   z/VoxtralEncoder._get_feat_extract_output_lengthsr   )rF   rb   rc   rd   r   r   main_input_namer   r   r6   rk   _can_record_outputsr@   r   r   Moduler   r   r   r   r   r   ri   r
   r`   r*   
LongTensorr   rj   r3   r3   rM   r4   r      s,   
 	/r   c                       s*   e Zd Zdef fddZdd Z  ZS )VoxtralMultiModalProjectorr=   c                    sN   t    tj|jj|jjdd| _t	|j
 | _tj|jj|jjdd| _d S )NFr>   )r?   r@   r   rG   audio_configintermediate_sizetext_confighidden_sizelinear_1r   projector_hidden_actactlinear_2ry   rM   r3   r4   r@   a  s   
z#VoxtralMultiModalProjector.__init__c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )rL   audio_featuresrS   r3   r3   r4   r`   g  s   


z"VoxtralMultiModalProjector.forward)rF   rb   rc   r   r@   r`   rj   r3   r3   rM   r4   r   `  s    r   zs
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a LLama language model.
    c                       s   e Zd ZdgZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
eedddejdee deeB fddZee										d&dejdB dejdB dejdB dejdB dedB dejdB dejdB dedB d ejdB d!eejB dee defd"d#Z fd$d%Z  ZS )'VoxtralForConditionalGenerationr   c                    sH   t  | |jj| _t|j| _t|j| _	t
|| _|   d S r   )r?   r@   r   
vocab_sizer   from_configr   audio_towerr   language_modelr   multi_modal_projectorr   ry   rM   r3   r4   r@   v  s   

z(VoxtralForConditionalGeneration.__init__c                 C   
   | j  S r   )r   r   r   r3   r3   r4   r     r   z4VoxtralForConditionalGeneration.get_input_embeddingsc                 C      | j | d S r   )r   r   r   r3   r3   r4   r        z4VoxtralForConditionalGeneration.set_input_embeddingsc                 C   r   r   )r   get_output_embeddingsr   r3   r3   r4   r     r   z5VoxtralForConditionalGeneration.get_output_embeddingsc                 C   r   r   )r   set_output_embeddings)rL   new_embeddingsr3   r3   r4   r     r   z5VoxtralForConditionalGeneration.set_output_embeddingsc                 C   r   r   )r   set_decoder)rL   decoderr3   r3   r4   r     r   z+VoxtralForConditionalGeneration.set_decoderc                 C   r   r   )r   get_decoderr   r3   r3   r4   r     r   z+VoxtralForConditionalGeneration.get_decoderzThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r   r   r0   rU   c                 K   sB   | j |fddi|}|j}|d| jjj}| |}||_|S )aa  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        return_dictTr#   )r   r   rY   r=   r   r   r   pooler_output)rL   r   r0   audio_outputsaudio_hidden_statesaudio_embedsr3   r3   r4   get_audio_features  s   
z2VoxtralForConditionalGeneration.get_audio_featuresNr   	input_idsr    position_idsr   r   labels	use_cachecache_positionlogits_to_keepc                 K   s   |du r
|   |}|dur1|dur1| j|ddj}|| jjkd}|||j||j}| j	d|||||||	|
d|}|S )aj  
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```NT)r   r#   )r    r   r   r   r   r   r   r   r3   )
r   r   r   r=   audio_token_id	unsqueezemasked_scatterr   r   r   )rL   r   r   r    r   r   r   r   r   r   r   r0   r   audio_token_maskoutputsr3   r3   r4   r`     s*   1	z'VoxtralForConditionalGeneration.forwardc                    sF   | dd }|dd}t j|i |}|s|dds!||d< |S )Nr   is_first_iterationFr   T)popgetr?   prepare_inputs_for_generation)rL   argsr0   r   r  model_inputsrM   r3   r4   r    s   z=VoxtralForConditionalGeneration.prepare_inputs_for_generation)
NNNNNNNNNr   )rF   rb   rc   _keep_in_fp32_modules_strictr@   r   r   r   r   r   r   r   r   r*   FloatTensorr   r   ri   r
   r   r   rh   r   rg   re   r   r`   r  rj   r3   r3   rM   r4   r   n  st    
	
Hr   )r   r   r   )Nr   )3r   collections.abcr   r*   r   activationsr   cache_utilsr   
generationr   modeling_layersr   modeling_outputsr	   r
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   autor   r   configuration_voxtralr   r   
get_loggerrF   rC   r   rh   rf   r5   r6   rk   r   r   r   r   __all__r3   r3   r3   r4   <module>   sb   
	
X;q 