o
    ¾e¦i.  ã                   @   s<  d dl Z d dl mZ ddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddlmZ G dd„ deƒZ G dd„ deƒZ!G dd„ deƒZ"eddG dd„ deƒƒZ#G dd„ dej$ƒZ%eddG dd„ de"eƒƒZ&g d ¢Z'dS )!é    N)Únné   )ÚACT2FN)ÚCache)ÚGenerationMixin)ÚBaseModelOutputWithPastÚBaseModelOutputWithPoolingÚCausalLMOutputWithPast)ÚUnpack)ÚTransformersKwargsÚauto_docstringÚcan_return_tuple)Úmerge_with_config_defaults)Úcapture_outputsé   )Ú	AutoModelÚAutoModelForCausalLM)ÚQwen2AudioAttentionÚQwen2AudioEncoderÚQwen2AudioEncoderLayerÚQwen2AudioPreTrainedModelé   )ÚVoxtralConfigc                   @   ó   e Zd ZdS )ÚVoxtralAttentionN©Ú__name__Ú
__module__Ú__qualname__© r   r   úi/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/voxtral/modular_voxtral.pyr   )   ó    r   c                   @   r   )ÚVoxtralEncoderLayerNr   r   r   r   r    r"   -   r!   r"   c                   @   s    e Zd ZdZdZdZdZdZdS )ÚVoxtralPreTrainedModelTN)r   r   r   Ú_supports_flex_attnÚ_supports_cache_classÚ_supports_attention_backendÚ_can_compile_fullgraphÚ_no_split_modulesr   r   r   r    r#   1   s    r#   z:
    The Voxtral encoder, which is a Whisper encoder.
    ©Úcustom_introc                   @   s<   e Zd ZeedœZee	ddee	 de
eB fdd„ƒƒZdS )ÚVoxtralEncoder)Ú
attentionsÚhidden_statesNÚkwargsÚreturnc                 K   s  | j j| jjd  | jjd  }|jd |kr(td|› d|jd › d|› dƒ‚|j| jjj	| jjj
d}tj |  |¡¡}tj |  |¡¡}| ddd	¡}| jj}||  |j	¡}tjj|| j| jd
}t| jƒD ]\}}	|	||d}
|
d }qj|  |¡}t|dS )aÙ  
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`)`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram are ignored.
        r   éÿÿÿÿz7Voxtral expects the mel input features to be of length z, but found z-. Make sure to pad the input mel features to Ú.)ÚdtypeÚdevicer   r   )ÚpÚtraining)Úattention_mask)Úlast_hidden_state)ÚconfigÚmax_source_positionsÚconv1ÚstrideÚconv2ÚshapeÚ
ValueErrorÚtoÚweightr2   r3   r   Ú
functionalÚgeluÚpermuteÚembed_positionsÚdropoutr5   Ú	enumerateÚlayersÚ
layer_normr   )ÚselfÚinput_featuresr6   r.   Úexpected_seq_lengthÚinputs_embedsÚ	embed_posr-   ÚidxÚencoder_layerÚlayer_outputsr   r   r    ÚforwardE   s,    ÿþ

ÿzVoxtralEncoder.forward©N)r   r   r   r   r"   Ú_can_record_outputsr   r   r
   r   Útupler   rQ   r   r   r   r    r+   :   s    þýüûr+   c                       s*   e Zd Zdef‡ fdd„Zdd„ Z‡  ZS )ÚVoxtralMultiModalProjectorr8   c                    sN   t ƒ  ¡  tj|jj|jjdd| _t	|j
 | _tj|jj|jjdd| _d S )NF)Úbias)ÚsuperÚ__init__r   ÚLinearÚaudio_configÚintermediate_sizeÚtext_configÚhidden_sizeÚlinear_1r   Úprojector_hidden_actÚactÚlinear_2©rI   r8   ©Ú	__class__r   r    rX   w   s   
z#VoxtralMultiModalProjector.__init__c                 C   s"   |   |¡}|  |¡}|  |¡}|S rR   )r^   r`   ra   )rI   Úaudio_featuresr-   r   r   r    rQ   }   s   


z"VoxtralMultiModalProjector.forward)r   r   r   r   rX   rQ   Ú__classcell__r   r   rc   r    rU   v   s    rU   zs
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a LLama language model.
    c                       s   e Zd ZdgZ‡ fdd„Zdd„ Zdd„ Zdd	„ Zd
d„ Zdd„ Z	dd„ Z
eedddejdee deeB fdd„ƒƒZee										d&dejdB dejdB dejdB dejdB dedB dejdB dejdB dedB d ejdB d!eejB dee defd"d#„ƒƒZ‡ fd$d%„Z‡  ZS )'ÚVoxtralForConditionalGenerationrD   c                    sH   t ƒ  |¡ |jj| _t |j¡| _t |j¡| _	t
|ƒ| _|  ¡  d S rR   )rW   rX   r\   Ú
vocab_sizer   Úfrom_configrZ   Úaudio_towerr   Úlanguage_modelrU   Úmulti_modal_projectorÚ	post_initrb   rc   r   r    rX   Œ   s   

z(VoxtralForConditionalGeneration.__init__c                 C   ó
   | j  ¡ S rR   )rk   Úget_input_embeddings©rI   r   r   r    ro   –   ó   
z4VoxtralForConditionalGeneration.get_input_embeddingsc                 C   ó   | j  |¡ d S rR   )rk   Úset_input_embeddings)rI   Úvaluer   r   r    rs   ™   ó   z4VoxtralForConditionalGeneration.set_input_embeddingsc                 C   rn   rR   )rk   Úget_output_embeddingsrp   r   r   r    rv   œ   rq   z5VoxtralForConditionalGeneration.get_output_embeddingsc                 C   rr   rR   )rk   Úset_output_embeddings)rI   Únew_embeddingsr   r   r    rw   Ÿ   ru   z5VoxtralForConditionalGeneration.set_output_embeddingsc                 C   rr   rR   )rk   Úset_decoder)rI   Údecoderr   r   r    ry   ¢   ru   z+VoxtralForConditionalGeneration.set_decoderc                 C   rn   rR   )rk   Úget_decoderrp   r   r   r    r{   ¥   rq   z+VoxtralForConditionalGeneration.get_decoderzŸThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r)   rJ   r.   r/   c                 K   sB   | j |fddi|¤Ž}|j}| d| jjj¡}|  |¡}||_|S )aa  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        Úreturn_dictTr0   )rj   r7   Úreshaper8   rZ   r[   rl   Úpooler_output)rI   rJ   r.   Úaudio_outputsÚaudio_hidden_statesÚaudio_embedsr   r   r    Úget_audio_features¨   s   
z2VoxtralForConditionalGeneration.get_audio_featuresNr   Ú	input_idsr6   Úposition_idsÚpast_key_valuesrL   ÚlabelsÚ	use_cacheÚcache_positionÚlogits_to_keepc                 K   sˆ   |du r
|   ¡ |ƒ}|dur1|dur1| j|ddj}|| jjk d¡}| | |j¡| |j¡¡}| j	d|||||||	|
dœ|¤Ž}|S )aj  
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```NT)r|   r0   )r6   r„   r…   rL   r†   r‡   rˆ   r‰   r   )
ro   r‚   r~   r8   Úaudio_token_idÚ	unsqueezeÚmasked_scatterr?   r3   rk   )rI   rƒ   rJ   r6   r„   r…   rL   r†   r‡   rˆ   r‰   r.   r   Úaudio_token_maskÚoutputsr   r   r    rQ   ¿   s*   1ÿø	÷z'VoxtralForConditionalGeneration.forwardc                    sF   |  dd ¡}| dd¡}tƒ j|i |¤Ž}|s| dd¡s!||d< |S )NrJ   Úis_first_iterationFr‡   T)ÚpopÚgetrW   Úprepare_inputs_for_generation)rI   Úargsr.   rJ   r   Úmodel_inputsrc   r   r    r’   	  s   z=VoxtralForConditionalGeneration.prepare_inputs_for_generation)
NNNNNNNNNr   )r   r   r   Ú_keep_in_fp32_modules_strictrX   ro   rs   rv   rw   ry   r{   r   r   ÚtorchÚFloatTensorr
   r   rT   r   r‚   Ú
LongTensorÚTensorr   ÚboolÚintr	   rQ   r’   rf   r   r   rc   r    rg   „   st    
ÿÿÿþõþýüûúùø	÷
öõôóHrg   )r#   r+   rg   )(r–   r   Úactivationsr   Úcache_utilsr   Ú
generationr   Úmodeling_outputsr   r   r	   Úprocessing_utilsr
   Úutilsr   r   r   Úutils.genericr   Úutils.output_capturingr   Úautor   r   Ú qwen2_audio.modeling_qwen2_audior   r   r   r   Úconfiguration_voxtralr   r   r"   r#   r+   ÚModulerU   rg   Ú__all__r   r   r   r    Ú<module>   s6   	ÿ7ÿ 