o
    ei.                     @   s<  d dl Z d dl mZ ddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddlmZ G dd deZ G dd deZ!G dd deZ"eddG dd deZ#G dd dej$Z%eddG dd de"eZ&g d Z'dS )!    N)nn   )ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )	AutoModelAutoModelForCausalLM)Qwen2AudioAttentionQwen2AudioEncoderQwen2AudioEncoderLayerQwen2AudioPreTrainedModel   )VoxtralConfigc                   @      e Zd ZdS )VoxtralAttentionN__name__
__module____qualname__ r   r   i/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/voxtral/modular_voxtral.pyr   )       r   c                   @   r   )VoxtralEncoderLayerNr   r   r   r   r    r"   -   r!   r"   c                   @   s    e Zd ZdZdZdZdZdZdS )VoxtralPreTrainedModelTN)r   r   r   _supports_flex_attn_supports_cache_class_supports_attention_backend_can_compile_fullgraph_no_split_modulesr   r   r   r    r#   1   s    r#   z:
    The Voxtral encoder, which is a Whisper encoder.
    custom_introc                   @   s<   e Zd ZeedZee	ddee	 de
eB fddZdS )VoxtralEncoder)
attentionshidden_statesNkwargsreturnc                 K   s  | j j| jjd  | jjd  }|jd |kr(td| d|jd  d| d|j| jjj	| jjj
d}tj| |}tj| |}|ddd	}| jj}|| |j	}tjj|| j| jd
}t| jD ]\}}	|	||d}
|
d }qj| |}t|dS )a  
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`)`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram are ignored.
        r   z7Voxtral expects the mel input features to be of length z, but found z-. Make sure to pad the input mel features to .)dtypedevicer   r   )ptraining)attention_mask)last_hidden_state)configmax_source_positionsconv1strideconv2shape
ValueErrortoweightr2   r3   r   
functionalgelupermuteembed_positionsdropoutr5   	enumeratelayers
layer_normr   )selfinput_featuresr6   r.   expected_seq_lengthinputs_embeds	embed_posr-   idxencoder_layerlayer_outputsr   r   r    forwardE   s,    

zVoxtralEncoder.forwardN)r   r   r   r   r"   _can_record_outputsr   r   r
   r   tupler   rQ   r   r   r   r    r+   :   s    r+   c                       s*   e Zd Zdef fddZdd Z  ZS )VoxtralMultiModalProjectorr8   c                    sN   t    tj|jj|jjdd| _t	|j
 | _tj|jj|jjdd| _d S )NF)bias)super__init__r   Linearaudio_configintermediate_sizetext_confighidden_sizelinear_1r   projector_hidden_actactlinear_2rI   r8   	__class__r   r    rX   w   s   
z#VoxtralMultiModalProjector.__init__c                 C   s"   |  |}| |}| |}|S rR   )r^   r`   ra   )rI   audio_featuresr-   r   r   r    rQ   }   s   


z"VoxtralMultiModalProjector.forward)r   r   r   r   rX   rQ   __classcell__r   r   rc   r    rU   v   s    rU   zs
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a LLama language model.
    c                       s   e Zd ZdgZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
eedddejdee deeB fddZee										d&dejdB dejdB dejdB dejdB dedB dejdB dejdB dedB d ejdB d!eejB dee defd"d#Z fd$d%Z  ZS )'VoxtralForConditionalGenerationrD   c                    sH   t  | |jj| _t|j| _t|j| _	t
|| _|   d S rR   )rW   rX   r\   
vocab_sizer   from_configrZ   audio_towerr   language_modelrU   multi_modal_projector	post_initrb   rc   r   r    rX      s   

z(VoxtralForConditionalGeneration.__init__c                 C   
   | j  S rR   )rk   get_input_embeddingsrI   r   r   r    ro         
z4VoxtralForConditionalGeneration.get_input_embeddingsc                 C      | j | d S rR   )rk   set_input_embeddings)rI   valuer   r   r    rs         z4VoxtralForConditionalGeneration.set_input_embeddingsc                 C   rn   rR   )rk   get_output_embeddingsrp   r   r   r    rv      rq   z5VoxtralForConditionalGeneration.get_output_embeddingsc                 C   rr   rR   )rk   set_output_embeddings)rI   new_embeddingsr   r   r    rw      ru   z5VoxtralForConditionalGeneration.set_output_embeddingsc                 C   rr   rR   )rk   set_decoder)rI   decoderr   r   r    ry      ru   z+VoxtralForConditionalGeneration.set_decoderc                 C   rn   rR   )rk   get_decoderrp   r   r   r    r{      rq   z+VoxtralForConditionalGeneration.get_decoderzThis method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.r)   rJ   r.   r/   c                 K   sB   | j |fddi|}|j}|d| jjj}| |}||_|S )aa  
        input_features (`torch.FloatTensor`):
            Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
        return_dictTr0   )rj   r7   reshaper8   rZ   r[   rl   pooler_output)rI   rJ   r.   audio_outputsaudio_hidden_statesaudio_embedsr   r   r    get_audio_features   s   
z2VoxtralForConditionalGeneration.get_audio_featuresNr   	input_idsr6   position_idspast_key_valuesrL   labels	use_cachecache_positionlogits_to_keepc                 K   s   |du r
|   |}|dur1|dur1| j|ddj}|| jjkd}|||j||j}| j	d|||||||	|
d|}|S )aj  
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```NT)r|   r0   )r6   r   r   rL   r   r   r   r   r   )
ro   r   r~   r8   audio_token_id	unsqueezemasked_scatterr?   r3   rk   )rI   r   rJ   r6   r   r   rL   r   r   r   r   r.   r   audio_token_maskoutputsr   r   r    rQ      s*   1	z'VoxtralForConditionalGeneration.forwardc                    sF   | dd }|dd}t j|i |}|s|dds!||d< |S )NrJ   is_first_iterationFr   T)popgetrW   prepare_inputs_for_generation)rI   argsr.   rJ   r   model_inputsrc   r   r    r   	  s   z=VoxtralForConditionalGeneration.prepare_inputs_for_generation)
NNNNNNNNNr   )r   r   r   _keep_in_fp32_modules_strictrX   ro   rs   rv   rw   ry   r{   r   r   torchFloatTensorr
   r   rT   r   r   
LongTensorTensorr   boolintr	   rQ   r   rf   r   r   rc   r    rg      st    
	
Hrg   )r#   r+   rg   )(r   r   activationsr   cache_utilsr   
generationr   modeling_outputsr   r   r	   processing_utilsr
   utilsr   r   r   utils.genericr   utils.output_capturingr   autor   r    qwen2_audio.modeling_qwen2_audior   r   r   r   configuration_voxtralr   r   r"   r#   r+   ModulerU   rg   __all__r   r   r   r    <module>   s6   	7 