o
    	۷i/                     @   sH  d dl Z d dlmZmZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddl m!Z! G dd deZ"G dd deZ#G dd deZ$eddG dd deZ%G dd dej&Z'eddG dd de$eZ(g d Z)dS )!    N)OptionalUnion)nn   )ACT2FN)Cache)GenerationMixin)BaseModelOutputBaseModelOutputWithPastCausalLMOutputWithPast)Unpack)TransformersKwargsauto_docstringcan_return_tuple)check_model_inputs   )	AutoModelAutoModelForCausalLM)Qwen2AudioAttentionQwen2AudioEncoderQwen2AudioEncoderLayerQwen2AudioPreTrainedModel   )VoxtralConfigc                   @      e Zd ZdS )VoxtralAttentionN__name__
__module____qualname__ r    r    a/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/voxtral/modular_voxtral.pyr   '       r   c                   @   r   )VoxtralEncoderLayerNr   r    r    r    r!   r#   +   r"   r#   c                   @   s$   e Zd ZdZdZdZdZdZdZdS )VoxtralPreTrainedModelTN)r   r   r   _supports_flex_attn_supports_cache_class_supports_attention_backend_can_compile_fullgraph_no_split_modulesr    r    r    r!   r$   /   s    r$   z:
    The Voxtral encoder, which is a Whisper encoder.
    )custom_introc                   @   s2   e Zd ZeedZe 	ddee fddZ	dS )VoxtralEncoder)
attentionshidden_statesNkwargsc                 K   s  | j j| jjd  | jjd  }|jd |kr(td| d|jd  d| d|j| jjj	| jjj
d}tj| |}tj| |}|ddd	}| jj}|| |j	}tjj|| j| jd
}t| jD ]\}}	|	||dd}
|
d }qj| |}t|dS )a  
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`)`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram are ignored.
        r   z:Qwen2Audio expects the mel input features to be of length z, but found z-. Make sure to pad the input mel features to .)dtypedevicer   r   )ptrainingN)attention_masklayer_head_mask)last_hidden_state)configmax_source_positionsconv1strideconv2shape
ValueErrortoweightr1   r2   r   
functionalgelupermuteembed_positionsdropoutr4   	enumeratelayers
layer_normr	   )selfinput_featuresr5   r.   expected_seq_lengthinputs_embeds	embed_posr-   idxencoder_layerlayer_outputsr    r    r!   forwardD   s.    

zVoxtralEncoder.forwardN)
r   r   r   r   r#   _can_record_outputsr   r   r   rQ   r    r    r    r!   r+   9   s    r+   c                       s*   e Zd Zdef fddZdd Z  ZS )VoxtralMultiModalProjectorr8   c                    sN   t    tj|jj|jjdd| _t	|j
 | _tj|jj|jjdd| _d S )NF)bias)super__init__r   Linearaudio_configintermediate_sizetext_confighidden_sizelinear_1r   projector_hidden_actactlinear_2rI   r8   	__class__r    r!   rW   v   s   
z#VoxtralMultiModalProjector.__init__c                 C   s"   |  |}| |}| |}|S rR   )r]   r_   r`   )rI   audio_featuresr-   r    r    r!   rQ   |   s   


z"VoxtralMultiModalProjector.forward)r   r   r   r   rW   rQ   __classcell__r    r    rb   r!   rT   u   s    rT   zs
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a LLama language model.
    c                       s4  e Zd ZdgZddiZddgdgfiZdgZ fddZd	d
 Zdd Z	dd Z
dd Zdd Zdd ZdejfddZdejfddZee										d+deej deej deej deej dee d eej d!eej d"ee d#eej d$eeejf d%ee d&efd'd(Z fd)d*Z  Z S ),VoxtralForConditionalGenerationzlm_head.weightlm_headcolwise_repr-   logitsrD   c                    sH   t  | |jj| _t|j| _t|j| _	t
|| _|   d S rR   )rV   rW   r[   
vocab_sizer   from_configrY   audio_towerr   language_modelrT   multi_modal_projector	post_initra   rb   r    r!   rW      s   

z(VoxtralForConditionalGeneration.__init__c                 C   
   | j  S rR   )rm   get_input_embeddingsrI   r    r    r!   rq         
z4VoxtralForConditionalGeneration.get_input_embeddingsc                 C      | j | d S rR   )rm   set_input_embeddings)rI   valuer    r    r!   ru         z4VoxtralForConditionalGeneration.set_input_embeddingsc                 C   rp   rR   )rm   get_output_embeddingsrr   r    r    r!   rx      rs   z5VoxtralForConditionalGeneration.get_output_embeddingsc                 C   rt   rR   )rm   set_output_embeddings)rI   new_embeddingsr    r    r!   ry      rw   z5VoxtralForConditionalGeneration.set_output_embeddingsc                 C   rt   rR   )rm   set_decoder)rI   decoderr    r    r!   r{      rw   z+VoxtralForConditionalGeneration.set_decoderc                 C   rp   rR   )rm   get_decoderrr   r    r    r!   r}      rs   z+VoxtralForConditionalGeneration.get_decoderrJ   c                 C   s0   |  |}|j}|d| jjj}| |}|S )a  
        This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.
        Args:
            input_features (`torch.FloatTensor`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]

        Returns:
            `torch.FloatTensor`:
                The audio embeddings.
        r/   )rl   r7   reshaper8   rY   rZ   rn   )rI   rJ   audio_outputsaudio_hidden_statesaudio_embedsr    r    r!   get_audio_features   s
   

z2VoxtralForConditionalGeneration.get_audio_featuresc                 C   s   t dt | |S )NzUThe method `get_audio_embeds` is deprecated. Please use `get_audio_features` instead.)warningswarnFutureWarningr   )rI   rJ   r    r    r!   get_audio_embeds   s   
z0VoxtralForConditionalGeneration.get_audio_embedsNr   	input_idsr5   position_idspast_key_valuesrL   labels	use_cachecache_positionlogits_to_keepr.   returnc                 K   s   |du r
|   |}|dur.|dur.| |}|| jjkd}|||j||j}| jd|||||||	|
d|}|S )aj  
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```Nr/   )r5   r   r   rL   r   r   r   r   r    )	rq   r   r8   audio_token_id	unsqueezemasked_scatterr?   r2   rm   )rI   r   rJ   r5   r   r   rL   r   r   r   r   r.   r   audio_token_maskoutputsr    r    r!   rQ      s*   1
	z'VoxtralForConditionalGeneration.forwardc                    sH   | dd }|d}t j|i |}|d ur"|d dkr"||d< |S )NrJ   r   r   )popgetrV   prepare_inputs_for_generation)rI   argsr.   rJ   r   model_inputsrb   r    r!   r     s   
z=VoxtralForConditionalGeneration.prepare_inputs_for_generation)
NNNNNNNNNr   )!r   r   r   _tied_weights_keys_tp_plan_pp_plan_keep_in_fp32_modules_strictrW   rq   ru   rx   ry   r{   r}   torchFloatTensorr   r   r   r   r   
LongTensorTensorr   boolr   intr   r   r   rQ   r   re   r    r    rb   r!   rf      sh    
	
Hrf   )r$   r+   rf   )*r   typingr   r   r   r   activationsr   cache_utilsr   
generationr   modeling_outputsr	   r
   r   processing_utilsr   utilsr   r   r   utils.genericr   autor   r    qwen2_audio.modeling_qwen2_audior   r   r   r   configuration_voxtralr   r   r#   r$   r+   ModulerT   rf   __all__r    r    r    r!   <module>   s8   
7 