o
    c۷i>T                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	Z
d dlmZ d dlmZmZmZ e eZdZded	ed
dfddZG dd deeZeG dd dZeG dd dZeG dd dZeG dd dZG dd dZdS )    N)	dataclass)Enum)Any)Audio)
AudioChunkAudioURLChunkAudioURLType
   num_samplesmult_ofreturnc                 C   s$   | | dksJ d| d|d S )Nr   znum_samples=z must be a multiple of mult_of= )r
   r   r   r   \/home/ubuntu/vllm_env/lib/python3.10/site-packages/mistral_common/tokens/tokenizers/audio.py_check_mult_of   s   $r   c                   @   s   e Zd ZdZdZdZdS )TranscriptionFormatzTranscription format.

    Should be set by the tokenizer for correct encoding.

    Attributes:
    - INSTRUCT: The instruct format.
    - STREAMING: The streaming format.
    instruct	streamingN)__name__
__module____qualname____doc__INSTRUCT	STREAMINGr   r   r   r   r      s    	r   c                   @   s4   e Zd ZU dZeed< eed< eed< d	ddZdS )
AudioSpectrogramConfiga^  Configuration for generating an audio spectrogram.

    Attributes:
        num_mel_bins: Number of mel bins, typically 80 or 128.
        hop_length: Length of the overlapping windows for
            the STFT used to obtain the Mel Frequency coefficients, typically 160.
        window_size: Window size of the Fourier transform, typically 400.
    num_mel_bins
hop_lengthwindow_sizer   Nc                 C   s@   | j dks
J | j | jdksJ | j| jdksJ | jd S )Nr   )r   r   r   selfr   r   r   __post_init__;   s   z$AudioSpectrogramConfig.__post_init__r   N)r   r   r   r   int__annotations__r   r   r   r   r   r   )   s   
 
r   c                   @   s|  e Zd ZU dZeed< eed< eed< dZedB ed< e	j
Ze	ed< dZedB ed< dZedB ed	< dZedB ed
< dZedB ed< dZeeef dB ed< d'ddZedefddZdedefddZedefddZd(dedB defddZdedefddZedefddZedefddZedefdd Zedefd!d"Zd(dedB defd#d$Zedefd%d&Z dS ))AudioConfiga  Configuration for audio processing.

    Attributes:
        sampling_rate: Sampling rate of the audio.
        frame_rate: Number of frames per second accepted by the tokenizer model.
        encoding_config: Configuration for audio spectrogram.
        chunk_length_s: Whether to pad an audio into multiples of chunk_length_s seconds (optional).
        voice_num_audio_tokens: Mapping from speaker voice name to number of audio tokens
            for that speaker's reference audio (optional, only for TTS).
    sampling_rate
frame_rateencoding_configNchunk_length_stranscription_formattranscription_delay_msstreaming_look_ahead_msstreaming_look_back_msstreaming_n_left_pad_tokensvoice_num_audio_tokensr   c                 C   s  | j dks
J | j | jdksJ | j| jd ur4| jdks#J | j| jdks4J d| j d| j | jso| jd u sEJ d| jd| jd u sSJ d| jd| jd u saJ d| jd| jd u soJ d| jd| jr| jd usJ d| jd	| jd usJ d| jd	| jd usJ d| jd	| jd usJ d| jd	d
| j  }| jdksJ d| j| dksJ d| jd|| jd u sJ d| jdd S d S )Nr   z7chunk_length_s and sampling_rate must both be > 0, got z and zself.transcription_delay_ms=z must be None.zself.streaming_look_ahead_ms=zself.streaming_look_back_ms=!self.streaming_n_left_pad_tokens= must be set.     @@z*{self.transcription_delay_ms=} must be > 0z) must be a multiple of frame_duration_ms=zself.chunk_length_s=z cannot be set in streaming.)	r%   r$   r'   chunk_framesis_streamingr)   r*   r+   r,   )r   frame_duration_msr   r   r   r   c   s2   

zAudioConfig.__post_init__c                 C   s   | j tjkS N)r(   r   r   r   r   r   r   r2      s   zAudioConfig.is_streaming	audio_lenc                 C   sD   || j j dkrt|| j j d }n|| j j }t|| j S Nr      )r&   r   mathceilaudio_length_per_tok)r   r5   r   r   r   num_audio_tokens   s   zAudioConfig.num_audio_tokensc                 C   s   t dt |  S )Nz4Use get_num_delay_tokens instead of num_delay_tokens)warningswarnDeprecationWarningget_num_delay_tokensr   r   r   r   num_delay_tokens   s   zAudioConfig.num_delay_tokensc                 C   sN   | j sJ d| j d|d u r| j}|d usJ d|d| | |S )Nz5Can't call get_num_delay_tokens if self.is_streaming=.z:Can't call get_num_delay_tokens if transcription_delay_ms=)r2   r)   r;   	delay_lenr   r)   r   r   r   r?      s
   z AudioConfig.get_num_delay_tokensc                 C   s   t |d | j S Nr0   )r!   r$   rC   r   r   r   rB      s   zAudioConfig.delay_lenc                 C   s
   d| j  S rD   )r%   r   r   r   r   r3      s   
zAudioConfig.frame_duration_msc                 C   s,   | j dusJ d| j dt| j | j S )z)Calculate the number of frames per chunk.Nz/Can't call chunk_frames if self.chunk_length_s=rA   )r'   r!   r$   r   r   r   r   r1      s   zAudioConfig.chunk_framesc                 C   s   t | j| j S r4   )r!   r$   r%   r   r   r   r   raw_audio_length_per_tok   s   z$AudioConfig.raw_audio_length_per_tokc                 C   s   t | j}|| jj }t|S )z(Calculate the length of audio per token.)floatrE   r&   r   r!   )r   downsample_factorr   r   r   r:      s   
z AudioConfig.audio_length_per_tokc                 C   s*   | j sJ d| j d| |d t S )Nz3Can't call n_right_pad_tokens if self.is_streaming=rA   r7   )r2   r?   OFFLINE_STREAMING_BUFFER_TOKENSrC   r   r   r   n_right_pad_tokens   s   zAudioConfig.n_right_pad_tokensc                 C   s:   | j sJ d| j d| jd usJ d| jd| jS )Nz2Can't call n_left_pad_tokens if self.is_streaming=rA   r.   r/   )r2   r,   r   r   r   r   n_left_pad_tokens   s   zAudioConfig.n_left_pad_tokensr    r4   )!r   r   r   r   r!   r"   rF   r   r'   r   r   r(   r)   r*   r+   r,   r-   dictstrr   propertyboolr2   r;   r@   r?   rB   r3   r1   rE   r:   rI   rJ   r   r   r   r   r#   A   s>   
 
	r#   c                   @   s*   e Zd ZU dZee ed< edB ed< dS )AudioEncodinga  Encapsulates the tokens and audio data for an audio chunk.

    Attributes:
        tokens: Text tokens corresponding to this audio chunk.
        audio: Original audio waveform data, or None when using a preset voice
            (no reference audio to forward to the model).
    tokensNaudio)r   r   r   r   listr!   r"   r   r   r   r   r   rO      s   
 rO   c                   @   sN   e Zd ZU dZedB ed< edB ed< edB ed< edB ed< edB ed< dS )SpecialAudioIDsa  Special text tokens corresponding to audio token sequence.

    Attributes:
        audio: Token representing audio.
        begin_audio: Token representing the beginning of audio.
        streaming_pad: Token representing streaming pad of audio. Only relevant for steaming models.
        text_to_audio: Token representing intent to convert text to audio.
        audio_to_text: Token representing intent to convert audio to text.
    NrQ   begin_audiostreaming_padtext_to_audioaudio_to_text)r   r   r   r   r!   r"   r   r   r   r   rS      s   
 
rS   c                   @   s  e Zd ZdZdededdfddZ	d8dejd	e	d
e
dB dedejf
ddZd8d
e
dB deeef fddZd8de	d
e
dB dee	e	f fddZde	d	e	de	fddZd8d
e
dB dee	 fddZde	dee	 fddZd8ded
e
dB defddZde	dee	 fdd Zd!e	de	fd"d#ZdedB d$edB defd%d&Zd'edefd(d)Zd'edefd*d+Zd'eeB defd,d-Ze de	fd.d/Z!e de	fd0d1Z"e de	fd2d3Z#e de	fd4d5Z$e de	fd6d7Z%dS )9AudioEncodera	  Encodes audio chunks into a format suitable for further processing.

    Attributes:
        audio_config: Configuration for audio processing.
        encoding_config: Configuration for audio spectrogram.
        special_ids: Special tokens for audio encoding.
    audio_configspecial_idsr   Nc                 C   s   || _ |j| _|| _d S r4   )rY   r&   rZ   )r   rY   rZ   r   r   r   __init__   s   
zAudioEncoder.__init__audio_arrayr$   r)   kwargsc                 K   s   | j jr| |jd |}t|d||jd  f}|S | j jr5| |jd |\}}t|||f}|S t| j	t
rS|jd | j	jk rSt|d| j	j|jd  f}|S )a9  Pad the audio array to the desired length.

        Args:
            audio_array: Audio data as a numpy array.
            sampling_rate: Sampling rate of the audio.
            transcription_delay_ms (optional): Delay in milliseconds for transcription.

        Returns:
            Padded audio array.
        r   )rY   r'   next_multiple_of_chunk_framesshapenppadr2   _get_streaming_pad
isinstancer&   r   r   )r   r\   r$   r)   r]   r_   left_pad	right_padr   r   r   rb      s   
zAudioEncoder.padc                 C   sT   |  d|\}}ttj|tjd| jjdd}ttj|tjd| jjdd}||fS )zGets left and right padding for realtime audio models.

        Args:
            transcription_delay_ms (optional): Delay in milliseconds for transcription.

        Returns:
            Tuple of left and right padding for realtime audio models.
        r   )dtypewav)r\   r$   format)rc   r   ra   zerosfloat32rY   r$   )r   r)   re   rf   left_pad_audioright_pad_audior   r   r   get_padding_audio  s   
zAudioEncoder.get_padding_audior
   c           	      C   sl   | j j}t|||  | }| j |}t|| }t|| ||7 }| j j}t|| }t|| ||fS r4   )rY   rE   r!   rI   r   rJ   )	r   r
   r)   r   rf   _extra_right_pad_tokens_extra_right_pad_samples_extra_left_pad_tokensre   r   r   r   rc   3  s   

zAudioEncoder._get_streaming_padaudio_array_lenc                 C   s^   || j jksJ d|d| j j| j jdus"J d| j jdt|| j j | j j S )zCalculate the next multiple of chunk frames.

        Args:
            audio_array_len: Length of the audio array.
            sampling_rate: Sampling rate of the audio.

        Returns:
            The next multiple of chunk frames.
        zExpected sampling_rate=z' to be self.audio_config.sampling_rate=NzMCan't call next_multiple_of_chunk_frames if self.audio_config.chunk_length_s=rA   )rY   r$   r'   r8   r9   r1   )r   rr   r$   r   r   r   r_   H  s   
z*AudioEncoder.next_multiple_of_chunk_framesc                 C   sT   t | jjtsJ d| jj| jjdusJ | jj| j| }| jg| }|S )z8Encode the streaming tokens given a transcription delay.zQAudio encoder must be spectrogram encoder, got self.audio_config.encoding_config=N)rd   rY   r&   r   r)   rJ   r?   rU   )r   r)   stream_pad_prefix_lenrP   r   r   r   encode_streaming_tokens[  s   z$AudioEncoder.encode_streaming_tokenssignal_lengthc                 C   s^   || j j dkrt|| j j d }n|| j j }t|| jj }| jg| jg|  }|S r6   )r&   r   r8   r9   rY   r:   begin_audio_tokenaudio_token)r   ru   r;   rP   r   r   r   _encode_audio_tokensj  s   z!AudioEncoder._encode_audio_tokensrQ   c                 C   s\   | | jj | |j| jj||_| jjtjkr| |}n	| 	|jj
d }t||dS )z4Encode an audio optionally with transcription delay.r   rP   rQ   )resamplerY   r$   rb   r\   r(   r   r   rt   rx   r`   rO   )r   rQ   r)   rP   r   r   r   encode_audiov  s   zAudioEncoder.encode_audior;   c                 C   s&   g }| | j || jg|  |S )zBuild the token sequence for a speech request's audio segment.

        Args:
            num_audio_tokens: Number of audio placeholder tokens to emit.

        Returns:
            List of token IDs: [BEGIN_AUDIO, AUDIO * num_audio_tokens].
        )appendrv   extendrw   )r   r;   rP   r   r   r   '_encode_audio_tokens_for_speech_request  s   	z4AudioEncoder._encode_audio_tokens_for_speech_requestaudio_lengthc                 C   s   t || jj | jj d S )zCompute the number of audio tokens needed for a given audio length.

        Args:
            audio_length: Number of audio samples.

        Returns:
            Number of audio tokens (includes +1 for END_OUTPUT_AUDIO).
        r7   )r8   r9   rY   r$   r%   )r   r   r   r   r   '_get_num_audio_token_for_speech_request  s   
z4AudioEncoder._get_num_audio_token_for_speech_requestvoicec                 C   s   |dus|dusJ d|d||dur&| | jj | t|j}n(| jjdus0J d|dur:|| jjv sHJ d|dt| jj | jj| }| |}t	||dS )a  Encode audio or voice preset into an AudioEncoding for speech synthesis.

        Either ``audio`` (reference audio for voice cloning) or ``voice`` (preset name)
        must be provided. When ``audio`` is given it takes precedence.

        Args:
            audio: Reference audio waveform, or None to use a voice preset.
            voice: Preset voice name (e.g. 'Neutral Male', 'Neutral Female'), or None when using ref audio.

        Returns:
            AudioEncoding containing the token sequence and optional audio data.
        NzAEither audio or voice must be defined to encode audio, got audio=z and voice=zUvoice_num_audio_tokens must be set in audio config to use voice-based speech requestszUnknown voice z, expected one of ry   )
rz   rY   r$   r   lenr\   r-   rR   r~   rO   )r   rQ   r   r;   rP   r   r   r   encode_audio_for_speech_request  s$   
z,AudioEncoder.encode_audio_for_speech_requestcontentc                 C   s   t |j}| |S r4   )r   from_raw_audioinput_audior{   )r   r   rQ   r   r   r   _encode_audio_chunk  s   
z AudioEncoder._encode_audio_chunkc                 C   sT   |  }|tjtjhv rt|j}n|tjkrt|j}nt|j}| 	|S r4   )
get_url_typer   filefile_urir   	from_fileurlfrom_urlfrom_base64r{   )r   r   url_typerQ   r   r   r   _encode_audio_url_chunk  s   

z$AudioEncoder._encode_audio_url_chunkc                 C   s:   t |tr
| |S t |tr| |S tdt| )zCall the encoder on an audio chunk or URL chunk.

        Args:
            content: Audio or URL chunk to encode.

        Returns:
            Encoded audio data and tokens.
        zUnsupported content type: )rd   r   r   r   r   
ValueErrortype)r   r   r   r   r   __call__  s
   
	


zAudioEncoder.__call__c                 C   (   | j jdusJ d| j jd| j jS )zGet the audio token.Nzself.special_ids.audio=r/   )rZ   rQ   r   r   r   r   rw         zAudioEncoder.audio_tokenc                 C   r   )zGet the begin audio token.Nzself.special_ids.begin_audio=r/   )rZ   rT   r   r   r   r   rv     r   zAudioEncoder.begin_audio_tokenc                 C   r   )zGet the streaming pad token.Nzself.special_ids.streaming_pad=r/   )rZ   rU   r   r   r   r   rU     r   zAudioEncoder.streaming_padc                 C   r   )zGet the text_to_audio token.Nzself.special_ids.text_to_audio=r/   )rZ   rV   r   r   r   r   text_to_audio_token  r   z AudioEncoder.text_to_audio_tokenc                 C   r   )zGet the audio_to_text token.Nzself.special_ids.audio_to_text=r/   )rZ   rW   r   r   r   r   audio_to_text_token  r   z AudioEncoder.audio_to_text_tokenr4   )&r   r   r   r   r#   rS   r[   ra   ndarrayr!   rF   r   rb   tupler   rn   rc   r_   rR   rt   rx   rO   r{   r~   r   rL   r   r   r   r   r   r   rM   rw   rv   rU   r   r   r   r   r   r   rX      sJ    	
 $$#rX   )loggingr8   r<   dataclassesr   enumr   typingr   numpyra   mistral_common.audior   &mistral_common.protocol.instruct.chunkr   r   r   	getLoggerr   loggerrH   r!   r   rL   r   r   r#   rO   rS   rX   r   r   r   r   <module>   s.    
 