o
    *iE                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	Z
d dlmZ d dlmZmZmZ e eZdZded	ed
dfddZG dd deeZeG dd dZeG dd dZeG dd dZeG dd dZG dd dZdS )    N)	dataclass)Enum)Any)Audio)
AudioChunkAudioURLChunkAudioURLType
   num_samplesmult_ofreturnc                 C   s$   | | dksJ d| d|d S )Nr   znum_samples=z must be a multiple of mult_of= )r
   r   r   r   c/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/mistral_common/tokens/tokenizers/audio.py_check_mult_of   s   $r   c                   @   s   e Zd ZdZdZdZdS )TranscriptionFormatzTranscription format.

    Should be set by the tokenizer for correct encoding.

    Attributes:
    - INSTRUCT: The instruct format.
    - STREAMING: The streaming format.
    instruct	streamingN)__name__
__module____qualname____doc__INSTRUCT	STREAMINGr   r   r   r   r      s    	r   c                   @   s4   e Zd ZU dZeed< eed< eed< d	ddZdS )
AudioSpectrogramConfiga^  Configuration for generating an audio spectrogram.

    Attributes:
        num_mel_bins: Number of mel bins, typically 80 or 128.
        hop_length: Length of the overlapping windows for
            the STFT used to obtain the Mel Frequency coefficients, typically 160.
        window_size: Window size of the Fourier transform, typically 400.
    num_mel_bins
hop_lengthwindow_sizer   Nc                 C   s@   | j dks
J | j | jdksJ | j| jdksJ | jd S )Nr   )r   r   r   selfr   r   r   __post_init__;   s   z$AudioSpectrogramConfig.__post_init__r   N)r   r   r   r   int__annotations__r   r   r   r   r   r   )   s   
 
r   c                   @   sd  e Zd ZU dZeed< eed< eed< dZedB ed< e	j
Ze	ed< dZedB ed< dZedB ed	< dZedB ed
< dZedB ed< d&ddZedefddZdedefddZedefddZd'dedB defddZdedefddZedefddZedefddZedefddZedefd d!Zd'dedB defd"d#Zedefd$d%ZdS )(AudioConfiga`  Configuration for audio processing.

    Attributes:
        sampling_rate: Sampling rate of the audio.
        frame_rate: Number of frames per second accepted by the tokenizer model.
        encoding_config: Configuration for audio spectrogram.
        chunk_length_s: Whether to pad an audio into multiples of chunk_length_s seconds (optional).
    sampling_rate
frame_rateencoding_configNchunk_length_stranscription_formattranscription_delay_msstreaming_look_ahead_msstreaming_look_back_msstreaming_n_left_pad_tokensr   c                 C   s  | j dks
J | j | jdksJ | j| jd ur4| jdks#J | j| jdks4J d| j d| j | jso| jd u sEJ d| jd| jd u sSJ d| jd| jd u saJ d| jd| jd u soJ d| jd| jr| jd usJ d| jd	| jd usJ d| jd	| jd usJ d| jd	| jd usJ d| jd	d
| j  }| jdksJ d| j| dksJ d| jd|| jd u sJ d| jdd S d S )Nr   z7chunk_length_s and sampling_rate must both be > 0, got z and zself.transcription_delay_ms=z must be None.zself.streaming_look_ahead_ms=zself.streaming_look_back_ms=!self.streaming_n_left_pad_tokens= must be set.     @@z*{self.transcription_delay_ms=} must be > 0z) must be a multiple of frame_duration_ms=zself.chunk_length_s=z cannot be set in streaming.)	r%   r$   r'   chunk_framesis_streamingr)   r*   r+   r,   )r   frame_duration_msr   r   r   r   _   s2   

zAudioConfig.__post_init__c                 C   s   | j tjkS N)r(   r   r   r   r   r   r   r1   ~   s   zAudioConfig.is_streaming	audio_lenc                 C   sD   || j j dkrt|| j j d }n|| j j }t|| j S Nr      )r&   r   mathceilaudio_length_per_tok)r   r4   r   r   r   num_audio_tokens   s   zAudioConfig.num_audio_tokensc                 C   s   t dt |  S )Nz4Use get_num_delay_tokens instead of num_delay_tokens)warningswarnDeprecationWarningget_num_delay_tokensr   r   r   r   num_delay_tokens   s   zAudioConfig.num_delay_tokensc                 C   sN   | j sJ d| j d|d u r| j}|d usJ d|d| | |S )Nz5Can't call get_num_delay_tokens if self.is_streaming=.z:Can't call get_num_delay_tokens if transcription_delay_ms=)r1   r)   r:   	delay_lenr   r)   r   r   r   r>      s
   z AudioConfig.get_num_delay_tokensc                 C   s   t |d | j S Nr/   )r!   r$   rB   r   r   r   rA      s   zAudioConfig.delay_lenc                 C   s
   d| j  S rC   )r%   r   r   r   r   r2      s   
zAudioConfig.frame_duration_msc                 C   s,   | j dusJ d| j dt| j | j S )z)Calculate the number of frames per chunk.Nz/Can't call chunk_frames if self.chunk_length_s=r@   )r'   r!   r$   r   r   r   r   r0      s   zAudioConfig.chunk_framesc                 C   s   t | j| j S r3   )r!   r$   r%   r   r   r   r   raw_audio_length_per_tok   s   z$AudioConfig.raw_audio_length_per_tokc                 C   s   t | j}|| jj }t|S )z(Calculate the length of audio per token.)floatrD   r&   r   r!   )r   downsample_factorr   r   r   r9      s   
z AudioConfig.audio_length_per_tokc                 C   s*   | j sJ d| j d| |d t S )Nz3Can't call n_right_pad_tokens if self.is_streaming=r@   r6   )r1   r>   OFFLINE_STREAMING_BUFFER_TOKENSrB   r   r   r   n_right_pad_tokens   s   zAudioConfig.n_right_pad_tokensc                 C   s:   | j sJ d| j d| jd usJ d| jd| jS )Nz2Can't call n_left_pad_tokens if self.is_streaming=r@   r-   r.   )r1   r,   r   r   r   r   n_left_pad_tokens   s   zAudioConfig.n_left_pad_tokensr    r3   )r   r   r   r   r!   r"   rE   r   r'   r   r   r(   r)   r*   r+   r,   r   propertyboolr1   r:   r?   r>   rA   r2   r0   rD   r9   rH   rI   r   r   r   r   r#   A   s<   
 	
	r#   c                   @   s&   e Zd ZU dZee ed< eed< dS )AudioEncodingzEncapsulates the tokens and audio data for an audio chunk.

    Attributes:
        tokens: Text tokens corresponding to this audio chunk.
        audio: Original audio waveform data.
    tokensaudioN)r   r   r   r   listr!   r"   r   r   r   r   r   rL      s   
 rL   c                   @   s6   e Zd ZU dZedB ed< edB ed< edB ed< dS )SpecialAudioIDsa  Special text tokens corresponding to audio token sequence.

    Attributes:
        audio: Token representing audio.
        begin_audio: Token representing the beginning of audio.
        streaming_pad: Token representing streaming pad of audio. Only relevant for steaming models.
    NrN   begin_audiostreaming_pad)r   r   r   r   r!   r"   r   r   r   r   rP      s
   
 rP   c                   @   sh  e Zd ZdZdededdfddZ	d+dejd	e	d
e
dB dedejf
ddZd+d
e
dB deeef fddZd+de	d
e
dB dee	e	f fddZde	d	e	de	fddZd+d
e
dB dee	 fddZde	dee	 fddZd+ded
e
dB defddZdedefdd Zdedefd!d"ZdeeB defd#d$Zede	fd%d&Zede	fd'd(Zede	fd)d*ZdS ),AudioEncodera	  Encodes audio chunks into a format suitable for further processing.

    Attributes:
        audio_config: Configuration for audio processing.
        encoding_config: Configuration for audio spectrogram.
        special_ids: Special tokens for audio encoding.
    audio_configspecial_idsr   Nc                 C   s   || _ |j| _|| _d S r3   )rT   r&   rU   )r   rT   rU   r   r   r   __init__   s   
zAudioEncoder.__init__audio_arrayr$   r)   kwargsc                 K   s   | j jr| |jd |}t|d||jd  f}|S | j jr5| |jd |\}}t|||f}|S t| j	t
rS|jd | j	jk rSt|d| j	j|jd  f}|S )a9  Pad the audio array to the desired length.

        Args:
            audio_array: Audio data as a numpy array.
            sampling_rate: Sampling rate of the audio.
            transcription_delay_ms (optional): Delay in milliseconds for transcription.

        Returns:
            Padded audio array.
        r   )rT   r'   next_multiple_of_chunk_framesshapenppadr1   _get_streaming_pad
isinstancer&   r   r   )r   rW   r$   r)   rX   rZ   left_pad	right_padr   r   r   r]      s   
zAudioEncoder.padc                 C   sT   |  d|\}}ttj|tjd| jjdd}ttj|tjd| jjdd}||fS )zGets left and right padding for realtime audio models.

        Args:
            transcription_delay_ms (optional): Delay in milliseconds for transcription.

        Returns:
            Tuple of left and right padding for realtime audio models.
        r   )dtypewav)rW   r$   format)r^   r   r\   zerosfloat32rT   r$   )r   r)   r`   ra   left_pad_audioright_pad_audior   r   r   get_padding_audio  s   
zAudioEncoder.get_padding_audior
   c           	      C   sl   | j j}t|||  | }| j |}t|| }t|| ||7 }| j j}t|| }t|| ||fS r3   )rT   rD   r!   rH   r   rI   )	r   r
   r)   r   ra   _extra_right_pad_tokens_extra_right_pad_samples_extra_left_pad_tokensr`   r   r   r   r^   ,  s   

zAudioEncoder._get_streaming_padaudio_array_lenc                 C   s^   || j jksJ d|d| j j| j jdus"J d| j jdt|| j j | j j S )zCalculate the next multiple of chunk frames.

        Args:
            audio_array_len: Length of the audio array.
            sampling_rate: Sampling rate of the audio.

        Returns:
            The next multiple of chunk frames.
        zExpected sampling_rate=z' to be self.audio_config.sampling_rate=NzMCan't call next_multiple_of_chunk_frames if self.audio_config.chunk_length_s=r@   )rT   r$   r'   r7   r8   r0   )r   rm   r$   r   r   r   rZ   A  s   
z*AudioEncoder.next_multiple_of_chunk_framesc                 C   sT   t | jjtsJ d| jj| jjdusJ | jj| j| }| jg| }|S )z8Encode the streaming tokens given a transcription delay.zQAudio encoder must be spectrogram encoder, got self.audio_config.encoding_config=N)r_   rT   r&   r   r)   rI   r>   rR   )r   r)   stream_pad_prefix_lenrM   r   r   r   encode_streaming_tokensT  s   z$AudioEncoder.encode_streaming_tokenssignal_lengthc                 C   s^   || j j dkrt|| j j d }n|| j j }t|| jj }| jg| jg|  }|S r5   )r&   r   r7   r8   rT   r9   begin_audio_tokenaudio_token)r   rp   r:   rM   r   r   r   _encode_audio_tokensc  s   z!AudioEncoder._encode_audio_tokensrN   c                 C   s\   | | jj | |j| jj||_| jjtjkr| |}n	| 	|jj
d }t||dS )z4Encode an audio optionally with transcription delay.r   )rM   rN   )resamplerT   r$   r]   rW   r(   r   r   ro   rs   r[   rL   )r   rN   r)   rM   r   r   r   encode_audioo  s   zAudioEncoder.encode_audiocontentc                 C   s   t |j}| |S r3   )r   from_raw_audioinput_audioru   )r   rv   rN   r   r   r   _encode_audio_chunk~  s   
z AudioEncoder._encode_audio_chunkc                 C   sT   |  }|tjtjhv rt|j}n|tjkrt|j}nt|j}| 	|S r3   )
get_url_typer   filefile_urir   	from_fileurlfrom_urlfrom_base64ru   )r   rv   url_typerN   r   r   r   _encode_audio_url_chunk  s   

z$AudioEncoder._encode_audio_url_chunkc                 C   s:   t |tr
| |S t |tr| |S tdt| )zCall the encoder on an audio chunk or URL chunk.

        Args:
            content: Audio or URL chunk to encode.

        Returns:
            Encoded audio data and tokens.
        zUnsupported content type: )r_   r   r   r   ry   
ValueErrortype)r   rv   r   r   r   __call__  s
   
	


zAudioEncoder.__call__c                 C   (   | j jdusJ d| j jd| j jS )zGet the audio token.Nzself.special_ids.audio=r.   )rU   rN   r   r   r   r   rr         zAudioEncoder.audio_tokenc                 C   r   )zGet the begin audio token.Nzself.special_ids.begin_audio=r.   )rU   rQ   r   r   r   r   rq     r   zAudioEncoder.begin_audio_tokenc                 C   r   )zGet the streaming pad token.Nzself.special_ids.streaming_pad=r.   )rU   rR   r   r   r   r   rR     r   zAudioEncoder.streaming_padr3   ) r   r   r   r   r#   rP   rV   r\   ndarrayr!   rE   r   r]   tupler   ri   r^   rZ   rO   ro   rs   rL   ru   r   ry   r   r   r   rJ   rr   rq   rR   r   r   r   r   rS      s<    	
 $$rS   )loggingr7   r;   dataclassesr   enumr   typingr   numpyr\   mistral_common.audior   &mistral_common.protocol.instruct.chunkr   r   r   	getLoggerr   loggerrG   r!   r   strr   r   r#   rL   rP   rS   r   r   r   r   <module>   s.    
 