o
    ~i                  	   @   sv  d Z ddlZddlmZ dgZG dd deZedkrddlZej	 r&dndZ
d	Zd
ZdZdD ]XZeeeed e
ZedkrKeede neeee
Ze ( eeZeej e Zeej edv ryeeZeej W d   n1 sw   Y  q0ed\ZZeeed Ze  eeZW d   n1 sw   Y  edee dS dS )z(Mimi (see https://kyutai.org/Moshi.pdf).    N)CodecMimic                       sL   e Zd Z			d fdd	Ze dd Zdd	 Zd
d Zdd Z	  Z
S )r   reconstruct   Tc                    s   zddl m} W n ty   tdw t |d| || _d| _|| _|d| _	|dkr9d | j	_
d | j	_d S |dkrGd | j	_d | j	_d S d S )	Nr   )	MimiModelz5`pip install transformers>=4.45.1` to use this modulei]  i   zkyutai/mimiencodedecode)transformersr   ImportErrorsuper__init__num_codebooks
vocab_sizelatentfrom_pretrainedmodeldecoderdecoder_transformerencoderencoder_transformer)selfsample_ratemoder   r   r   	__class__ D/home/ubuntu/.local/lib/python3.10/site-packages/audiocodecs/mimi.pyr      s"   zMimi.__init__c                 C   s  | j r&| jjjj}| jjjj}|| d | j }dd |D }t|}|S | jjjj}| jjjj}|| d | j }dd |D }t|}|d }| jjj	|d d }| jdkr| jjj	|dd  j
dd}|| jd | jd	}t|d  |g}|S |d  }|S )
Nc                 S      g | ]}|j jqS r   codebookembed.0layerr   r   r   
<listcomp>@       zMimi.embs.<locals>.<listcomp>c                 S   r   r   r   r!   r   r   r   r$   F   r%   ).Nr   ).r      )end_dim)r   r   	quantizer"semantic_residual_vector_quantizerlayers"acoustic_residual_vector_quantizerr   torchstackoutput_projflattenreshaper   cat)r   semantic_layersacoustic_layersr+   embsembs_semanticembs_acousticr   r   r   r5   6   sB   






z	Mimi.embsc                 C   s   |j d | }|   }tj||j|jdd  |d d d f k }| jj	|d d d f |d d d f | j
d}|jdd}|S )Nr(   )devicedtype)num_quantizers)shapemaxlongitemr-   aranger8   r9   r   r   r   audio_codesmovedim)r   siglengthabs_lensmax_lenpadding_maskoutputtoksr   r   r   _sig_to_toks]   s"    zMimi._sig_to_toksc                 C   s   | j r9|d d d f }| j|}| j|dd}|d dd}| j|}| jjj|}|	dd}|S |d d d f }| j|}| j|dd}|d dd}| j|}|	dd}|S )Nr&      r   r(   r;   )
r   r   r   r   	transpose
downsampler)   r*   
input_projrB   )r   rC   rD   input_values
embeddingsencoder_outputsfeatsr   r   r   _sig_to_featsp   s&   
zMimi._sig_to_featsc                 C   s*   | j |dd}|jd d df }|S )Nr(   r;   r   )r   r   rB   audio_values)r   rI   rD   rH   rC   r   r   r   _toks_to_sig   s   zMimi._toks_to_sig)r   r   T)__name__
__module____qualname__r   r-   no_gradr5   rJ   rS   rU   __classcell__r   r   r   r   r      s    
&__main__cudacpui'  rK   r   )r   r   r   )r   r   r   
   )r   r   zexample.wav)r   zreconstruction.wav) __doc__r-   audiocodecs.codecr   __all__r   rV   
torchaudior\   is_availabler8   r   
batch_sizer   r   evaltocodeczerosr>   randninputrY   rH   printr<   r5   sig_to_featsloadrC   rec_sigsaver   r   r   r   <module>   sJ   u





	

