o
    ~i                  	   @   s  d Z ddlZddlZddlZddlmZ ddlmZ dgZG dd deZ	e
dkrddlZej r4dnd	Zd
ZdZdZdD ]XZe	eeed eZedkrYeede neeeeZe ( eeZeej e Zeej edv reeZeej W d   n1 sw   Y  q>e d\Z!Ze	eed Ze  ee!Z"W d   n1 sw   Y  e#de"e dS dS )z7SpeechTokenizer (see https://arxiv.org/abs/2308.16692).    N)snapshot_download)CodecSpeechTokenizerc                       sJ   e Zd Z		d fdd	Ze dd Zdd Zd	d
 Zdd Z	  Z
S )r   reconstruct   c           
         s   z&t jt jt dd tjD } fddtjD t_dd l}|t_W n ty1   tdw t 	|d| || _
d}t|d}t j|d	d
}t j|d	d}	|j||	| _|dkrfd | j_d S |dkrtd | j_d | j_d S d S )Nc                 S   s   g | ]}|qS  r   .0xr   r   O/home/ubuntu/.local/lib/python3.10/site-packages/audiocodecs/speechtokenizer.py
<listcomp>)   s    z,SpeechTokenizer.__init__.<locals>.<listcomp>c                    s   g | ]} |vr|qS r   r   r   root_dirr   r   r   *   s    r   z0`pip install speechtokenizer` to use this modulei>  zfnlp/SpeechTokenizer)repo_idspeechtokenizer_hubert_avgzconfig.jsonzSpeechTokenizer.ptencodedecode)ospathdirnamerealpath__file__sysspeechtokenizerImportErrorsuper__init__num_codebooksr   joinr   load_from_checkpointmodeldecoderencoder	transform)
selfsample_ratemoder   sys_pathr   sourcer   config_pathcheckpoint_path	__class__r   r   r       s4   

zSpeechTokenizer.__init__c           	      C   s   d}t t| j  j}tj||d}|d d d d f | j	dd
 }g }t|D ]\}}| jjjj| }||}|| q,| jj|t|k sRJ t|d }|S )Ni   )device).r   )nextiterr    
state_dictvaluesr-   torcharangeexpandr   clone	enumerate	quantizervqlayersr   appendsumallstack)	r$   
vocab_sizer-   toksembsiindiceslayer	quantizedr   r   r   rA   E   s    
zSpeechTokenizer.embsc                 C   s2   | j |d d d f d | j }|dd}|S )Nr.   )r    r   r   movedim)r$   siglengthr@   r   r   r   _sig_to_toksX   s   "zSpeechTokenizer._sig_to_toksc                 C   s(   | j |d d d f }|dd}|S )Nr.   )r    r"   rG   )r$   rH   rI   featsr   r   r   _sig_to_feats_   s   zSpeechTokenizer._sig_to_featsc                 C   s(   | dd}| j|d d df }|S )Nr.   rF   r   )rG   r    r   )r$   r@   rI   rH   r   r   r   _toks_to_sigf   s   zSpeechTokenizer._toks_to_sig)r   r   )__name__
__module____qualname__r   r3   no_gradrA   rJ   rM   rN   __classcell__r   r   r+   r   r      s    %
__main__cudacpui'     r   )r   r   r   )r&   r   r   
   )r   r   zexample.wav)r   zreconstruction.wav)$__doc__r   r   r3   huggingface_hubr   audiocodecs.codecr   __all__r   rO   
torchaudiorU   is_availabler-   r%   
batch_sizer   r&   evaltocodeczeroslongrandninputrR   outputprintshaperA   sig_to_featsloadrH   rec_sigsaver   r   r   r   <module>   s\   O





	

