o
    ~i7                  	   @   s  d Z ddlZddlZddlZddlmZ dgZejej	ddddZ
G d	d deZed
krddlZej r;dndZdZdZdD ]WZeeed eZedkr]eedd neeeeZe ( eeZeej e Zeej edv re eZeej W d   n1 sw   Y  qCe!d\Z"Zee Ze  ee"Z#W d   n1 sw   Y  e$de#e dS dS )z4SemantiCodec (see https://arxiv.org/abs/2405.00233).    N)CodecSemantiCodec~z.cachehuggingfacehubc                       s   e Zd Zg dZg dZ					d  fdd		Ze  fd
dZe dd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Z  ZS )!r   )   2   d   )i       i @  i   reconstructr	   r
   r          @c                    s   z&t jt jt dd tjD } fddtjD t_dd la|t_W n ty1   tdw t 	|d| || _
|| _|| _d| _d| _tj||||td	d
| _|dkrad | j_d S d S )Nc                 S   s   g | ]}|qS  r   .0xr   r   L/home/ubuntu/.local/lib/python3.10/site-packages/audiocodecs/semanticodec.py
<listcomp>1   s    z)SemantiCodec.__init__.<locals>.<listcomp>c                    s   g | ]} |vr|qS r   r   r   root_dirr   r   r   2   s    r   z[`pip install git+https://github.com/haoheliu/SemantiCodec-inference.git` to use this modulei>     r
   )
token_ratesemantic_vocab_sizeddim_sample_step	cfg_scale
cache_pathcpuencode)ospathdirnamerealpath__file__syssemanticodecImportErrorsuper__init__r   r   r   num_codebooksacoustic_vocab_sizer   
_CACHE_DIRtomodeldecoder)selfsample_ratemoder   r   r   r   sys_path	__class__r   r   r&   %   s:   	
zSemantiCodec.__init__c                    s.   | j jjj|i || j j_t j|i |S N)r+   encodercentroid_npyr*   r%   )r-   argskwargsr1   r   r   r*   P   s   
zSemantiCodec.toc                 C   s   | j dkr	tdtt| j  j}tj	| j |d}|d d d d f 
dd| j }| |}tj|j|jd | j dddd}|dd}|S )	Nr
   z.The size of acoustic codebook is fixed to 8192devicedimr      )r   NotImplementedErrornextiterr+   
state_dictvaluesr9   torcharangeexpandr'   clone_token_to_quantized_featurecatsplitshapemovedim)r-   r9   toksembsr   r   r   rN   X   s   
 
zSemantiCodec.embsc                 C      |  |}|S r3   )_encode)r-   siglengthrM   r   r   r   _sig_to_toksi      
zSemantiCodec._sig_to_toksc                 C   rO   r3   )_encode_unquantized)r-   rQ   rR   featsr   r   r   _sig_to_featso   rT   zSemantiCodec._sig_to_featsc                 C   s   |  |d d df }|S )Nr   )_decode)r-   rM   rR   rQ   r   r   r   _toks_to_sigu   s   zSemantiCodec._toks_to_sigc                 C   sb   |d |d }}| j j|}|jd |jd }}| j jj|d||}tj||gddS )N).r   ).r>   r=   r:   r;   )	r+   r4   unquantrK   	quantizerget_output_from_indicesreshaperD   rI   )r-   tokenssemantic_tokensacoustic_tokenssemantic_feature	token_numfeature_dimacoustic_featurer   r   r   rH   {   s   

z(SemantiCodec._token_to_quantized_featurec           	         s(  |j d tjj }|tjj|tjj   }d| tjj | jj }ttjjtjj }|j d | |k rIt||j d |  }t	j
j|d|g}tjjt|j d |    fdd|D }t	|}|j d dkrt|j d d	 dksvJ | j||j}|d d d tjj|d d f }|S )
Nr>      r   c                    *   g | ]}t jj|d  t jj dd qS N)target_lengthta_kaldi_fbankr#   mainextract_kaldi_fbank_featureSAMPLE_RATEr   mel_target_lengthr   r   r          z(SemantiCodec._encode.<locals>.<listcomp>r:      r=      )rK   r#   rk   rm   AUDIOMAE_PATCH_DURATIONr+   stack_factor_KintSEGMENT_DURATIONrD   nn
functionalpadMEL_TARGET_LENGTHstackr4   r*   r9   mathceil)	r-   waveformoriginal_durationtarget_token_lensegment_sample_lengthdiffmelsmelr^   r   rn   r   rP      sB   


$$zSemantiCodec._encodec           	         s&  |j d tjj }|tjj|tjj   }d| tjj | jj }ttjjtjj }|j d | |k rIt||j d |  }t	j
j|d|g}tjjt|j d |    fdd|D }t	|}|j d dkrt|j d d	 dksvJ | ||j}|d d d tjj|d d f }|S )
Nr>   re   r   c                    rf   rg   rj   r   rn   r   r   r      rp   z4SemantiCodec._encode_unquantized.<locals>.<listcomp>r:   rq   r=   rr   )rK   r#   rk   rm   rs   r+   rt   ru   rv   rD   rw   rx   ry   rz   r{   _encoder_forwardr*   r9   r|   r}   )	r-   r~   r   r   r   r   r   r   rV   r   rn   r   rU      sB   


$$z SemantiCodec._encode_unquantizedc           
   	   C   s  | j jj|td| j j tjjd}g }t|D ]A\}}| 	|}t
j|t
|jd td| j j |jd  |jd |jd gdd}| j jj|| j j| j jd}|| qtjj|tjjtjj d	}|jd d
 d d | j j }	t
j|dd t|	tjj f |jdS )Ni   )window_lengthoverlapr   r>   r   r:   r;   )
ddim_stepsunconditional_guidance_scale)overlap_durationre      g{Gz?.r8   )r+   r4   long_token_split_windowru   rt   r#   rk   SEGMENT_OVERLAP_RATIO	enumeraterH   rD   rI   onesrK   r*   r9   r,   generate_sampler   r   appendoverlap_add_waveformrv   	as_tensorrm   )
r-   r^   windowed_token_listwindowed_waveform_windowed_tokenlatentr~   outputtrim_durationr   r   r   rX      sL   
	zSemantiCodec._decodec                 C   s   | ddkr| dd dksJ | jjjd u r-|j| jj_| jjj| jjj| jj_d}d}| d}g }|| |kru|d d ||| d d f }t  | |}|	| W d    n1 sfw   Y  ||7 }|| |ks>tj
|ddS )Nr:   rq   r=   rr   r   r>   r;   )sizer+   r4   r9   r5   r*   rD   no_grad_encoder_forward_innerr   rI   )r-   batchr   current_starttotal_length_batch
feats_listcurrent_batchr   r   r   r   r      s&   $




zSemantiCodec._encoder_forwardc                 C   s   | ddkr| ddksJ | jjjd u r+|j| jj_| jjj| jjj| jj_|d}g }|jd }t|jd D ]f}t	j
||df dddk}z)| rs|d	k}t	j|d	d
 }| dkrl|d  }nd}|d }	n|}	W n& ty }
 zdd l}|  t| t|  d}	W Y d }
~
nd }
~
ww ||	|  q>t	 M | jjj|| jjj| jjjd}| jjjdkr| jj|}|dddddddd}n|d d dd d d f }W d    |S W d    |S 1 sw   Y  |S )Nr=   rr   r:   rq   r>   r   r;   gHz>F)as_tuple)no_mask
no_average   r   )r   r+   r4   r9   r5   r*   	unsqueezerK   rangerD   stdanynonzerosqueezenumelitem	Exception	traceback	print_excprintr   r   audiomaeno_audiomae_maskno_audiomae_averagedownsampling_rateconcatepermuteflatten)r-   r   padding_cutoff_indextemporal_dimiactive_index
int_tensorfalse_indiceslast_false_index
column_maxer   representationr   r   r   r     sb    







z#SemantiCodec._encoder_forward_inner)r   r	   r
   r   r   )__name__
__module____qualname__TOKEN_RATESSEMANTIC_VOCAB_SIZESr&   rD   r   r*   rN   rS   rW   rY   rH   rP   rU   rX   r   r   __classcell__r   r   r1   r   r   !   s,    +
	**'__main__cudar   i'  r   )r   decoder   )r/   r   
   )r   r   zexample.wavzreconstruction.wav)%__doc__r   r"   rD   audiocodecs.codecr   __all__r   join
expanduserr)   r   r   
torchaudior   is_availabler9   r.   
batch_sizer/   evalr*   codeczeroslongrandninputr   r   r   rK   rN   sig_to_featsloadrQ   rec_sigsaver   r   r   r   <module>   sP     8





	

