o
    Xi2                     @   s   d dl mZ d dlmZmZ d dlZd dlmZmZm	Z	 d dl
mZmZmZ d dlmZmZmZmZ eG dd dZG d	d
 d
eZG dd dejjZG dd dejjZG dd dejjZdS )    )	dataclass)OptionalTupleN)Conv1d	LayerNormLinear)FSMNMultiHeadAttentionFSQVectorQuantizationprecompute_freqs_cis)make_non_pad_maskmask_to_biasmerge_tokenized_segmentsonnx2torch_v3c                   @   sb   e Zd ZU dZeed< dZeed< dZeed< dZeed< d	Z	eed
< dZ
eed< dZeed< dS )ModelConfigV3   n_melsi  n_audio_ctxi   n_audio_state   n_audio_head   n_audio_layeri  n_codebook_sizeFuse_sdpaN)__name__
__module____qualname__r   int__annotations__r   r   r   r   r   r   bool r    r    H/home/ubuntu/.local/lib/python3.10/site-packages/s3tokenizer/model_v3.pyr      s   
 r   c                	       s4   e Zd Z		d	dedededef fddZ  ZS )
MultiHeadAttentionV3   Fn_staten_headkernel_sizer   c                    sJ   t  |||| t||| _t||dd| _t||| _t||| _d S )NF)bias)super__init__r   querykeyvalueout)selfr$   r%   r&   r   	__class__r    r!   r)   *   s
   zMultiHeadAttentionV3.__init__r#   F)r   r   r   r   r   r)   __classcell__r    r    r/   r!   r"   (   s    r"   c                
       sj   e Zd Z		ddedededef fddZ						dd
ejdeej deej deej fddZ	  Z
S )ResidualAttentionBlockV3r#   Fr$   r%   r&   r   c                    sh   t    t||||d| _t|dd| _|d }tjt	||tj
 t	||| _t|dd| _d S )Nr   gh㈵>)eps   )r(   r)   r"   attnr   attn_lntorchnn
Sequentialr   GELUmlpmlp_ln)r.   r$   r%   r&   r   n_mlpr/   r    r!   r)   9   s   
z!ResidualAttentionBlockV3.__init__Nxmaskmask_pad	freqs_cisc                 C   s8   || j | ||||dd  }|| | | }|S )N)rA   rB   rC   r   )r7   r8   r=   r>   )r.   r@   rA   rB   rC   r    r    r!   forwardJ   s   z ResidualAttentionBlockV3.forwardr1   )NNN)r   r   r   r   r   r)   r9   Tensorr   rD   r2   r    r    r/   r!   r3   7   s0    r3   c                       s\   e Zd Zdedededededef fddZd	ejd
ejdeejejf fddZ	  Z
S )AudioEncoderV3r   r$   r%   n_layerstrider   c                    sl   t    || _t|d|dd| _tdddd| _tdd| _tj	
 fddt|D | _d S )	N      )r&   rH   padding   @   i   c                    s   g | ]	}t  d qS )r4   )r3   ).0_r%   r$   r   r    r!   
<listcomp>p   s    z+AudioEncoderV3.__init__.<locals>.<listcomp>)r(   r)   rH   r   conv1conv2r
   rC   r9   r:   
ModuleListrangeblocks)r.   r   r$   r%   rG   rH   r   r/   rP   r!   r)   X   s$   
	zAudioEncoderV3.__init__r@   x_lenreturnc           	   
   C   s8  |j d }t||d}tjj| || }|d d d | j d }|d d d | j d }t||d}tjj| 	|| }|d d d d d }|d d d d d }t||d}|
ddd}| j|j}|dd}t||j}| jD ]}|||d||d|d }q||fS )z
        x : torch.Tensor, shape = (batch_size, n_mels, T)
            the mel spectrogram of the audio
        x_len: torch.Tensor, shape = (batch_size,)
            length of each audio in x
        rJ   rL   r   N)shaper   	unsqueezer9   r:   
functionalgelurR   rH   rS   permuterC   todevice	transposer   dtyperV   size)	r.   r@   rW   TrA   x_slenrC   rB   blockr    r    r!   rD   u   s"   

$zAudioEncoderV3.forward)r   r   r   r   r   r)   r9   rE   r   rD   r2   r    r    r/   r!   rF   V   s&    rF   c                       s   e Zd ZdZe fdedef fddZdejdejde	ejejf fd	d
Z
e dejdejde	ejejf fddZe dejdejdejde	ejejf fddZedd ZdefddZdefddZd fdd	Zdd Z  ZS )S3TokenizerV3zfS3 tokenizer v3 implementation (inference-only).
    Args:
        config (ModelConfigV3): Config
    nameconfigc                    sV   t    || _|| _t| jj| jj| jj| jjd| jj	| _
t| jj| jj| _d S )NrL   )r(   r)   rh   ri   rF   r   r   r   r   r   encoderr	   r   	quantizer)r.   rh   ri   r/   r    r!   r)      s   

zS3TokenizerV3.__init__melmel_lenrX   c                 C   s   |  ||S N)quantize)r.   rl   rm   r    r    r!   rD      s   zS3TokenizerV3.forwardc                 C   sF   d}||k}|  r| |||S | ||\}}| j|}||fS )Ni  )any_quantize_mixed_batchrj   rk   encode)r.   rl   rm   
max_frameslong_audio_maskhiddencode_lencoder    r    r!   ro      s   zS3TokenizerV3.quantizert   c           +      C   s  | d}d}d}d}d}|| | }	|| | }
|	|
 }g }g }g }t|D ]}|| }|| }||  }|st|d d d |f }| }||	k rY|	| }tjj|d|f}|| |tj||j	d ||dddd	 q'd}d}||k rt
||	 |}|d d ||f }| d}||	k r|	| }tjj|d|f}|| |tj||j	d ||d
|d d	 |d7 }||7 }||k s||}|D ]}|d |kr|d r||d< qq'|stj|dtj|j	dtj|tj|j	dfS t|}t|}| ||\}}| j|}i } t|D ]F\}!}|d }|d }||!d ||!  f    }"|sJtj|"tj|j	d}#|#t|"f| |< q|| vrSg | |< | | |" qt|D ])}||  r| | }$d}%t|$||%d}&tj|&tj|j	d}'|'t|&f| |< q`tdd |  D }(tj||(tj|j	d})tj|tj|j	d}*|  D ]\}\}#}|#|)|d |f< ||*|< q|)|*fS )Nr   i>        r6   )r`   FrJ   )	batch_idxis_long_audiosegment_idxtotal_segmentsTrz   r{   r}   )rb   r`      )overlap
token_ratec                 s   s    | ]}|d  V  qdS )rJ   Nr    )rN   	code_infor    r    r!   	<genexpr>-  s    z6S3TokenizerV3._quantize_mixed_batch.<locals>.<genexpr>)rc   rU   itemr9   r:   r\   padappendtensorr`   minzeroslongstackrj   rk   rr   	enumeratecpunumpytolistlenr   maxvaluesitems)+r.   rl   rm   rt   
batch_sizesample_rate
hop_lengthwindow_sizer   frames_per_windowframes_per_overlapframes_per_strideall_segmentsall_segments_lensegment_inforz   	audio_melaudio_mel_lenr{   segmentseg_lenpad_sizestartr|   endr}   infounified_batch_melunified_batch_lensru   rv   codesresultsseg_idxsegment_codecode_tensoraudio_codesr   merged_codesmerged_codes_tensormax_code_lenoutput_codesoutput_codes_lenr    r    r!   rq      s   







z#S3TokenizerV3._quantize_mixed_batchc                 C   s   t |  jS rn   )next
parametersr`   )r.   r    r    r!   r`   <  s   zS3TokenizerV3.device	onnx_pathc                 C   s   t |d d}| j|dd d S )NFstrict)r   load_state_dict)r.   r   ckptr    r    r!   init_from_onnx@  s   zS3TokenizerV3.init_from_onnx	ckpt_pathc                 C   s"   t j|ddd}| j|dd d S )Nr   T)map_locationmmapr   )r9   loadr   )r.   r   r   r    r    r!   init_from_ptE  s   zS3TokenizerV3.init_from_ptTc                    s   t  j||dS )Nr   )r(   r   )r.   
state_dictr   r/   r    r!   r   I  s   zS3TokenizerV3.load_state_dictc                 C   s   |   D ]\}}d|_qd S )NF)named_parametersrequires_grad)r.   rO   paramr    r    r!   freezeN  s   zS3TokenizerV3.freeze)T)r   r   r   __doc__r   strr)   r9   rE   r   rD   inference_modero   rq   propertyr`   r   r   r   r   r2   r    r    r/   r!   rg      s<    

rg   )dataclassesr   typingr   r   r9   s3tokenizer.modelr   r   r   s3tokenizer.model_v2r   r	   r
   s3tokenizer.utilsr   r   r   r   r   r"   r:   Moduler3   rF   rg   r    r    r    r!   <module>   s   <