o
    Xi+U                  
   @   sH  d dl mZ d dlmZmZ d dlZd dlmZ d dlm	Z	m
Z
mZmZ d dlmZmZmZmZ eG dd dZ			d$d
ededefddZdejdejdejdeejejf fddZdejdejfddZG dd dejjZG dd dejjZG dd deZG dd dejjZG d d! d!ejjZG d"d# d#ejjZ dS )%    )	dataclass)OptionalTupleN	rearrange)Conv1d	LayerNormLinearMultiHeadAttention)make_non_pad_maskmask_to_bias
onnx2torchmerge_tokenized_segmentsc                   @   sb   e Zd ZU dZeed< dZeed< dZeed< dZeed< d	Z	eed
< dZ
eed< dZeed< dS )ModelConfig   n_melsi  n_audio_ctxi   n_audio_state   n_audio_head   n_audio_layer  n_codebook_sizeFuse_sdpaN)__name__
__module____qualname__r   int__annotations__r   r   r   r   r   r   bool r!   r!   H/home/ubuntu/.local/lib/python3.10/site-packages/s3tokenizer/model_v2.pyr      s   
 r        @dimendthetac                 C   s~   d|t d| dd | d   |    }t j||jd}|d ur%|| }t || }t t ||}t j||fddS )N      ?r      devicer$   )torcharangefloatr*   outerpolar	ones_likecat)r$   r%   r&   scalingfreqst	freqs_cisr!   r!   r"   precompute_freqs_cis%   s   *r8   xqxkr7   returnc                 C   sX  t |}|d d d d df |d d d d df }}|dd| j}|dd| j}| jd }| d d d d d d d |d f | d d d d d d |d d f }}t j| |fdd}	|jd }|d d d d d d d |d f |d d d d d d |d d f }}t j| |fdd}
| | |	|  || |
|  fS )Nr      r(   r+   r,   )r-   view_as_real	unsqueezetodtypeshaper3   )r9   r:   r7   realcossinDhalf_lhalf_rxq_rxk_rr!   r!   r"   apply_rotary_emb3   s   
.
J
J rJ   xc                    sb   |j  dd  kr k sJ  J | j|jd |jd fks J  fddt|jD }| j| S )Nr   r<   r+   c                    s,   g | ]\}}|d ks| d  kr|nd qS )r<   r!   ).0idndimr!   r"   
<listcomp>M   s     z)reshape_for_broadcast.<locals>.<listcomp>)rP   rA   	enumerateview)r7   rK   rA   r!   rO   r"   reshape_for_broadcastI   s   

rT   c                       s   e Zd Zddedef fddZe dejdejfdd	Ze dejdejfd
dZ	e dejdejfddZ
  ZS )FSQCodebook   r$   levelc                    s*   t    tj|d| _|| _d | _d S )N   )super__init__r-   nnr	   project_downrW   embed)selfr$   rW   	__class__r!   r"   rZ   U   s   

zFSQCodebook.__init__rK   r;   c                 C   s   t |d}|S )Nz... d -> (...) dr   r^   rK   r!   r!   r"   
preprocess[   s   
zFSQCodebook.preprocessc                 C   s   |j }| |}| | }| }|d }| d }t| jtj	d| j |j
|jd}tj||d dd}||d |d  }|S )Ng   ?r<   r(   )r*   r@   r   r+   r,   )rA   rb   r\   r/   tanhroundr-   powrW   r.   r*   r@   sumr>   reshaper   )r^   rK   x_shapehpowersmuindr!   r!   r"   encode`   s   
zFSQCodebook.encode	embed_indc                 C   s   t d)Nz2There is no official up project component provided)NotImplementedError)r^   rn   r!   r!   r"   decoder   s   zFSQCodebook.decode)rV   )r   r   r   r   rZ   r-   inference_modeTensorrb   rm   rp   __classcell__r!   r!   r_   r"   rU   S   s     rU   c                       sr   e Zd ZdZdedef fddZedd Ze	 dej
d	ej
fd
dZe	 dej
d	ej
fddZ  ZS )FSQVectorQuantizationzVector quantization implementation (inference-only).
    Args:
        dim (int): Dimension
        codebook_size (int): Codebook size
    r$   codebook_sizec                    s.   t    d|ksJ t|dd| _|| _d S )Nr   rV   )r$   rW   )rY   rZ   rU   	_codebookru   )r^   r$   ru   r_   r!   r"   rZ      s   

zFSQVectorQuantization.__init__c                 C   s   | j jS N)rv   r]   r^   r!   r!   r"   codebook   s   zFSQVectorQuantization.codebookrK   r;   c                 C   s   | j |S rw   )rv   rm   ra   r!   r!   r"   rm         zFSQVectorQuantization.encodern   c                 C   s   | j |}t|d}|S )Nzb n d -> b d n)rv   rp   r   )r^   rn   quantizer!   r!   r"   rp      s   
zFSQVectorQuantization.decode)r   r   r   __doc__r   rZ   propertyry   r-   rq   rr   rm   rp   rs   r!   r!   r_   r"   rt   x   s    

 rt   c                       s   e Zd Z		ddedededef fddZ		dd
ejdeej fddZ							ddejdejdejdeej deej deej fddZ
						ddejdeej deej deej fddZ  ZS )FSMNMultiHeadAttention   Fn_staten_headkernel_sizer   c              	      sz   t  || tjj|||dd|dd| _|d d | _|d | j | _tj| j| jfd| _	|| _
t||dd| _d S )Nr<   r   F)stridepaddinggroupsbiasr(           )r   )rY   rZ   r-   r[   r   
fsmn_blockleft_paddingright_paddingConstantPad1dpad_fnr   r	   key)r^   r   r   r   r   r_   r!   r"   rZ      s    zFSMNMultiHeadAttention.__init__Ninputsmaskc                 C   sx   |  \}}}}|||d}|d ur| ddkr|| }|dd}| |}| |}|dd}||7 }|| S )Nr+   r(   r   r<   )sizerS   	transposer   r   )r^   r   r   br6   _rK   r!   r!   r"   forward_fsmn   s   

z#FSMNMultiHeadAttention.forward_fsmnqkvmask_padr7   c                 C   s  |j \}}}|| j d }	|jg |j d d | jdR  }|jg |j d d | jdR  }|jg |j d d | jdR  }|d urPt|||d\}}| ||}
|dddd|	 }|dddd}| js|dddd|	 }|| }|d ur|| }| }tj	j
j|dd|j}|| ddddjdd	| |
fS |dddd|	 }|d usJ tj	j
j||||d
dd}|dd |dd|}|d |
fS )Ng      пr(   r+   )r7   r   r<   rV   r,   )	start_dimr   r'   )	attn_mask	dropout_pscale)rA   r   rS   rJ   r   permuter   r/   r-   r[   
functionalsoftmaxr?   r@   flattendetachscaled_dot_product_attentionr   
contiguousr   )r^   r   r   r   r   r   r7   r   rE   r   
fsm_memoryqkwoutputr!   r!   r"   qkv_attention   sP   $$$
z$FSMNMultiHeadAttention.qkv_attentionrK   c                 C   sJ   |  |}| |}| |}| ||||||\}}	}
| ||
 |	fS rw   )queryr   valuer   out)r^   rK   r   r   r7   r   r   r   wvr   r   r!   r!   r"   forward   s   



zFSMNMultiHeadAttention.forwardr   Frw   NNN)r   r   r   r   r    rZ   r-   rr   r   r   r   r   rs   r!   r!   r_   r"   r~      s\    

0r~   c                
       sj   e Zd Z		ddedededef fddZ						dd
ejdeej deej deej fddZ	  Z
S )ResidualAttentionBlockr   Fr   r   r   r   c                    sd   t    t||||d| _t|dd| _|d }tjt	||tj
 t	||| _t|| _d S )Nr   gh㈵>)eps   )rY   rZ   r~   attnr   attn_lnr-   r[   
Sequentialr	   GELUmlpmlp_ln)r^   r   r   r   r   n_mlpr_   r!   r"   rZ      s   
zResidualAttentionBlock.__init__NrK   r   r   r7   c                 C   s8   || j | ||||dd  }|| | | }|S )N)r   r   r7   r   )r   r   r   r   )r^   rK   r   r   r7   r!   r!   r"   r     s   zResidualAttentionBlock.forwardr   r   )r   r   r   r   r    rZ   r-   rr   r   r   rs   r!   r!   r_   r"   r      s0    r   c                       s\   e Zd Zdedededededef fddZd	ejd
ejdeejejf fddZ	  Z
S )AudioEncoderV2r   r   r   n_layerr   r   c                    sl   t    || _t|d|dd| _tdddd| _tdd| _tj	
 fddt|D | _d S )	NrV   r<   )r   r   r   r(   @   i   c                    s   g | ]	}t  d qS )r   )r   )rL   r   r   r   r   r!   r"   rQ   <  s    z+AudioEncoderV2.__init__.<locals>.<listcomp>)rY   rZ   r   r   conv1conv2r8   r7   r-   r[   
ModuleListrangeblocks)r^   r   r   r   r   r   r   r_   r   r"   rZ   %  s$   
	zAudioEncoderV2.__init__rK   x_lenr;   c              
   C   s  |j d }t||d}tjj| || }|d d d | j d }|d d d | j d }t||d}tjj| 	|| }|d d d d d }|d d d | j d }t||d}|
ddd}| j|j}|dd}t||j}t|}|dddddf |dddddf }	}
tj|	|	fdd}	tj|
|
fdd}
|	dd}	|
dd}
| jD ]}|||d||d|d }q||fS )z
        x : torch.Tensor, shape = (batch_size, n_mels, T)
            the mel spectrogram of the audio
        x_len: torch.Tensor, shape = (batch_size,)
            length of each audio in x
        r+   r<   r(   r   Nr,   )rA   r   r>   r-   r[   r   gelur   r   r   r   r7   r?   r*   r   r   r@   r=   r3   r   r   )r^   rK   r   Tr   x_slenr7   r   tmprC   rD   blockr!   r!   r"   r   A  s.   

.
$zAudioEncoderV2.forward)r   r   r   r   r    rZ   r-   rr   r   r   rs   r!   r!   r_   r"   r   #  s&    r   c                       s   e Zd ZdZe fdedef fddZdejdejde	ejejf fd	d
Z
e dejdejde	ejejf fddZe dejdejdejdede	ejejf f
ddZedd ZdefddZdefddZdd Z  ZS )S3TokenizerV2zdS3 tokenizer v2 implementation (inference-only).
    Args:
        config (ModelConfig): Config
    nameconfigc                    sp   t    || _d|vrd|v sJ d|_|| _t| jj| jj| jj| jj	d| jj
| _t| jj| jj| _d S )Nv1v2r   r(   )rY   rZ   r   r   r   r   r   r   r   r   r   encoderrt   	quantizer)r^   r   r   r_   r!   r"   rZ   l  s$   

zS3TokenizerV2.__init__melmel_lenr;   c                 C   s   |  ||S rw   )r{   )r^   r   r   r!   r!   r"   r     rz   zS3TokenizerV2.forwardc                 C   sH   d}||k}|  r| ||||S | ||\}}| j|}||fS )ap  
        Quantize mel spectrogram to tokens, with automatic long audio handling.

        Args:
            mel: mel spectrogram tensor, shape (batch_size, n_mels, T)
            mel_len: mel length tensor, shape (batch_size,)

        Returns:
            code: quantized tokens, shape (batch_size, T')
            code_len: token length, shape (batch_size,)
        i  )any_quantize_mixed_batchr   r   rm   )r^   r   r   
max_frameslong_audio_maskhiddencode_lencoder!   r!   r"   r{     s   
zS3TokenizerV2.quantizer   r   c           ,      C   s  | d}d}d}d}d}	|| | }
|	| | }|
| }g }g }g }t|D ]}|| }|| }||  }|st|ddd|f }| }||
k rY|
| }tjj|d|f}|| |tj||j	d ||ddd	d
 q'd}d}||k rt
||
 |}|dd||f }| d	}||
k r|
| }tjj|d|f}|| |tj||j	d ||d|dd
 |d	7 }||7 }||k s||}|D ]}|d |kr|d r||d< qq'|stj|dtj|j	dtj|tj|j	dfS t|}t|}| ||\}}| j|} i }!t|D ]J\}"}|d }|d }|d }| |"d||"  f    }#|sNtj|#tj|j	d}$|$t|#f|!|< q||!vrWg |!|< |!| |# qt|D ])}||  r|!| }%d}&t|%|	|&d}'tj|'tj|j	d}(|(t|'f|!|< qdtdd |! D })tj||)tj|j	d}*tj|tj|j	d}+|! D ]\}\}$}|$|*|d|f< ||+|< q|*|+fS )a  
        Handle mixed batch with both short and long audio using unified batch processing.

        Args:
            mel: mel spectrogram tensor, shape (batch_size, n_mels, T)
            mel_len: mel length tensor, shape (batch_size,)
            long_audio_mask: boolean mask for long audio, shape (batch_size,)
            max_frames: maximum frames for short audio

        Returns:
            code: quantized tokens, shape (batch_size, T')
            code_len: token length, shape (batch_size,)
        r   i>        r   Nr)   Fr<   )	batch_idxis_long_audiosegment_idxtotal_segmentsTr   r   r   )r@   r*   r      )overlap
token_ratec                 s   s    | ]}|d  V  qdS )r<   Nr!   )rL   	code_infor!   r!   r"   	<genexpr>B  s    z6S3TokenizerV2._quantize_mixed_batch.<locals>.<genexpr>)r   r   itemr-   r[   r   padappendtensorr*   minzeroslongstackr   r   rm   rR   cpunumpytolistlenr   maxvaluesitems),r^   r   r   r   r   
batch_sizesample_rate
hop_lengthwindow_sizer   frames_per_windowframes_per_overlapframes_per_strideall_segmentsall_segments_lensegment_infor   	audio_melaudio_mel_lenr   segmentseg_lenpad_sizestartr   r%   r   infounified_batch_melunified_batch_lensr   r   codesresultsseg_idxsegment_codecode_tensoraudio_codesr   merged_codesmerged_codes_tensormax_code_lenoutput_codesoutput_codes_lenr!   r!   r"   r     s   





	

z#S3TokenizerV2._quantize_mixed_batchc                 C   s   t |  jS rw   )next
parametersr*   rx   r!   r!   r"   r*   R  s   zS3TokenizerV2.device	onnx_pathc                 C   s   t |d d}| j|dd d S )NFTstrict)r   load_state_dict)r^   r  ckptr!   r!   r"   init_from_onnxV  s   zS3TokenizerV2.init_from_onnx	ckpt_pathc                 C   s"   t j|ddd}| j|dd d S )Nr   T)map_locationmmapr  )r-   loadr  )r^   r  r  r!   r!   r"   init_from_ptZ  s   zS3TokenizerV2.init_from_ptc                 C   s   |   D ]\}}d|_qd S )NF)named_parametersrequires_grad)r^   r   paramr!   r!   r"   freeze^  s   zS3TokenizerV2.freeze)r   r   r   r|   r   strrZ   r-   rr   r   r   rq   r{   r   r   r}   r*   r  r  r"  rs   r!   r!   r_   r"   r   f  s@    
 .
r   )r#   N)!dataclassesr   typingr   r   r-   einopsr   s3tokenizer.modelr   r   r	   r
   s3tokenizer.utilsr   r   r   r   r   r   r/   r8   rr   rJ   rT   r[   ModulerU   rt   r~   r   r   r   r!   r!   r!   r"   <module>   s>   


% e&C