o
    Xi+M                     @   s<  d Z ddlmZ ddlmZmZmZ ddlZddl	Z	ddl
m  mZ ddlmZ ddl	mZmZ ddlmZmZmZmZ eG d	d
 d
ZG dd dejZG dd dejZG dd dejZd ddZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!dS )!zyModified from https://github.com/openai/whisper/blob/main/whisper/model.py
   Add EuclideanCodebook & VectorQuantization
    )	dataclass)IterableOptionalTupleN	rearrange)Tensornn   )make_non_pad_maskmask_to_bias
onnx2torchmerge_tokenized_segmentsc                   @   sb   e Zd ZU dZeed< dZeed< dZeed< dZeed< d	Z	eed
< dZ
eed< dZeed< dS )ModelConfig   n_melsi  n_audio_ctxi   n_audio_state   n_audio_head   n_audio_layeri   n_codebook_sizeFuse_sdpaN)__name__
__module____qualname__r   int__annotations__r   r   r   r   r   r   bool r    r    E/home/ubuntu/.local/lib/python3.10/site-packages/s3tokenizer/model.pyr      s   
 r   c                   @      e Zd ZdedefddZdS )	LayerNormxreturnc                 C   sJ   t | | j| jd ur| j nd | jd ur| j nd | j|jS N)	F
layer_normfloatnormalized_shapeweightbiasepstypedtypeselfr$   r    r    r!   forward-   s   zLayerNorm.forwardNr   r   r   r   r2   r    r    r    r!   r#   +       r#   c                   @   r"   )Linearr$   r%   c                 C   s2   t || j|j| jd u rd S | j|jS r&   )r'   linearr+   tor/   r,   r0   r    r    r!   r2   9   s   zLinear.forwardNr3   r    r    r    r!   r5   7   r4   r5   c                       s2   e Zd Zdededee def fddZ  ZS )Conv1dr$   r+   r,   r%   c                    s.   t  |||j|d u rd S ||jS r&   )super_conv_forwardr7   r/   )r1   r$   r+   r,   	__class__r    r!   r:   C   s
   
zConv1d._conv_forward)r   r   r   r   r   r:   __classcell__r    r    r;   r!   r8   A   s    
r8   '  c                 C   s   |d dksJ t ||d d  }t| t|d  }t| ddt jf |t jddf  }tjt|t|gddS )z*Returns sinusoids for positional embedding   r   r
   Ndim)	nplogtorchexparangenewaxiscatsincos)lengthchannelsmax_timescalelog_timescale_incrementinv_timescalesscaled_timer    r    r!   	sinusoidsI   s   rQ   c                
       sh   e Zd Zddededef fddZ	dded	ee fd
dZ	ddededed	ee fddZ	  Z
S )MultiHeadAttentionFn_staten_headr   c                    sN   t    || _t||| _t||dd| _t||| _t||| _|| _d S )NF)r,   )	r9   __init__rT   r5   querykeyvalueoutr   )r1   rS   rT   r   r;   r    r!   rU   V   s   

zMultiHeadAttention.__init__Nr$   maskc                 C   s@   |  |}| |}| |}| ||||\}}| ||fS r&   )rV   rW   rX   qkv_attentionrY   )r1   r$   rZ   qkvwvqkr    r    r!   r2   `   s
   


zMultiHeadAttention.forwardr\   r]   r^   c                 C   s|  |j \}}}|| j d }|jg |j d d | jdR  dddd| }|jg |j d d | jdR  }|jg |j d d | jdR  dddd}| js|dddd| }|| }|d urj|| }| }tjjj	|dd
|j}	|	| ddddjdd| fS |dddd| }|d usJ tjjj||||d	d
d}
|
dd |dd|}
|
d fS )Ng      пr?   r   r
      r@   )	start_dimg        g      ?)	attn_mask	dropout_pscale)shaperT   viewpermuter   r)   rD   r	   
functionalsoftmaxr7   r/   flattendetachscaled_dot_product_attention	transpose
contiguoussize)r1   r\   r]   r^   rZ   _Drf   r`   woutputr    r    r!   r[   l   sF   4$0z MultiHeadAttention.qkv_attention)Fr&   )r   r   r   r   r   rU   r   r   r2   r[   r=   r    r    r;   r!   rR   T   s$    
rR   c                       sD   e Zd Zdededef fddZ	ddedee fd	d
Z  Z	S )ResidualAttentionBlockrS   rT   r   c                    sZ   t    t|||d| _t|| _|d }tt||t	 t||| _
t|| _d S )Nr      )r9   rU   rR   attnr#   attn_lnr	   
Sequentialr5   GELUmlpmlp_ln)r1   rS   rT   r   n_mlpr;   r    r!   rU      s   

zResidualAttentionBlock.__init__Nr$   rZ   c                 C   s4   || j | ||dd  }|| | | }|S )N)rZ   r   )ry   rz   r}   r~   )r1   r$   rZ   r    r    r!   r2      s   zResidualAttentionBlock.forwardr&   )
r   r   r   r   r   rU   r   r   r2   r=   r    r    r;   r!   rv      s    rv   c                       sX   e Zd Zdededededededef fdd	Zd
ededeeef fddZ  Z	S )AudioEncoderr   n_ctxrS   rT   n_layerstrider   c                    sp   t    || _t|d|dd| _tdddd| _| dt| t	 fddt
|D | _d S )Nrb   r
   )kernel_sizer   paddingr?   positional_embeddingc                    s   g | ]	}t  d qS )rw   )rv   ).0rr   rT   rS   r   r    r!   
<listcomp>   s    z)AudioEncoder.__init__.<locals>.<listcomp>)r9   rU   r   r8   conv1conv2register_bufferrQ   r	   
ModuleListrangeblocks)r1   r   r   rS   rT   r   r   r   r;   r   r!   rU      s$   

zAudioEncoder.__init__r$   x_lenr%   c                 C   s   t |d}t| || }|d d d | j d }t |d}t| || }|d d d d d }t |d}|ddd}t||j	}|| j
d|jd ddf  |j	}| jD ]
}|||d}qf||fS )z
        x : torch.Tensor, shape = (batch_size, n_mels, T)
            the mel spectrogram of the audio
        x_len: torch.Tensor, shape = (batch_size,)
            length of each audio in x
        r
   r?   r   N)r   	unsqueezer'   gelur   r   r   ri   r   r/   r   rg   r7   r   )r1   r$   r   rZ   blockr    r    r!   r2      s   (
zAudioEncoder.forward)
r   r   r   r   r   rU   r   r   r2   r=   r    r    r;   r!   r      s"    &r   c                       s   e Zd ZdZdedef fddZe dedefdd	Z	e dedefd
dZ
e dd Ze dedefddZe dedefddZe dedefddZ  ZS )EuclideanCodebookzCodebook with Euclidean distance (inference-only).
    Args:
        dim (int): Dimension.
        codebook_size (int): Codebook size.
    rA   codebook_sizec                    s,   t    t||}|| _| d| d S )Nembed)r9   rU   rD   zerosr   r   )r1   rA   r   r   r;   r    r!   rU      s   
zEuclideanCodebook.__init__r$   r%   c                 C   s   t |d}|S )Nz... d -> (...) dr   r0   r    r    r!   
preprocess      
zEuclideanCodebook.preprocessc                 C   sZ   | j  |j}|djdddd| |  |djddd  }|jddj}|S )Nr?   r
   T)keepdimr   ra   r@   )r   tr7   r/   powsummaxindices)r1   r$   r   dist	embed_indr    r    r!   quantize   s   zEuclideanCodebook.quantizec                 C   s   |j |d d  S )Nra   )rh   )r1   r   rg   r    r    r!   postprocess_emb   s   z!EuclideanCodebook.postprocess_embr   c                 C   s   t || j}|S r&   )r'   	embeddingr   r1   r   r   r    r    r!   
dequantize   s   zEuclideanCodebook.dequantizec                 C   s*   |j }| |}| |}| ||}|S r&   )rg   r   r   r   )r1   r$   rg   r   r    r    r!   encode  s
   

zEuclideanCodebook.encodec                 C   s   |  |}|S r&   )r   r   r    r    r!   decode  r   zEuclideanCodebook.decode)r   r   r   __doc__r   rU   rD   inference_moder   r   r   r   r   r   r   r=   r    r    r;   r!   r      s    

r   c                       sj   e Zd ZdZdedef fddZedd Ze	 de
d	e
fd
dZe	 de
d	e
fddZ  ZS )VectorQuantizationzVector quantization implementation (inference-only).
    Args:
        dim (int): Dimension
        codebook_size (int): Codebook size
    rA   r   c                    s"   t    t||d| _|| _d S )N)rA   r   )r9   rU   r   	_codebookr   )r1   rA   r   r;   r    r!   rU     s
   

zVectorQuantization.__init__c                 C   s   | j jS r&   )r   r   r1   r    r    r!   codebook"  s   zVectorQuantization.codebookr$   r%   c                 C   s$   t j| ddd}| j|}|S )Nr?   ra   )prA   )r'   	normalizer)   r   r   )r1   r$   embed_inr    r    r!   r   &  s   zVectorQuantization.encoder   c                 C   s   | j |}t|d}|S )Nzb n d -> b d n)r   r   r   r   r    r    r!   r   ,  s   
zVectorQuantization.decode)r   r   r   r   r   rU   propertyr   rD   r   r   r   r   r=   r    r    r;   r!   r     s    
r   c                       s   e Zd ZdZe fdedef fddZdededeeef fd	d
Z	e
 dededeeef fddZe
 dededededeeef f
ddZedd ZdefddZdefddZdd Z  ZS )S3TokenizerzbS3 tokenizer implementation (inference-only).
    Args:
        config  (ModelConfig): Config
    nameconfigc                    sh   t    || _|| _t| jj| jj| jj| jj| jj	|dkr!dnd| jj
| _t| jj| jj| _d S )Nspeech_tokenizer_v1_25hzr?   r
   )r9   rU   r   r   r   r   r   r   r   r   r   encoderr   r   	quantizer)r1   r   r   r;   r    r!   rU   9  s   
	
zS3Tokenizer.__init__melmel_lenr%   c                 C   s   |  ||S r&   )r   )r1   r   r   r    r    r!   r2   I  s   zS3Tokenizer.forwardc                 C   sH   d}||k}|  r| ||||S | ||\}}| j|}||fS )ap  
        Quantize mel spectrogram to tokens, with automatic long audio handling.

        Args:
            mel: mel spectrogram tensor, shape (batch_size, n_mels, T)
            mel_len: mel length tensor, shape (batch_size,)

        Returns:
            code: quantized tokens, shape (batch_size, T')
            code_len: token length, shape (batch_size,)
        i  )any_quantize_mixed_batchr   r   r   )r1   r   r   
max_frameslong_audio_maskhiddencode_lencoder    r    r!   r   L  s   
zS3Tokenizer.quantizer   r   c           ,      C   s  | d}d}d}d}d}	|| | }
|	| | }|
| }g }g }g }t|D ]}|| }|| }||  }|sr|ddd|f }| }||
k rW|
| }t|d|f}|| |tj||jd ||ddd	d
 q'd}d}||k rt	||
 |}|dd||f }| d	}||
k r|
| }t|d|f}|| |tj||jd ||d|dd
 |d	7 }||7 }||k sz|}|D ]}|d |kr|d r||d< qq'|stj
|dtj|jdtj
|tj|jdfS t|}t|}| ||\}}| j|} i }!t|D ]J\}"}|d }|d }|d }| |"d||"  f    }#|sJtj|#tj|jd}$|$t|#f|!|< q||!vrSg |!|< |!| |# qt|D ]8}||  r|!| }%t| dr|| jdkr|d}&nd}&t|%|	|&d}'tj|'tj|jd}(|(t|'f|!|< q`tdd |! D })tj
||)tj|jd}*tj
|tj|jd}+|! D ]\}\}$}|$|*|d|f< ||+|< q|*|+fS )a  
        Handle mixed batch with both short and long audio using unified batch processing.

        Args:
            mel: mel spectrogram tensor, shape (batch_size, n_mels, T)
            mel_len: mel length tensor, shape (batch_size,)
            long_audio_mask: boolean mask for long audio, shape (batch_size,)
            max_frames: maximum frames for short audio

        Returns:
            code: quantized tokens, shape (batch_size, T')
            code_len: token length, shape (batch_size,)
        r   i>        rx   N)deviceFr
   )	batch_idxis_long_audiosegment_idxtotal_segmentsTr   r   r   )r/   r   r   r   speech_tokenizer_v12      )overlap
token_ratec                 s   s    | ]}|d  V  qdS )r
   Nr    )r   	code_infor    r    r!   	<genexpr>
  s    z4S3Tokenizer._quantize_mixed_batch.<locals>.<genexpr>)rq   r   itemr'   padappendrD   tensorr   minr   longstackr   r   r   	enumeratecpunumpytolistlenhasattrr   r   r   valuesitems),r1   r   r   r   r   
batch_sizesample_rate
hop_lengthwindow_sizer   frames_per_windowframes_per_overlapframes_per_strideall_segmentsall_segments_lensegment_infor   	audio_melaudio_mel_lenr   segmentseg_lenpad_sizestartr   endr   infounified_batch_melunified_batch_lensr   r   codesresultsseg_idxsegment_codecode_tensoraudio_codesr   merged_codesmerged_codes_tensormax_code_lenoutput_codesoutput_codes_lenr    r    r!   r   j  s   





	

z!S3Tokenizer._quantize_mixed_batchc                 C   s   t |  jS r&   )next
parametersr   r   r    r    r!   r     s   zS3Tokenizer.device	onnx_pathc                 C   s   t |d d}| j|dd d S )NFTstrict)r   load_state_dict)r1   r   ckptr    r    r!   init_from_onnx  s   zS3Tokenizer.init_from_onnx	ckpt_pathc                 C   s"   t j|ddd}| j|dd d S )Nr   T)map_locationmmapr  )rD   loadr  )r1   r  r  r    r    r!   init_from_pt"  s   zS3Tokenizer.init_from_ptc                 C   s   |   D ]\}}d|_qd S )NF)named_parametersrequires_grad)r1   rr   paramr    r    r!   freeze&  s   zS3Tokenizer.freeze)r   r   r   r   r   strrU   r   r   r2   rD   r   r   r   r   r   r   r  r
  r  r=   r    r    r;   r!   r   3  s(     

 0
r   )r>   )"r   dataclassesr   typingr   r   r   r   rB   rD   torch.nn.functionalr	   rj   r'   einopsr   r   utilsr   r   r   r   r   r#   r5   r8   rQ   ModulerR   rv   r   r   r   r   r    r    r    r!   <module>   s*   

=94