o
    mi                     @   s^   d Z ddlmZmZ ddlmZ ddlmZ ddlm	Z	 ddl
Z
ddlZG dd dejZdS )	z7
Created on Wed Aug 30 15:47:55 2023
@author: zhangxin
   )SEANetEncoderSEANetDecoder)ResidualVectorQuantizer    N)	rearrangec                       s   e Zd Z fddZededefddZddgfd	ejd
e	de
fddZ	dd	ejde
fddZ		dd	ejd
e	de	fddZ	ddejde	fddZ  ZS )SpeechTokenizerc                    s2  t    t|d|d|d|d|d|d|d|d|d	d
	| _|d| _|d| _t|d| _	|d|dkr[t
|d|d| _nt
 | _t|d|d|dd| _t|d|d|d|dd|d|d|d|d	d
	| _dS )zi
        
        Parameters
        ----------
        config : json
            Model Config.

        	n_filters	dimensionstrideslstm_layersbidirectionaldilation_baseresidual_kernel_sizen_residual_layers
activation)	r   r	   ratioslstmr   r   r   r   r   sample_raten_qsemantic_dimensioncodebook_size)r	   r   binsFN)super__init__r   getencoderr   r   npproddownsample_ratennLinear	transformIdentityr   	quantizerr   decoder)selfconfig	__class__ 5/home/ubuntu/SpeechTokenizer/speechtokenizer/model.pyr      s8   
	
	
"
zSpeechTokenizer.__init__config_path	ckpt_pathc                 C   s^   ddl }t|}||}W d   n1 sw   Y  | |}tj|dd}|| |S )a  

        Parameters
        ----------
        config_path : str
            Path of model configuration file.
        ckpt_path : str
            Path of model  checkpoint.

        Returns
        -------
        model : SpeechTokenizer
            SpeechTokenizer model.

        r   Ncpu)map_location)jsonopenloadtorchload_state_dict)clsr+   r,   r/   fcfgmodelparamsr)   r)   r*   load_from_checkpoint4   s   

z$SpeechTokenizer.load_from_checkpointNr   xr   layersc                 C   s\   |r|n| j }| |}| j|||d\}}}}t|d d}	| |	}	| |}
|
||	fS )a  
        
        Parameters
        ----------
        x : torch.tensor
            Input wavs. Shape: (batch, channels, timesteps).
        n_q : int, optional
            Number of quantizers in RVQ used to encode. The default is all layers.
        layers : list[int], optional
            Layers of RVQ should return quantized result. The default is the first layer.

        Returns
        -------
        o : torch.tensor
            Output wavs. Shape: (batch, channels, timesteps).
        commit_loss : torch.tensor
            Commitment loss from residual vector quantizers.
        feature : torch.tensor
            Output of RVQ's first layer. Shape: (batch, timesteps, dimension)

        )r   r;   r   zb d t -> b t d)r   r   r#   r   r!   r$   )r%   r:   r   r;   e	quantizedcodescommit_lossquantized_listfeatureor)   r)   r*   forwardP   s   



zSpeechTokenizer.forwardc                 C   s:   |  |}|r	|ntt| j}| j||d\}}}}|S )a  

        Parameters
        ----------
        x : torch.tensor
            Input wavs. Shape should be (batch, channels, timesteps).
        layers : list[int], optional
            Layers of RVQ should return quantized result. The default is all layers.

        Returns
        -------
        quantized_list : list[torch.tensor]
            Quantized of required layers.

        )r;   )r   listranger   r#   )r%   r:   r;   r<   r=   r>   r?   r@   r)   r)   r*   forward_featureq   s   
zSpeechTokenizer.forward_featurestc                 C   s:   |  |}|du rd}|r|n| j}| jj|||d}|S )a  

        Parameters
        ----------
        x : torch.tensor
            Input wavs. Shape: (batch, channels, timesteps).
        n_q : int, optional
            Number of quantizers in RVQ used to encode. The default is all layers.
        st : int, optional
            Start quantizer index in RVQ. The default is 0.

        Returns
        -------
        codes : torch.tensor
            Output indices for each quantizer. Shape: (n_q, batch, timesteps)

        Nr   )r   rG   )r   r   r#   encode)r%   r:   r   rG   r<   r>   r)   r)   r*   rH      s   
zSpeechTokenizer.encoder>   c                 C   s   | j j||d}| |}|S )at  

        Parameters
        ----------
        codes : torch.tensor
            Indices for each quantizer. Shape: (n_q, batch, timesteps).
        st : int, optional
            Start quantizer index in RVQ. The default is 0.

        Returns
        -------
        o : torch.tensor
            Reconstruct wavs from codes. Shape: (batch, channels, timesteps)

        )rG   )r#   decoder$   )r%   r>   rG   r=   rB   r)   r)   r*   rI      s   
zSpeechTokenizer.decode)N)NN)r   )__name__
__module____qualname__r   classmethodstrr9   r2   tensorintrD   rC   rF   rH   rI   __classcell__r)   r)   r'   r*   r      sL    %
#

r   )__doc__modules.seanetr   r   quantizationr   torch.nnr   einopsr   r2   numpyr   Moduler   r)   r)   r)   r*   <module>   s   