o
    Si2                     @   s   d Z ddlZddlmZ ddlZddlZddlZddlm	Z	 ddl
mZ ddl
mZ ddlmZmZmZ d	Zejejejej f ZG d
d de	jZG dd de	jZdd Zedkrae  dS dS )zEnCodec model implementation.    N)Path)nn   )quantization)modules)_check_checksum_linear_overlap_add_get_checkpoint_urlz*https://dl.fbaipublicfiles.com/encodec/v0/c                       sZ   e Zd ZdZddededef fdd	Z	ddejdej	ej
ej  defddZ  ZS )LMModelac  Language Model to estimate probabilities of each codebook entry.
    We predict all codebooks in parallel for a given time step.

    Args:
        n_q (int): number of codebooks.
        card (int): codebook cardinality.
        dim (int): transformer dimension.
        **kwargs: passed to `encodec.modules.transformer.StreamingTransformerEncoder`.
              n_qcarddimc                    sv   t     | _|| _| _tjddi|| _t	 fddt
|D | _t	 fddt
|D | _d S )Nr   c                    s   g | ]
}t  d  qS )r   )r   	Embedding.0_r   r    A/home/ubuntu/.local/lib/python3.10/site-packages/encodec/model.py
<listcomp>+   s    z$LMModel.__init__.<locals>.<listcomp>c                    s   g | ]}t  qS r   )r   Linearr   r   r   r   r   ,   s    r   )super__init__r   r   r   mStreamingTransformerEncodertransformerr   
ModuleListrangeemblinears)selfr   r   r   kwargs	__class__r   r   r   %   s   
 $zLMModel.__init__Nr   indicesstatesoffsetc           	         s~    j \}}}t fddt|D }|||\}}tjfddt|D dddddd}tj|dd||fS )	a  
        Args:
            indices (torch.Tensor): indices from the previous time step. Indices
                should be 1 + actual index in the codebook. The value 0 is reserved for
                when the index is missing (i.e. first time step). Shape should be
                `[B, n_q, T]`.
            states: state for the streaming decoding.
            offset: offset of the current time step.

        Returns a 3-tuple `(probabilities, new_states, new_offset)` with probabilities
        with a shape `[B, card, n_q, T]`.

        c                    s&   g | ]}j |  d d |f qS N)r!   r   k)r'   r#   r   r   r   >   s   & z#LMModel.forward.<locals>.<listcomp>c                    s   g | ]	}j |  qS r   )r"   r+   )outr#   r   r   r   @   s    r   )r   r         )shapesumr    r   torchstackpermutesoftmax)	r#   r'   r(   r)   BKTinput_logitsr   )r'   r-   r#   r   forward.   s
   .zLMModel.forward)r   r   r   )Nr   )__name__
__module____qualname____doc__intr   r2   TensortpOptionalListr;   __classcell__r   r   r%   r   r
      s    	

r
   c                       s  e Zd ZdZ				d;dejdejdejd	e	j
e d
ededede	je dedef fddZede	je fddZede	je fddZdejde	j
e fddZdejdefddZde	j
e dejfddZdedejfd d!Zdejdejfd"d#Zd$efd%d&Zdefd'd(Ze	)	*	+	,			d<d	e	j
e d
eded-ed.ed/ede	je defd0d1Z ed=d2ed3e	je! fd4d5Z"ed>d6ed3e	je! fd7d8Z#ed>d6ed3e	je! fd9d:Z$  Z%S )?EncodecModelav  EnCodec model operating on the raw waveform.
    Args:
        target_bandwidths (list of float): Target bandwidths.
        encoder (nn.Module): Encoder network.
        decoder (nn.Module): Decoder network.
        sample_rate (int): Audio sample rate.
        channels (int): Number of audio channels.
        normalize (bool): Whether to apply audio normalization.
        segment (float or None): segment duration in sec. when doing overlap-add.
        overlap (float): overlap between segment, given as a fraction of the segment duration.
        name (str): name of the model, used as metadata when compressing audio.
    FN{Gz?unsetencoderdecoder	quantizertarget_bandwidthssample_ratechannels	normalizesegmentoverlapnamec                    s   t    d | _|| _|| _|| _|| _|| _|| _|| _	|| _
|	| _t| jt| jj | _|
| _tt| jj| _d| j | jjksKJ dd S )Nr/   z$quantizer bins must be a power of 2.)r   r   	bandwidthrL   rI   rK   rJ   rM   rN   rO   rP   rQ   mathceilnpprodratios
frame_raterR   r@   log2binsbits_per_codebook)r#   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   r%   r   r   r   Q   s"   
zEncodecModel.__init__returnc                 C   s   | j d u rd S t| j | j S r*   )rP   r@   rM   r#   r   r   r   segment_lengthm   s   
zEncodecModel.segment_lengthc                 C   s*   | j }|d u r	d S tdtd| j | S )Nr   )r_   maxr@   rQ   )r#   r_   r   r   r   segment_strides   s   zEncodecModel.segment_stridexc           
      C   s   |  dksJ |j\}}}|dkr|dksJ | j}|du r$|}|}n	| j}|dus-J g }td||D ]}|dddd||| f }	|| |	 q5|S )aG  Given a tensor `x`, returns a list of frames containing
        the discrete encoded codes for `x`, along with rescaling factors
        for each segment, when `self.normalize` is True.

        Each frames is a tuple `(codebook, scale)`, with `codebook` of
        shape `[B, K, T]`, with `K` the number of codebooks.
        r.   r   r/   N)r   r0   r_   ra   r    append_encode_frame)
r#   rb   r   rN   lengthr_   strideencoded_framesr)   framer   r   r   encodez   s   zEncodecModel.encodec           	      C   s   |j d }|| j }| jd u s|d| j ksJ | jr=|jddd}|djddd }d| }|| }|dd}nd }| |}| j	
|| j| j}|dd}||fS )	Ngh㈵>r   T)r   keepdimr/   g:0yE>r   )r0   rM   rP   rO   meanpowsqrtviewrI   rK   ri   rY   rS   	transpose)	r#   rb   re   durationmonovolumescaler!   codesr   r   r   rd      s   


zEncodecModel._encode_framerg   c                    sN    j }|du rt|dksJ  |d S  fdd|D }t| jp%dS )zDecode the given frames into a waveform.
        Note that the output might be a bit bigger than the input. In that case,
        any extra steps at the end can be trimmed.
        Nr   r   c                    s   g | ]}  |qS r   )_decode_frame)r   rh   r^   r   r   r      s    z'EncodecModel.decode.<locals>.<listcomp>)r_   lenrv   r   ra   )r#   rg   r_   framesr   r^   r   decode   s   zEncodecModel.decodeencoded_framec                 C   sH   |\}}| dd}| j|}| |}|d ur"||ddd }|S )Nr   r   rj   )rp   rK   ry   rJ   ro   )r#   rz   ru   rt   r!   r-   r   r   r   rv      s   
zEncodecModel._decode_framec                 C   s0   |  |}| |d d d d d |jd f S )Nrj   )ri   ry   r0   )r#   rb   rx   r   r   r   r;      s   
&zEncodecModel.forwardrS   c                 C   s,   || j vrtd| d| j  d|| _d S )Nz)This model doesn't support the bandwidth z. Select one of .)rL   
ValueErrorrS   )r#   rS   r   r   r   set_target_bandwidth   s
   



z!EncodecModel.set_target_bandwidthc                 C   s   t d t|  j}t| jj| jjddt	d| j
 d|}ddd}z|| j }W n ty8   td	w tt|}t jj|d
dd}|| |  |S )zHReturn the associated LM model to improve the compression rate.
        i     r   g      @)
num_layersr   past_contextzencodec_lm_24khz-1608e3c0.thzencodec_lm_48khz-7add9fc3.thencodec_24khzencodec_48khzz0No LM pre-trained for the current Encodec model.cpuTmap_location
check_hash)r2   manual_seednext
parametersdevicer
   rK   r   r[   r@   rY   torR   KeyErrorRuntimeErrorr	   ROOT_URLhubload_state_dict_from_urlload_state_dicteval)r#   r   lmcheckpointscheckpoint_nameurlstater   r   r   get_lm_model   s,   


zEncodecModel.get_lm_model]  r   Tweight_normcausal
model_normaudio_normalizec                 C   st   t j|||d}t j|||d}	td| d  t||j d  }
tj|j	|
dd}t
||	|| |||||d	}|S )N)rN   normr   i  rj   
   r   )	dimensionr   r[   )rO   rP   rR   )r   SEANetEncoderSEANetDecoderr@   rT   rU   
hop_lengthqtResidualVectorQuantizerr   rF   )rL   rM   rN   r   r   r   rP   rR   rI   rJ   r   rK   modelr   r   r   
_get_model   s(   	$zEncodecModel._get_modelr   
repositoryc                 C   sf   |d ur%|  st| d||  }|jdd }t|| t|S tt| }tj	j
|dddS )Nz must exist and be a directory.-r   r   Tr   )is_dirr|   stemsplitr   r2   loadr	   r   r   r   )r   r   filechecksumr   r   r   r   _get_pretrained   s   


zEncodecModel._get_pretrained
pretrainedc              	   C   sf   |r| sJ g d}d}d}d}t j|||ddd| rdnd	d
}| r-t ||}|| |  |S )z2Return the pretrained causal 24khz model.
        )g      ?      @         (@      8@zencodec_24khz-d7cc33bc.thr   r   Tr   Fr   rH   )r   r   r   rR   rF   r   r   r   r   r   r   rL   r   rM   rN   r   
state_dictr   r   r   encodec_model_24khz  s    

z EncodecModel.encodec_model_24khzc              
   C   sh   |r| sJ g d}d}d}d}t j|||dddd| rd	nd
d}| r.t ||}|| |  |S )z+Return the pretrained 48khz model.
        )r   g      @r   r   zencodec_48khz-7e698e3e.thi  r/   Ftime_group_normTg      ?r   rH   )r   r   r   rP   rR   r   r   r   r   r   encodec_model_48khz  s    
z EncodecModel.encodec_model_48khz)FNrG   rH   )r   r   Tr   FNrH   r*   )TN)&r<   r=   r>   r?   r   r   r   r   r   rB   rD   floatr@   boolrC   strr   propertyr_   ra   r2   rA   EncodedFrameri   rd   ry   rv   r;   r}   r
   r   staticmethodr   r   r   r   r   rE   r   r   r%   r   rF   D   s    	
	$rF   c                  C   s   ddl m}  dd l}g d}tjtjd}| | |D ]J\}}||  }|| |dd d d }|	d| d	\}}	|d d d |j
d
 f }|d}
||
d }|j|jksfJ |j|jfqd S )Nr   )product)r.   r         r   r   r   r.   test_z.wavr/   )	itertoolsr   
torchaudiorF   r   r   keysr}   r   r   rM   	unsqueezer0   )r   r   
bandwidthsmodels
model_namebwr   audio_suffixwavsrwav_inwav_decr   r   r   test1  s    


r   __main__)r?   rT   pathlibr   typingrB   numpyrV   r2   r    r   r   r   r   utilsr   r   r	   r   TuplerA   rC   r   Moduler
   rF   r   r<   r   r   r   r   <module>   s&   ) n
