o
    	۷i                     @   sx   d Z ddlZddlmZmZ ddlZddlmZm	Z	m
Z
mZ ddlmZ ddlmZ eeZG dd	 d	eZd	gZdS )
zXcodec model configuration    N)OptionalUnion)
AutoConfig	DacConfigHubertConfigWavLMConfig   )PretrainedConfig)loggingc                       s  e Zd ZdZdZeedZdddddgddgddgdddd	ddfd
ee	e
  dedede	e
 de	e de	e dededee de
deeef deeef f fddZedefddZedefddZedefddZedefdd Zedefd!d"Zedefd#d$Z  ZS )%XcodecConfiga
  
    This is the configuration class to store the configuration of an [`XcodecModel`]. It is used to instantiate a
    Xcodec model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the
    [Manel/X-Codec](https://huggingface.co/Manel/X-Codec) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        target_bandwidths (`List[float]`, *optional*, defaults to `[0.5, 1, 1.5, 2, 4]`):
            The range of different bandwidths (in kbps) the model can encode audio with.
        sample_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio waveform should be digitalized, in hertz (Hz).
        kernel_size (`int`, *optional*, defaults to 3):
            Kernel size for the initial semantic convolution.
        channel_ratios (`List[float]`, *optional*, defaults to `[1, 1]`):
            Expansion factors for the number of output channels in each semantic block.
        strides (`List[int]`, *optional*, defaults to `[1, 1]`):
            Strides for each semantic encoder block.
        block_dilations (`List[int]`, *optional*, defaults to `[1, 1]`):
            Dilation factors for the residual units in semantic blocks.
        unit_kernel_size (`int`, *optional*, defaults to 3):
            Kernel size inside each ResidualUnit in semantic blocks.
        codebook_size (`int`, *optional*, defaults to 1024):
            Number of entries in each residual quantizer's codebook.
        codebook_dim (`int`, *optional*):
            Dimensionality of each codebook vector. Defaults to sum of hidden size of acoustic and semantic models.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated normal initializer for all weight matrices.
        acoustic_model_config (`Union[Dict, DacConfig]`, *optional*):
            An instance of the configuration for the acoustic (DAC) model.
        semantic_model_config (`Union[Dict, HubertConfig, WavLMConfig]`, *optional*):
            An instance of the configuration object for the semantic (HuBERT) model.

    Example:

    ```python
    >>> from transformers import XcodecModel, XcodecConfig

    >>> # Initializing configuration
    >>> configuration = XcodecConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = XcodecModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```xcodec)acoustic_model_configsemantic_model_configNi>  r         g{Gz?target_bandwidthssample_ratekernel_sizechannel_ratiosstridesblock_dilationsunit_kernel_sizecodebook_sizecodebook_diminitializer_ranger   r   c                    s`  t  jdi | |d u rtdg ddg ddd| _n t|tr*tdi || _nt|tr3|| _n	tdt| |d u rEt | _	n7t|tred|v rWt
|d | _	n%td tdi || _	nt|tsot|trs|| _	n	td	t| |d u rg d
}|| _|| _|| _|| _|| _|| _|| _|| _|
| _|	d u r| jj| j	j }	|	| _d S )N@   )            r      )encoder_hidden_sizedownsampling_ratiosdecoder_hidden_sizeupsampling_ratioshidden_sizezDacoustic_model_config must be a dict or DacConfig instance, but got _name_or_pathz_Could not determine semantic model type from config architecture. Defaulting to `HubertConfig`.zUsemantic_model_config must be a dict, HubertConfig, or WavLMConfig instance, but got )g      ?r   g      ?r   r    )super__init__r   r   
isinstancedict
ValueErrortyper   r   r   from_pretrainedloggerwarningr   r   r   r   r   r   r   r   r   r   r%   r   )selfr   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__r'   e/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/xcodec/configuration_xcodec.pyr)   Y   sX   

	



zXcodecConfig.__init__returnc                 C   s   t | j| j S N)mathceilr   
hop_lengthr1   r'   r'   r5   
frame_rate      zXcodecConfig.frame_ratec                 C   s   | j jS r7   )r   r%   r;   r'   r'   r5   semantic_hidden_size   s   z!XcodecConfig.semantic_hidden_sizec                 C   s   t t| jjS r7   )intnpprodr   r"   r;   r'   r'   r5   r:      r=   zXcodecConfig.hop_lengthc                 C   s   t t | jS r7   )r8   r9   log2r   r;   r'   r'   r5   codebook_nbits   r=   zXcodecConfig.codebook_nbitsc                 C   s   | j j| jj S r7   )r   r%   r   r;   r'   r'   r5   r%      s   zXcodecConfig.hidden_sizec                 C   s   t d| jd  | j| j  S )Ni  )r?   r   r<   rC   r;   r'   r'   r5   num_quantizers   s   zXcodecConfig.num_quantizers)__name__
__module____qualname____doc__
model_typer   r   sub_configsr   listfloatr?   r   r+   r   r)   propertyr<   r>   r:   rC   r%   rE   __classcell__r'   r'   r3   r5   r      sn    2
	


Hr   )rI   r8   typingr   r   numpyr@   transformersr   r   r   r   configuration_utilsr	   utilsr
   
get_loggerrF   r/   r   __all__r'   r'   r'   r5   <module>   s   
 
