o
    eic                     @   s  d Z ddlZddlmZ ddlmZ ddlZddlmZ ddl	m  m
Z ddlmZ ddlmZ ddlmZ dd	lmZmZ d
dlmZ ddlmZ eG dd deZeG dd deZeG dd deZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#G d d! d!ejZ$G d"d# d#ejZ%eG d$d% d%eZ&ed&d'G d(d) d)e&Z'd)d%gZ(dS )*zTransformers Xcodec model.    N)	dataclass)	lru_cache   )initialization)conv1d_output_length)PreTrainedAudioTokenizerBase)ModelOutputauto_docstring   )	AutoModel   )XcodecConfigc                   @   s6   e Zd ZU dZdZejdB ed< dZej	dB ed< dS )XcodecOutputao  
    Args:
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discrete code indices computed using `model.encode`.
        audio_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`, *optional*)
            Decoded audio values obtained using the decoder part of Xcodec.
    Naudio_codesaudio_values)
__name__
__module____qualname____doc__r   torch
LongTensor__annotations__r   FloatTensor r   r   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/xcodec/modeling_xcodec.pyr       s   
 r   c                   @   $   e Zd ZU dZdZejdB ed< dS )XcodecEncoderOutputz
    Args:
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discrete code indices computed using `model.encode`.
    Nr   )r   r   r   r   r   r   r   r   r   r   r   r   r   .      
 r   c                   @   r   )XcodecDecoderOutputz
    Args:
        audio_values (`torch.FloatTensor`  of shape `(batch_size, channels, num_samples)`, *optional*):
            Decoded audio values obtained using the decoder part of Xcodec.
    Nr   )r   r   r   r   r   r   r   r   r   r   r   r   r   9   r   r   c                       sH   e Zd ZdZdedededef fddZdejd	ejfd
dZ	  Z
S )ResidualUnitzFResidual block for SemanticEncoder and SemanticDecoder used in Xcodec.configin_channelsout_channelsdilationc              
      s\   t    t | _|jd d | }tj|||jd||ddd| _tj||ddd| _d S )Nr   r
   F)stridepaddingr#   groupsbias)r!   r"   kernel_sizer'   )	super__init__nnELU
activationunit_kernel_sizeConv1dconv1conv2)selfr    r!   r"   r#   r%   	__class__r   r   r*   G   s   


zResidualUnit.__init__hidden_statereturnc                 C   s0   |  |}| |}|  |}| |}|| S N)r-   r0   r1   )r2   r5   output_tensorr   r   r   forwardW   s
   



zResidualUnit.forward)r   r   r   r   r   intr*   r   Tensorr9   __classcell__r   r   r3   r   r   D   s    r   c                       D   e Zd Zdedededef fddZdejdejfd	d
Z  Z	S )SemanticEncoderBlockr    r!   r"   r$   c                    sd   t    t fdd jD | _|dkrdnd| }|d d }tj||||dd| _d S )Nc                       g | ]	}t  |qS r   r   .0r#   r    r!   r   r   
<listcomp>c       z1SemanticEncoderBlock.__init__.<locals>.<listcomp>r   r   r
   Tr(   r$   r%   r'   )r)   r*   r+   
ModuleListblock_dilations	res_unitsr/   conv)r2   r    r!   r"   r$   kernelr%   r3   rC   r   r*   `   s   
zSemanticEncoderBlock.__init__r5   r6   c                 C   s"   | j D ]}||}q| |}|S r7   )rI   rJ   r2   r5   unitr   r   r   r9   k      


zSemanticEncoderBlock.forward
r   r   r   r   r:   r*   r   r;   r9   r<   r   r   r3   r   r>   _   s    r>   c                       2   e Zd Z fddZdejdejfddZ  ZS )SemanticEncoderc                    s   t    t|jt|jkrtdtj|j|j|j	d|j	d dd| _
|j}g }t|jD ]\}}t|j|j|  }|t||||g7 }|}q/t|| _d S )Nz:Number of strides must match the number of channel_ratios.r   r
   Fr'   )r)   r*   lenstrideschannel_ratios
ValueErrorr+   r/   semantic_hidden_sizer(   rJ   	enumerater:   r>   rG   conv_blocks)r2   r    r!   rY   ir$   r"   r3   r   r   r*   s   s$   
	zSemanticEncoder.__init__r5   r6   c                 C   "   |  |}| jD ]}||}q|S r7   )rJ   rY   r2   r5   blockr   r   r   r9      rN   zSemanticEncoder.forwardr   r   r   r*   r   r;   r9   r<   r   r   r3   r   rQ   r   s    rQ   c                       r=   )SemanticDecoderBlockr    r!   r"   r$   c              	      s   t    |dkrtj|ddddd| _n!d| }|d d }|d dkr(dnd}tj|||||dd| _t fd	d
 jD | _d S )Nr   r   TrF   r
   r   FrR   c                    r?   r   r@   rA   r    r"   r   r   rD      rE   z1SemanticDecoderBlock.__init__.<locals>.<listcomp>)	r)   r*   r+   r/   rJ   ConvTranspose1drG   rH   rI   )r2   r    r!   r"   r$   r(   r%   output_paddingr3   r`   r   r*      s&   

	
zSemanticDecoderBlock.__init__r5   r6   c                 C   r[   r7   )rJ   rI   rL   r   r   r   r9      rN   zSemanticDecoderBlock.forwardrO   r   r   r3   r   r_      s    r_   c                       rP   )SemanticDecoderc                    s   t    tj|jt|j|jd  |jd|jd dd| _g }t	|j
D ]1\}}t|j|j|  }|t|jd k rIt|j|j|d   }n|j}|t||||g7 }q%t|| _tj|j|j|jd|jd dd| _d S )Nr   r   r
   F)r!   r"   r(   r$   r%   r'   )r$   r%   r'   )r)   r*   r+   r/   rW   r:   rU   r(   r0   rX   rT   rS   r_   rG   rY   r1   )r2   r    rY   rZ   r$   r!   r"   r3   r   r   r*      s2   
zSemanticDecoder.__init__r5   r6   c                 C   s,   |  |}| jD ]}||}q| |}|S r7   )r0   rY   r1   r\   r   r   r   r9      s
   



zSemanticDecoder.forwardr^   r   r   r3   r   rc      s    rc   c                       s8   e Zd ZdZ fddZdd Zdd Zdd	 Z  ZS )
XcodecEuclideanCodebookz!Codebook with Euclidean distance.c                    sj   t    t|j|j}|j| _| dtdg | dt|j | d| | d|  d S )NinitedTcluster_sizeembed	embed_avg)	r)   r*   r   zeroscodebook_sizecodebook_dimregister_bufferr;   clone)r2   r    rg   r3   r   r   r*      s   
z XcodecEuclideanCodebook.__init__c                 C   sV   | j  }|djddd}|d| |  |djddd  }|jddj}|S )Nr
   r   T)keepdimr   dim)rg   tpowsummaxindices)r2   hidden_statesrg   scaled_statesdist	embed_indr   r   r   quantize   s
   
&z XcodecEuclideanCodebook.quantizec                 C   s8   |j }|d|d f}| |}|j|d d  }|S )Nro   )shapereshaper{   view)r2   rw   r|   rz   r   r   r   encode   s
   
zXcodecEuclideanCodebook.encodec                 C   s   t || j}|S r7   )F	embeddingrg   )r2   rz   	quantizedr   r   r   decode   s   zXcodecEuclideanCodebook.decode)	r   r   r   r   r*   r{   r   r   r<   r   r   r3   r   rd      s    
rd   c                       s6   e Zd ZdZdef fddZdd Zdd Z  ZS )	XcodecVectorQuantizationzY
    Vector quantization implementation. Currently supports only euclidean distance.
    r    c                    s   t    t|| _d S r7   )r)   r*   rd   codebookr2   r    r3   r   r   r*      s   
z!XcodecVectorQuantization.__init__c                 C   s   | ddd}| j|}|S Nr   r
   r   )permuter   r   )r2   rw   embed_inr   r   r   r     s   zXcodecVectorQuantization.encodec                 C   s   | j |}|ddd}|S r   )r   r   r   )r2   rz   r{   r   r   r   r     s   zXcodecVectorQuantization.decode)	r   r   r   r   r   r*   r   r   r<   r   r   r3   r   r      s
    r   c                       sl   e Zd ZdZdef fddZdd Zddefd	d
Zdde	j
de	j
fddZde	j
de	j
fddZ  ZS ) XcodecResidualVectorQuantizationzv
    Residual vector quantization implementation. Follows Algorithm 1 in https://huggingface.co/papers/2107.03312
    r    c                    sF   t    t fddt jD | _ j| _ j| _ j| _d S )Nc                    s   g | ]}t  qS r   )r   )rB   _r    r   r   rD     s    z=XcodecResidualVectorQuantization.__init__.<locals>.<listcomp>)	r)   r*   r+   rG   rangenum_quantizers
quantizers
frame_raterj   r   r3   r   r   r*     s
   
 z)XcodecResidualVectorQuantization.__init__c                 C   s   t | j| j d S )zReturn bandwidth per quantizer.i  )mathlog2rj   r   )r2   r   r   r   get_bandwidth_per_quantizer  s   z<XcodecResidualVectorQuantization.get_bandwidth_per_quantizerNr6   c                 C   s:   |   }| j}|dur|dkrttdt|| }|S )z:Return num_quantizers based on specified target bandwidth.N        r   )r   r   r:   ru   r   floor)r2   	bandwidthbw_per_qr   r   r   r    get_num_quantizers_for_bandwidth  s
   zAXcodecResidualVectorQuantization.get_num_quantizers_for_bandwidth
embeddingsc           
      C   sZ   |  |}|}g }| jd| D ]}||}||}|| }|| qt|}	|	S )a  
        Encode the input tensor into discrete indices using RVQ, with the number of quantizers selected based on the given bandwidth.
        Each quantizer /codebook residually quantizes the input and returns the nearest indices in terms of Euclidian distance.
        N)r   r   r   r   appendr   stack)
r2   r   r   r   residualall_indices	quantizerrv   r   out_indicesr   r   r   r   %  s   



z'XcodecResidualVectorQuantization.encodecodesc                 C   sB   t jd|jd}t|D ]\}}| j| }||}|| }q|S )z9Decode the given codes to their quantized representation.r   )device)r   tensorr   rX   r   r   )r2   r   quantized_outrZ   rv   r   r   r   r   r   r   5  s   


z'XcodecResidualVectorQuantization.decoder7   )r   r   r   r   r   r*   r   r:   r   r   r;   r   r   r<   r   r   r3   r   r     s    r   c                   @   sV   e Zd ZdZeZdZdZdZe	
 dd Zdd Zd	d
 Zedd ZdddZdS )XcodecPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    xcodecinput_valuesaudioc                 C   s  t |tjr tj|jd| jjd |jdurt	|j dS dS t |tj
tjfr7t	|j t|j dS t |tjrdt|j |jdurbt|j|j|jd   }tj|j| |d dS dS |jjdkrrt|j dS t |tjr~|  dS t |tjrtj|jddd dS t |tr|j D ]}t |tjrtj|jdd t|jd q|j  D ]}t |tjrtj|jdd t|jd qdS t |t!rt"|j#t$%d	g t	|j& t	|j' t	|j( dS dS )
zInitialize the weightsr   )meanstdNr   )abSnake1dg{Gz?)r   T))
isinstancer+   Linearinitnormal_weightr    initializer_ranger'   zeros_	LayerNorm	GroupNormones_r/   kaiming_normal_r   sqrtr&   r!   r(   uniform_r4   r   alphara   reset_parameters	EmbeddingXcodecModelacoustic_encodermodulestrunc_normal_	constant_acoustic_decoderrd   copy_re   r   r;   rf   rg   rh   )r2   modulek	submoduler   r   r   _init_weightsK  sL   



z#XcodecPreTrainedModel._init_weightsc                 C   s   t jjjj}|| jj || jj | jjD ]}||j |j	|j
|jfD ]}||j ||j q%q|| jjdd || jjdd | jjD ]"}||jdd |j	|j
|jfD ]}||jdd ||jdd qXqGdS )znApply weight norm in the acoustic encoder and decoder because the original checkpoint has weight norm applied.r   nameN)r   r+   utilsparametrizationsweight_normr   r0   r1   r]   	res_unit1	res_unit2	res_unit3r   conv_t1)r2   r   r]   res_unitr   r   r   apply_weight_normq  s$   

z'XcodecPreTrainedModel.apply_weight_normc                 C   s|   | j | jfD ]5}| D ].}ztjjj|dd W n ttfy$   Y nw t	|dr:d|j
v r:tjjjj|ddd qqdS )z=Remove the weight norm from the acoustic encoder and decoder.r   r   r   T)leave_parametrizedN)r   r   r   r   r+   r   remove_weight_normrV   AttributeErrorhasattrr   parametrizeremove_parametrizations)r2   r   mr   r   r   r     s   z(XcodecPreTrainedModel.remove_weight_normc                    s    dt jf fdd t |S )zA
        Recursively iterate to fetch all Conv1d layers.
        r   c                    s:   g }t | tjr||  |  D ]	}| | q|S r7   )r   r+   r/   r   childrenextend)r   params_listchildget_conv1d_layers_recursiver   r   r     s   
zMXcodecPreTrainedModel._get_conv1d_layers.<locals>.get_conv1d_layers_recursive)r+   Moduletuple)r2   r   r   r   r   _get_conv1d_layers  s   z(XcodecPreTrainedModel._get_conv1d_layersNc                 C   s.   |du r| }|  |}|D ]}t||}q|S )zo
        For a given module, compute the output length that would be obtained after all Conv1d layers.
        N)r   r   )r2   input_lengthr   conv1d_layerslayerr   r   r   _get_conv1d_output_lengths  s   
z0XcodecPreTrainedModel._get_conv1d_output_lengthsr7   )r   r   r   r   r   config_classbase_model_prefixmain_input_nameinput_modalitiesr   no_gradr   r   r   r   r   r   r   r   r   r   r   ?  s    
%
r   z$The Xcodec neural audio codec model.)custom_introc                       s   e Zd Z fddZedejfddZdej	dej	fdd	Z
e	
	
ddejded
B ded
B dejeB fddZe	
ddejded
B dejeB fddZe	
	
	
ddejdejd
B ded
B ded
B deejejf eB f
ddZ  ZS )r   c                    s   t  | || _|jd | _t|j}|j| _	|j
| _| | j t|| _t|| _t|j | _t|j|j| _t|j|jj| _t|j|jj| _t|| _|   d S )Nr
   )r)   r*   r    
hop_lengthpadr   from_configacoustic_model_configencoderr   decoderr   _adjust_dac_decoderrQ   encoder_semanticrc   decoder_semanticsemantic_model_configevalsemantic_modelr+   r   hidden_sizefcfc1fc2r   r   	post_init)r2   r    acoustic_modelr3   r   r   r*     s   


zXcodecModel.__init__r   c                 C   sp   |   D ]}t|tjr t|jtr|jd n|j}|d f|_qt| dr4t| jtj	r6t
 | _dS dS dS )z
        DAC implemented in Xcodec is slightly different from the HF version.
        DAC in Xcodec adjusts the output padding in every ConvTranspose1d in the decoder and removes
        the final `nn.Tanh` activation function.
        r   r
   tanhN)r   r   r+   ra   r$   r   rb   r   r   TanhIdentity)r   r   r$   r   r   r   r     s   zXcodecModel._adjust_dac_decoderr   r6   c                 C   s   |d d dd d f }t || j| jf}t  | j|dd}|j}W d    n1 s.w   Y  tj|dd}|jddS )Nr   T)output_hidden_statesr   rp   )r   r   r   r   r   rw   r   r   )r2   r   outputsrw   stackedr   r   r   _extract_semantic_features  s   
z&XcodecModel._extract_semantic_featuresNr   return_dictc           
      C   s*  |dur|n| j j}|jd }|dkrtd| |du r%| j jd }n|| j jvr8td| d| j j d| | }| |dd}| 	|jd | j
|jd kre| 
t|| j| jf}n| 
|}tj||gdd	}| |dddd}| j||}	|	d
d}	|s|	S t|	S )ac  
        input_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`):
            Float values of the input audio waveform.
        bandwidth (`float`, *optional*):
            The target bandwidth in (kbps) supports only values in `config.target_bandwidths`.
            Defaults to the highest available bandwidth `4.0` kbps.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`].

        Returns:
            `torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)` containing the discrete encoded audio codes.
        Nr   zAudio must be mono, but got ro   z)This model doesn't support the bandwidth z. Select one of .r
   rp   r   )r    r  r|   rV   target_bandwidthsr  detachr   	transposer   r   r   r   r   catr   r   r   r   )
r2   r   r   r  channelse_semantic_input
e_semantic
e_acousticr   r   r   r   r   r     s,   

zXcodecModel.encoder   c                 C   s`   |dur|n| j j}|dd}| j|}| |dddd}| |}|s,|S t|S )a  
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`):
            Discrete code indices computed using `model.encode`.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`]

        Returns:
            Decoded audio values of shape `(batch_size, channels, num_samples)` obtained using the decoder part of
            Xcodec.
        Nr   r   r
   )r    r  r
  r   r   r   r   r   )r2   r   r  r   quantized_acousticr   r   r   r   r     s   
zXcodecModel.decodec                 C   sl   |dur|n| j j}|jd }|du r| j||dd}| j||dd dd|f }|s0||fS t||dS )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`):
            The raw float values of the input audio waveform.
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`:
            Discrete code indices computed using `model.encode`.
        bandwidth (`float`, *optional*):
            Target bandwidth in kbps. Must be one of `config.target_bandwidths`. Defaults to the highest available bandwidth.
        bandwidth (`float`, *optional*):
            Target bandwidth in kbps. Must be one of `config.target_bandwidths`. Defaults to the highest available bandwidth.
        return_dict (`bool`, *optional*):
            Whether to return a [`XcodecOutput`] instead of a plain tuple.

        Returns:
            `XcodecOutput` or tuple `(audio_codes, audio_values)`:
            - `audio_codes` of shape `(batch_size, num_quantizers, codes_length)`: the quantized discrete codes.
            - `audio_values` of shape `(batch_size, channels, num_samples)`: the reconstructed audio waveform given the codes.

        Example:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoFeatureExtractor, XcodecModel

        >>> model_id = "hf-audio/xcodec-hubert-librispeech"
        >>> model = XcodecModel.from_pretrained(model_id)
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

        >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
        >>> audio_sample = dataset[0]['audio']['array']

        >>> inputs = feature_extractor(raw_audio=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> audio_codes = outputs.audio_codes
        >>> audio_values = outputs.audio_values
        ```
        Nro   F)r  r   .)r   r   )r    r  r|   r   r   r   )r2   r   r   r   r  lengthr   r   r   r   r9   2  s   .
zXcodecModel.forward)NNr7   )NNN)r   r   r   r*   staticmethodr+   r   r   r   r   r  r	   r;   floatboolr   r   r   r   r   r   r9   r<   r   r   r3   r   r     sR    
3r   ))r   r   dataclassesr   	functoolsr   r   torch.nnr+   torch.nn.functional
functionalr    r   r   audio_utilsr   modeling_utilsr   r   r   r	   autor   configuration_xcodecr   r   r   r   r   r   r>   rQ   r_   rc   rd   r   r   r   r   __all__r   r   r   r   <module>   sB   

( 2u 9