o
    i^                     @   s  d Z ddlZddlmZ ddlmZmZ ddlZddlm	Z	 ddl
m	  mZ ddlmZ ddlmZmZ dd	lmZ d
dlmZ eG dd deZeG dd deZeG dd deZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZ G d d! d!e	jZ!eG d"d# d#eZ"ed$d%G d&d' d'e"Z#d'd#gZ$dS )(zTransformers Xcodec model.    N)	dataclass)OptionalUnion   )PreTrainedAudioTokenizerBase)ModelOutputauto_docstring   )	AutoModel   )XcodecConfigc                   @   s6   e Zd ZU dZdZeej ed< dZ	eej
 ed< dS )XcodecOutputao  
    Args:
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discrete code indices computed using `model.encode`.
        audio_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`, *optional*)
            Decoded audio values obtained using the decoder part of Xcodec.
    Naudio_codesaudio_values)__name__
__module____qualname____doc__r   r   torch
LongTensor__annotations__r   FloatTensor r   r   ^/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/xcodec/modeling_xcodec.pyr      s   
 r   c                   @   $   e Zd ZU dZdZeej ed< dS )XcodecEncoderOutputz
    Args:
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
            Discrete code indices computed using `model.encode`.
    Nr   )	r   r   r   r   r   r   r   r   r   r   r   r   r   r   -      
 r   c                   @   r   )XcodecDecoderOutputz
    Args:
        audio_values (`torch.FloatTensor`  of shape `(batch_size, channels, num_samples)`, *optional*):
            Decoded audio values obtained using the decoder part of Xcodec.
    Nr   )	r   r   r   r   r   r   r   r   r   r   r   r   r   r   8   r   r   c                       sH   e Zd ZdZdedededef fddZdejd	ejfd
dZ	  Z
S )ResidualUnitzFResidual block for SemanticEncoder and SemanticDecoder used in Xcodec.configin_channelsout_channelsdilationc              
      s\   t    t | _|jd d | }tj|||jd||ddd| _tj||ddd| _d S )Nr   r	   F)stridepaddingr"   groupsbias)r    r!   kernel_sizer&   )	super__init__nnELU
activationunit_kernel_sizeConv1dconv1conv2)selfr   r    r!   r"   r$   	__class__r   r   r)   F   s   


zResidualUnit.__init__hidden_statereturnc                 C   s0   |  |}| |}|  |}| |}|| S N)r,   r/   r0   )r1   r4   output_tensorr   r   r   forwardV   s
   



zResidualUnit.forward)r   r   r   r   r   intr)   r   Tensorr8   __classcell__r   r   r2   r   r   C   s    r   c                       D   e Zd Zdedededef fddZdejdejfd	d
Z  Z	S )SemanticEncoderBlockr   r    r!   r#   c                    sd   t    t fdd jD | _|dkrdnd| }|d d }tj||||dd| _d S )Nc                       g | ]	}t  |qS r   r   .0r"   r   r    r   r   
<listcomp>b       z1SemanticEncoderBlock.__init__.<locals>.<listcomp>r   r   r	   Tr'   r#   r$   r&   )r(   r)   r*   
ModuleListblock_dilations	res_unitsr.   conv)r1   r   r    r!   r#   kernelr$   r2   rB   r   r)   _   s   
zSemanticEncoderBlock.__init__r4   r5   c                 C   s"   | j D ]}||}q| |}|S r6   )rH   rI   r1   r4   unitr   r   r   r8   j      


zSemanticEncoderBlock.forward
r   r   r   r   r9   r)   r   r:   r8   r;   r   r   r2   r   r=   ^   s    r=   c                       2   e Zd Z fddZdejdejfddZ  ZS )SemanticEncoderc                    s   t    t|jt|jkrtdtj|j|j|j	d|j	d dd| _
|j}g }t|jD ]\}}t|j|j|  }|t||||g7 }|}q/t|| _d S )Nz:Number of strides must match the number of channel_ratios.r   r	   Fr&   )r(   r)   lenstrideschannel_ratios
ValueErrorr*   r.   semantic_hidden_sizer'   rI   	enumerater9   r=   rF   conv_blocks)r1   r   r    rX   ir#   r!   r2   r   r   r)   r   s$   
	zSemanticEncoder.__init__r4   r5   c                 C   "   |  |}| jD ]}||}q|S r6   )rI   rX   r1   r4   blockr   r   r   r8      rM   zSemanticEncoder.forwardr   r   r   r)   r   r:   r8   r;   r   r   r2   r   rP   q   s    rP   c                       r<   )SemanticDecoderBlockr   r    r!   r#   c              	      s   t    |dkrtj|ddddd| _n!d| }|d d }|d dkr(dnd}tj|||||dd| _t fd	d
 jD | _d S )Nr   r   TrE   r	   r   FrQ   c                    r>   r   r?   r@   r   r!   r   r   rC      rD   z1SemanticDecoderBlock.__init__.<locals>.<listcomp>)	r(   r)   r*   r.   rI   ConvTranspose1drF   rG   rH   )r1   r   r    r!   r#   r'   r$   output_paddingr2   r_   r   r)      s&   

	
zSemanticDecoderBlock.__init__r4   r5   c                 C   rZ   r6   )rI   rH   rK   r   r   r   r8      rM   zSemanticDecoderBlock.forwardrN   r   r   r2   r   r^      s    r^   c                       rO   )SemanticDecoderc                    s   t    tj|jt|j|jd  |jd|jd dd| _g }t	|j
D ]1\}}t|j|j|  }|t|jd k rIt|j|j|d   }n|j}|t||||g7 }q%t|| _tj|j|j|jd|jd dd| _d S )Nr   r   r	   F)r    r!   r'   r#   r$   r&   )r#   r$   r&   )r(   r)   r*   r.   rV   r9   rT   r'   r/   rW   rS   rR   r^   rF   rX   r0   )r1   r   rX   rY   r#   r    r!   r2   r   r   r)      s2   
zSemanticDecoder.__init__r4   r5   c                 C   s,   |  |}| jD ]}||}q| |}|S r6   )r/   rX   r0   r[   r   r   r   r8      s
   



zSemanticDecoder.forwardr]   r   r   r2   r   rb      s    rb   c                       s8   e Zd ZdZ fddZdd Zdd Zdd	 Z  ZS )
XcodecEuclideanCodebookz!Codebook with Euclidean distance.c                    sj   t    t|j|j}|j| _| dtdg | dt|j | d| | d|  d S )NinitedTcluster_sizeembed	embed_avg)	r(   r)   r   zeroscodebook_sizecodebook_dimregister_bufferr:   clone)r1   r   rf   r2   r   r   r)      s   
z XcodecEuclideanCodebook.__init__c                 C   sV   | j  }|djddd}|d| |  |djddd  }|jddj}|S )Nr	   r   T)keepdimr   dim)rf   tpowsummaxindices)r1   hidden_statesrf   scaled_statesdist	embed_indr   r   r   quantize   s
   
&z XcodecEuclideanCodebook.quantizec                 C   s8   |j }|d|d f}| |}|j|d d  }|S )Nrn   )shapereshaperz   view)r1   rv   r{   ry   r   r   r   encode   s
   
zXcodecEuclideanCodebook.encodec                 C   s   t || j}|S r6   )F	embeddingrf   )r1   ry   	quantizedr   r   r   decode   s   zXcodecEuclideanCodebook.decode)	r   r   r   r   r)   rz   r~   r   r;   r   r   r2   r   rc      s    
rc   c                       s6   e Zd ZdZdef fddZdd Zdd Z  ZS )	XcodecVectorQuantizationzY
    Vector quantization implementation. Currently supports only euclidean distance.
    r   c                    s   t    t|| _d S r6   )r(   r)   rc   codebookr1   r   r2   r   r   r)      s   
z!XcodecVectorQuantization.__init__c                 C   s   | ddd}| j|}|S Nr   r	   r   )permuter   r~   )r1   rv   embed_inr   r   r   r~      s   zXcodecVectorQuantization.encodec                 C   s   | j |}|ddd}|S r   )r   r   r   )r1   ry   rz   r   r   r   r     s   zXcodecVectorQuantization.decode)	r   r   r   r   r   r)   r~   r   r;   r   r   r2   r   r      s
    r   c                       sl   e Zd ZdZdef fddZdd Zddefd	d
Zdde	j
de	j
fddZde	j
de	j
fddZ  ZS ) XcodecResidualVectorQuantizationzv
    Residual vector quantization implementation. Follows Algorithm 1 in https://huggingface.co/papers/2107.03312
    r   c                    sF   t    t fddt jD | _ j| _ j| _ j| _d S )Nc                    s   g | ]}t  qS r   )r   )rA   _r   r   r   rC     s    z=XcodecResidualVectorQuantization.__init__.<locals>.<listcomp>)	r(   r)   r*   rF   rangenum_quantizers
quantizers
frame_rateri   r   r2   r   r   r)     s
   
 z)XcodecResidualVectorQuantization.__init__c                 C   s   t | j| j d S )zReturn bandwidth per quantizer.i  )mathlog2ri   r   )r1   r   r   r   get_bandwidth_per_quantizer  s   z<XcodecResidualVectorQuantization.get_bandwidth_per_quantizerNr5   c                 C   s:   |   }| j}|dur|dkrttdt|| }|S )z:Return num_quantizers based on specified target bandwidth.N        r   )r   r   r9   rt   r   floor)r1   	bandwidthbw_per_qr   r   r   r    get_num_quantizers_for_bandwidth  s
   zAXcodecResidualVectorQuantization.get_num_quantizers_for_bandwidth
embeddingsc           
      C   sZ   |  |}|}g }| jd| D ]}||}||}|| }|| qt|}	|	S )a  
        Encode the input tensor into discrete indices using RVQ, with the number of quantizers selected based on the given bandwidth.
        Each quantizer /codebook residually quantizes the input and returns the nearest indices in terms of Euclidian distance.
        N)r   r   r~   r   appendr   stack)
r1   r   r   r   residualall_indices	quantizerru   r   out_indicesr   r   r   r~   $  s   



z'XcodecResidualVectorQuantization.encodecodesc                 C   sB   t jd|jd}t|D ]\}}| j| }||}|| }q|S )z9Decode the given codes to their quantized representation.r   )device)r   tensorr   rW   r   r   )r1   r   quantized_outrY   ru   r   r   r   r   r   r   4  s   


z'XcodecResidualVectorQuantization.decoder6   )r   r   r   r   r   r)   r   r9   r   r   r:   r~   r   r;   r   r   r2   r   r     s    r   c                   @   s4   e Zd ZdZeZdZdZdd Zdd Z	dd	 Z
d
S )XcodecPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    xcodecinput_valuesc                 C   s  t |tjr |jjjd| jjd |jdur|jj	  dS dS t |tj
tjfr8|jj	  |jjd dS t |tjrgtj|j |jduret|j|j|jd   }tjj|j| |d dS dS |jjdkrv|jjd dS t |tjr|  dS t |tjr|jjjddd dS t |tr|j D ]}t |tjrtjj|jdd	 tj |jd q|j! D ]}t |tjrtjj|jdd	 tj |jd qdS dS )
zInitialize the weightsr   )meanstdNg      ?r   )abSnake1dg{Gz?)r   )"
isinstancer*   Linearweightdatanormal_r   initializer_ranger&   zero_	LayerNorm	GroupNormfill_r.   initkaiming_normal_r   sqrtr%   r    r'   uniform_r3   r   alphar`   reset_parameters	EmbeddingXcodecModelacoustic_encodermodulestrunc_normal_	constant_acoustic_decoder)r1   modulek	submoduler   r   r   _init_weightsI  sB   


z#XcodecPreTrainedModel._init_weightsc                 C   s   t jjj}tt jjjdrt jjjj}|| jj || jj | jj	D ]}||j |j
|j|jfD ]}||j ||j q2q#|| jjdd || jjdd | jj	D ]"}||jdd |j
|j|jfD ]}||jdd ||jdd qeqTdS )znApply weight norm in the acoustic encoder and decoder because the original checkpoint has weight norm applied.weight_normr   nameN)r   r*   utilsr   hasattrparametrizationsr   r/   r0   r\   	res_unit1	res_unit2	res_unit3r   conv_t1)r1   r   r\   res_unitr   r   r   apply_weight_normi  s(   


z'XcodecPreTrainedModel.apply_weight_normc                 C   s|   | j | jfD ]5}| D ].}ztjjj|dd W n ttfy$   Y nw t	|dr:d|j
v r:tjjjj|ddd qqdS )z=Remove the weight norm from the acoustic encoder and decoder.r   r   r   T)leave_parametrizedN)r   r   r   r   r*   r   remove_weight_normrU   AttributeErrorr   r   parametrizeremove_parametrizations)r1   r   mr   r   r   r     s   z(XcodecPreTrainedModel.remove_weight_normN)r   r   r   r   r   config_classbase_model_prefixmain_input_namer   r   r   r   r   r   r   r   >  s     r   z$The Xcodec neural audio codec model.)custom_introc                       s   e Zd Z fddZedejfddZdej	dej	fdd	Z
e	
	
ddejdee dee deejef fddZe	
ddejdee deejef fddZe	
	
	
ddejdeej dee dee deeejejf ef f
ddZ  ZS )r   c                    s   t  | || _|jd | _t|j}|j| _	|j
| _| | j t|| _t|| _t|j | _t|j|j| _t|j|jj| _t|j|jj| _t|| _|   d S )Nr	   )r(   r)   r   
hop_lengthpadr
   from_configacoustic_model_configencoderr   decoderr   _adjust_dac_decoderrP   encoder_semanticrb   decoder_semanticsemantic_model_configevalsemantic_modelr*   r   hidden_sizefcfc1fc2r   r   	post_init)r1   r   acoustic_modelr2   r   r   r)     s   


zXcodecModel.__init__r   c                 C   sp   |   D ]}t|tjr t|jtr|jd n|j}|d f|_qt| dr4t| jtj	r6t
 | _dS dS dS )z
        DAC implemented in Xcodec is slightly different from the HF version.
        DAC in Xcodec adjusts the output padding in every ConvTranspose1d in the decoder and removes
        the final `nn.Tanh` activation function.
        r   r	   tanhN)r   r   r*   r`   r#   tuplera   r   r   TanhIdentity)r   r   r#   r   r   r   r     s   zXcodecModel._adjust_dac_decoderr   r5   c                 C   s   |d d dd d f }t || j| jf}t  | j|dd}|j}W d    n1 s.w   Y  tj|dd}|jddS )Nr   T)output_hidden_statesr   ro   )r   r   r   no_gradr   rv   r   r   )r1   r   outputsrv   stackedr   r   r   _extract_semantic_features  s   
z&XcodecModel._extract_semantic_featuresNr   return_dictc           
   	   C   s6  |dur|n| j j}|jd }|dkrtd| |du r%| j jd }n|| j jvr8td| d| j j d| | }| |dd}| 	|}|jd |jd krp| 	t
|ddd	ddf | j| jfd}tj||gdd
}| |dddd}| j||}	|	d	d}	|s|	S t|	S )ac  
        input_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`):
            Float values of the input audio waveform.
        bandwidth (`float`, *optional*):
            The target bandwidth in (kbps) supports only values in `config.target_bandwidths`.
            Defaults to the highest available bandwidth `4.0` kbps.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`].

        Returns:
            `torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)` containing the discrete encoded audio codes.
        Nr   zAudio must be mono, but got rn   z)This model doesn't support the bandwidth z. Select one of .r	   r   ro   )r   r   r{   rU   target_bandwidthsr   detachr   	transposer   r   r   	unsqueezer   catr   r   r~   r   )
r1   r   r   r   channelse_semantic_input
e_semantic
e_acousticr   r   r   r   r   r~     s,   

2zXcodecModel.encoder   c                 C   s`   |dur|n| j j}|dd}| j|}| |dddd}| |}|s,|S t|S )a  
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`):
            Discrete code indices computed using `model.encode`.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`]

        Returns:
            Decoded audio values of shape `(batch_size, channels, num_samples)` obtained using the decoder part of
            Xcodec.
        Nr   r   r	   )r   r   r   r   r   r   r   r   )r1   r   r   r   quantized_acousticr   r   r   r   r     s   
zXcodecModel.decodec                 C   sl   |dur|n| j j}|jd }|du r| j||dd}| j||dd dd|f }|s0||fS t||dS )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, channels, num_samples)`):
            The raw float values of the input audio waveform.
        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`:
            Discrete code indices computed using `model.encode`.
        bandwidth (`float`, *optional*):
            Target bandwidth in kbps. Must be one of `config.target_bandwidths`. Defaults to the highest available bandwidth.
        bandwidth (`float`, *optional*):
            Target bandwidth in kbps. Must be one of `config.target_bandwidths`. Defaults to the highest available bandwidth.
        return_dict (`bool`, *optional*):
            Whether to return a [`XcodecOutput`] instead of a plain tuple.

        Returns:
            `XcodecOutput` or tuple `(audio_codes, audio_values)`:
            - `audio_codes` of shape `(batch_size, num_quantizers, codes_length)`: the quantized discrete codes.
            - `audio_values` of shape `(batch_size, channels, num_samples)`: the reconstructed audio waveform given the codes.

        Example:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoFeatureExtractor, XcodecModel

        >>> model_id = "hf-audio/xcodec-hubert-librispeech"
        >>> model = XcodecModel.from_pretrained(model_id)
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)

        >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
        >>> audio_sample = dataset[0]['audio']['array']

        >>> inputs = feature_extractor(raw_audio=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> audio_codes = outputs.audio_codes
        >>> audio_values = outputs.audio_values
        ```
        Nrn   F)r   r   .)r   r   )r   r   r{   r~   r   r   )r1   r   r   r   r   lengthr   r   r   r   r8     s   .
zXcodecModel.forward)NNr6   )NNN)r   r   r   r)   staticmethodr*   Moduler   r   r   r   r   r:   r   floatboolr   r   r~   r   r   r   r   r8   r;   r   r   r2   r   r     sR    
1r   )%r   r   dataclassesr   typingr   r   r   torch.nnr*   torch.nn.functional
functionalr   modeling_utilsr   r   r   r   autor
   configuration_xcodecr   r   r   r   r  r   r=   rP   r^   rb   rc   r   r   r   r   __all__r   r   r   r   <module>   s>   

( 2N 7