o
    1i                      @  sp   d Z ddlmZ ddlZddlmZ ddlZddlZddlm	Z	m
Z
 ddlmZ eeZeG dd dZdS )	z@XCodec2 wrapper: HKUSTAudio/xcodec2, 16 kHz, single VQ, ~50 TPS.    )annotationsN)Any)NeuralCodec
TokenBatch)register_codecc                   @  s   e Zd ZU dZded< dZded< d)d*d	d
Zdejfd+ddZ	d,d-ddZ
d.ddZe d/d d!Ze d0d#d$Zd1d&d'Zd(S )2XCodec2Codecxcodec2strnamei>  int	native_srHKUSTAudio/xcodec2model_idc                 C  s   || _ d | _d| _tj| _d S )Ncpu)	_model_id_model_devicetorchfloat32_dtype)selfr    r   6/home/ubuntu/bench-codecs/codecbench/codecs/xcodec2.py__init__   s   zXCodec2Codec.__init__cudadevicedtypetorch.dtypereturnNonec                 C  sF   ddl m} || _|| _|| j| _| j | t	
d| d S )Nr   )XCodec2ModelzXCodec2 loaded on %s)xcodec2.modeling_xcodec2r    r   r   from_pretrainedr   r   evaltologgerinfo)r   r   r   r    r   r   r   load   s   zXCodec2Codec.load      @   batch_secondsfloat
batch_sizec              	   C  s   | j d us	J dt|| j }tj|d|| jd}tdD ]"}t  | || j}| 	|}W d    n1 s;w   Y  qtj
  td d S )NzCall load() firstr)   )r      zXCodec2 warmup complete)r   r   r   r   randnr   rangeinference_modeencodedecoder   synchronizer%   r&   )r   r*   r,   	n_samplesdummy_tbr   r   r   warmup%   s   

zXCodec2Codec.warmupwavtorch.Tensorsrc                 C  s    || j krtj||| j }|S )N)r   
torchaudio
functionalresample)r   r9   r;   r   r   r   _resample_if_needed0   s   
z XCodec2Codec._resample_if_neededr   c              	   C  s   |  ||| j}|d}z| jj|d}|jdkr!|d}W n= ty_   t	d g }t
|jd D ]}| jj|||d  d}|jdkrP|d}|| q7tj|dd}Y nw t| j| j|dS )zEncode [B, 1, T] -> TokenBatch with tokens [B, T_tok].

        XCodec2 expects input_waveform as [B, T] (no channel dim).
        Attempts true batching first; falls back to sequential if it fails.
        r)   )input_waveformr-   z7XCodec2 batch encode failed, falling back to sequentialr   )dim)
codec_namesample_ratetokens)r?   r$   r   squeezer   encode_codendim	Exceptionr%   warningr/   shapeappendr   catr   r
   r   )r   r9   r;   wav_2dcodes	code_listicr   r   r   r1   5   s,   






zXCodec2Codec.encoder7   c                 C  sF   |j | j}|jdkr|d}| j|}|jdkr!|d}|S )zDecode tokens -> [B, 1, T].   r)   )rD   r$   r   rG   	unsqueezer   decode_code)r   r7   rN   audior   r   r   r2   U   s   



zXCodec2Codec.decodetorch.LongTensorc                 C  s"   |j }|jdkr|d}| S )z"Already single-stream: [B, T_tok].r-   r)   )rD   rG   rE   long)r   r7   tr   r   r   flatten_for_lmb   s   

zXCodec2Codec.flatten_for_lmN)r   )r   r	   )r   r	   r   r   r   r   )r(   r)   )r*   r+   r,   r   r   r   )r9   r:   r;   r   r   r:   )r9   r:   r;   r   r   r   )r7   r   r   r:   )r7   r   r   rV   )__name__
__module____qualname__r
   __annotations__r   r   r   r   r'   r8   r?   r0   r1   r2   rY   r   r   r   r   r      s   
 	
r   )__doc__
__future__r   loggingtypingr   r   r<   codecbench.codecs.baser   r   codecbench.codecsr   	getLoggerrZ   r%   r   r   r   r   r   <module>   s    
