"""BiCodec wrapper: Spark-TTS audio tokenizer, 16 kHz.

Produces two token types:
  - semantic_tokens: 50 TPS, variable-length, captures linguistic content
  - global_tokens:   fixed-length, captures speaker/prosody attributes

Both are required for reconstruction.
"""

from __future__ import annotations

import logging

import torch
import torchaudio

from codecbench.codecs.base import NeuralCodec, TokenBatch
from codecbench.codecs import register_codec

logger = logging.getLogger(__name__)
TokenBatch)register_codecc                   @  s   e Zd ZU dZded< dZded< d)d*d
dZdejfd+ddZ	d,d-ddZ
d.ddZe d/d!d"Ze d0d$d%Zd1d'd(ZdS )2BiCodecCodecbicodecstrnamei>  int	native_srN	model_dir
str | Nonec                 C  s   || _ d | _d| _tj| _d S )Ncpu)
_model_dir_model_devicetorchfloat32_dtype)selfr    r   6/home/ubuntu/bench-codecs/codecbench/codecs/bicodec.py__init__   s   zBiCodecCodec.__init__cudadevicedtypetorch.dtypereturnNonec              
    def load(self, device: str = "cuda", dtype: torch.dtype = torch.float32) -> None:
        """Load BiCodec tokenizer from Spark-TTS.

        Expects a local checkout or HF download of the Spark-TTS model directory
        containing the BiCodec checkpoint and config.
        """
        self._device = device
        self._dtype = dtype
        if self._model_dir is None:
            raise RuntimeError(
                "BiCodec requires model_dir pointing to a Spark-TTS checkout. "
                "Download from: https://huggingface.co/SparkAudio/Spark-TTS-0.5B"
            )
        try:
            import sys
            from pathlib import Path

            # Make the sparktts package importable when running from a plain
            # checkout rather than an installed package.
            spark_root = Path(self._model_dir)
            if (spark_root / "sparktts").exists():
                sys.path.insert(0, str(spark_root))

            from sparktts.models.audio_tokenizer import BiCodecTokenizer

            self._model = BiCodecTokenizer(self._model_dir, device=device)
            logger.info("BiCodec loaded from %s on %s", self._model_dir, device)
        except ImportError as e:
            raise RuntimeError(
                f"Could not import BiCodecTokenizer. Ensure Spark-TTS repo is available. Error: {e}"
            ) from e
zBiCodecCodec.load      @   batch_secondsfloat
batch_sizec              	   C  s   | j d us	J dt|| j }tj|d|| jd}tdD ]"}t  | || j}| 	|}W d    n1 s;w   Y  qtj
  td d S )NzCall load() firstr1   r"      zBiCodec warmup complete)r   r
   r   r   randnr   rangeinference_modeencodedecoder   synchronizer*   r+   )r   r2   r4   	n_samplesdummy_tbr   r   r   warmupD   s   

zBiCodecCodec.warmupwavtorch.Tensorsrc                 C  s    || j krtj||| j }|S N)r   
torchaudio
functionalresample)r   rA   rC   r   r   r   _resample_if_neededO   s   
    @torch.inference_mode()
    def encode(self, wav: torch.Tensor, sr: int) -> TokenBatch:
        """Encode [B, 1, T] -> TokenBatch with semantic + global tokens.

        BiCodecTokenizer.tokenize returns (semantic_tokens, global_tokens).
        We store as dict-based TokenBatch for explicit naming.
        """
        wav = self._resample_if_needed(wav, sr).to(self._device)
        B = wav.shape[0]
        semantic_list = []
        global_list = []
        for i in range(B):
            single_wav = wav[i]
            result = self._model.tokenize(single_wav)
            if isinstance(result, tuple) and len(result) == 2:
                sem_tok, glob_tok = result
            elif hasattr(result, "semantic_tokens"):
                sem_tok = result.semantic_tokens
                glob_tok = result.global_tokens
            else:
                raise ValueError(f"Unexpected tokenize output type: {type(result)}")
            semantic_list.append(
                sem_tok if isinstance(sem_tok, torch.Tensor)
                else torch.tensor(sem_tok, device=self._device)
            )
            global_list.append(
                glob_tok if isinstance(glob_tok, torch.Tensor)
                else torch.tensor(glob_tok, device=self._device)
            )
        # Semantic streams are variable-length; right-pad to the batch max and
        # record true lengths in aux so decode() can strip the padding again.
        max_sem_len = max(s.shape[-1] for s in semantic_list)
        padded_sem = torch.zeros(B, max_sem_len, dtype=torch.long, device=self._device)
        for i, s in enumerate(semantic_list):
            padded_sem[i, : s.shape[-1]] = s.flatten()
        global_tokens = torch.stack([g.flatten() for g in global_list], dim=0)
        return TokenBatch(
            codec_name=self.name,
            sample_rate=self.native_sr,
            tokens={"semantic": padded_sem, "global": global_tokens},
            aux={
                "semantic_lengths": [s.shape[-1] for s in semantic_list],
                "global_token_dim": global_tokens.shape[-1],
            },
        )
zBiCodecCodec.encoder?   c                 C  sT  |j d | j}|j d | j}|jd }g }t|D ]]}|jd|jd g| }||d|f }|| }	| j|	d|		d}
t
|
tjri|
jdkrY|
	d	d}
n
|
jdkrc|
	d}
||
 q|tj|
| jd		d	d qtd
d |D }tj|d|| jd	}t|D ]\}}|d||ddd|jd f< q|S )zqDecode semantic + global tokens -> [B, 1, T].

        Both token types are required for reconstruction.
        rZ   r[   r   r\   rM   Nr1   rI   r"   c                 s  rK   rL   rN   )rQ   ar   r   r   rS      rT   z&BiCodecCodec.decode.<locals>.<genexpr>)r`   rb   r   rO   r7   ra   getr   
detokenize	unsqueezerd   r   rk   ndimrl   rm   rn   ro   rq   squeeze)r   r?   rZ   
global_tokrs   
audio_listrv   sem_lensem_iglob_iaudiomax_lenrx   r}   r   r   r   r:      s.   



$zBiCodecCodec.decodetorch.LongTensorc                 C  sB   |j d }|j d }|  d }|| }tj||gdd S )zConcatenate semantic + global tokens for LM sequence length estimate.

        Global tokens use a separate vocab range (offset by max semantic ID + 1).
        """
        semantic = tb.tokens["semantic"]
        global_tok = tb.tokens["global"]
        # Shift global IDs past the largest semantic ID in this batch, e.g. if
        # semantic.max() is 8191 the offset is 8192, so the two streams never
        # share token IDs in the concatenated sequence.
        offset = semantic.max().item() + 1
        global_offset = global_tok + offset
        return torch.cat([semantic, global_offset], dim=-1)