o
    iI                     @  s   d Z ddlmZ ddlZddlZddlZddlmZmZ ddl	m
Z
 ddlZddlm  mZ ddlmZ ddlmZ ddlmZ eeZd	Zd
ZdZeee Zed ZdZdZ eG dd dZ!G dd dZ"dS )u  Hot GPU encoder: keeps XCodec2 warm and processes batched segments.

XCodec2-only mode for SFT data encoding. BiCodec removed entirely.
The encoder stays loaded for the entire worker lifetime — segments flow in,
tokens flow out.

Chunking rule (zero data loss) — IDENTICAL to pretraining pipeline:
  - Segments <= 6s: single chunk, pad to hop boundary.
  - Segments > 6s: overlapping 6s windows (stride 5.8s, overlap 0.2s).
    If the would-be last chunk is shorter than MIN_TAIL, it is absorbed
    into the previous chunk (extending it to 6-8.8s). The model handles
    variable lengths natively (Conv1d + relative position embeddings).
  - After encoding, tokens are stitched with a deterministic center-cut
    rule: drop HALF_OVERLAP tokens at each internal boundary.

Ref: XCodec2 training: min_audio_length=96000 (6s random crops).
     Inference: full-length audio, pad to multiple of 320.
     CodecEncoder hop = prod([2,2,4,4,5]) = 320, so 50 tokens/sec.
    )annotationsN)	dataclassfield)Path)
TokenBatch)CodecConfig)Segmenti@  2   g?   g      @i w c                   @  sR   e Zd ZU dZded< ded< ded< ded< ded	< d
Zded< dZded< dS )EncodedSegmenta   Result of encoding one audio segment through XCodec2.

    For multi-chunk segments (>6s), tokens are stitched from overlapping
    windows with center-cut boundary trimming. The stored tokens are
    one contiguous sequence covering the full segment.
    intsegment_idxfloatstart_send_s
duration_storch.Tensorxcodec2_tokens        encode_time_ms   
num_chunksN)__name__
__module____qualname____doc____annotations__r   r    r   r   8/home/ubuntu/bench-codecs/codecbench/pipeline/encoder.pyr   0   s   
 r   c                   @  s   e Zd ZdZd6d7ddZd8ddZd9ddZed:ddZed;ddZ	e
d<ddZd=ddZd>d(d)Zd?d,d-Z	.d@dAd2d3Z	.d@dBd4d5Zd.S )C
HotEncoderzKeeps XCodec2 loaded and warm on GPU.

    Call load() once at startup, then encode_segments() for each batch.
    Model is never unloaded between processing units.
    cudacfgr   devicestrc                 C  s(   || _ || _d | _d| _d| _d| _d S )NFr   r   )_cfg_device_xcodec_loaded_total_encode_time_total_segments)selfr!   r"   r   r   r   __init__H   s   
zHotEncoder.__init__returnNonec                 C  s   ddl m} td || jjd| _| jjr| | jj n| jj	| j
d d| j_| jj| jjd tj d }td	| d| _d
S )zLoad XCodec2 and warm it up.r   )FastXCodec2CodeczLoading XCodec2...)model_idr"   T)
batch_sizeg    .Az&XCodec2 loaded and warm. VRAM: %.0f MBN)codecbench.codecs.xcodec2_fastr.   loggerinfor$   xcodec2_model_idr&   xcodec2_custom_ckpt_load_custom_xcodec_ckptloadr%   _use_compilewarmupxcodec_batch_sizetorchr    memory_allocatedr'   )r*   r.   vramr   r   r   r8   P   s   

zHotEncoder.load	ckpt_pathc                 C  sd  ddl m} ddlm}m}m} td| || j	j
}tj|ddd}|d|d	|}i }	| D ]\}
}|
d
rE|
d
ddn|
}
||	|
< q5|j|	dd\}}|rdtdt||dd  |rstdt||dd  | | j ||j ||j| jd| j_| jj| j dtjjj_dtjj_||j || j_ | j| j_td dS )z6Load a custom XCodec2 checkpoint (fine-tuned weights).r   )XCodec2Model)GPUMelExtractor_apply_layer_truncation_apply_sdpa_patchz%Loading custom XCodec2 checkpoint: %scpuF)map_locationweights_only
state_dictmodelzmodel. r   )strictzCustom ckpt missing %d keys: %sN   z&Custom ckpt has %d unexpected keys: %sr0   Tz7Custom XCodec2 checkpoint loaded with all optimizations)!xcodec2.modeling_xcodec2r@   r2   rA   rB   rC   r3   r4   from_pretrainedr$   r5   r<   r8   getitems
startswithreplaceload_state_dictwarninglenevaltor%   semantic_modelfeature_extractorr&   _mel_extractorbackendsr    matmul
allow_tf32cudnn_model)r*   r?   r@   rA   rB   rC   rH   ckptrG   cleanedkvmissing
unexpectedr   r   r   r7   a   s2   




z#HotEncoder._load_custom_xcodec_ckptboolc                 C  s   | j S N)r'   r*   r   r   r   loaded   s   zHotEncoder.loadedr   c                 C  s   | j dkrdS | j| j  S )Nr   r   )r)   r(   rg   r   r   r   avg_encode_ms   s   
zHotEncoder.avg_encode_mswavr   c                 C  s&   | j d }t|t  }t| d|fS )a  Pad waveform to the next multiple of CODEC_HOP (320).

        Matches the original repo's padding convention for variable-length
        inference. For standard 6s chunks (96000), 96000 % 320 == 0 so this
        adds the unconditional +320 that encode_code uses.
        r   )shape	CODEC_HOPFpad)rj   Tro   r   r   r   _pad_to_hop   s   
zHotEncoder._pad_to_hopsegmentslist[Segment]?tuple[list[Segment], list[int], list[tuple[int, int, Segment]]]c              	     s  t | jj| jj  t t| jj } | }t t| jj }| jj}g }g }g }|D ]}	|	jjd }
t|}|
 krX|	t
|	j|	j| |	jd |	|
 |	|d|	f q*g }d}|  |
krq|	| ||7 }|  |
ksb|ry|d   nd}|
| }|dkr||kr|	|
|  n	 t|D ]@\}}|t|d k}|r|
}n|  }|| }|	jd||f }| |}|	t
|	j||  |	j||  |d |	| q|	|t||	f q*t fdd|D }t|t|krtdt|t|t|| | |||fS )	aW  Split segments into overlapping chunks. Never discards audio.

        Rule:
          1. T <= chunk_samples (6s): single chunk, pad to hop boundary.
          2. T > chunk_samples: stride-based 6s windows. If the would-be
             last chunk has < MIN_TAIL_S of audio, drop it and extend the
             previous chunk to the segment end (making it 6-8.8s).

        Returns:
            flat_chunks:   padded chunks ready for the GPU
            valid_samples: actual audio samples per chunk (before padding)
            groups:        (start_idx, count, original_segment) per parent
        rk   )r   r   audior   r   .c                 3  s    | ]	}| krd V  qdS )r   Nr   ).0vschunk_samplesr   r   	<genexpr>       z6HotEncoder._split_to_overlap_chunks.<locals>.<genexpr>zBOverlap split: %d segments -> %d chunks (%d standard, %d extended))r   r$   chunk_seconds	target_sr	OVERLAP_S
MIN_TAIL_Sru   rl   rT   appendr   r   r   rq   	enumeratesumr3   debug)r*   rr   overlap_samplesstridemin_tailsrflat_chunksvalid_samplesgroupssegrp   group_startstartsposlast_end	remaindercisis_lastendactual	chunk_wavpadded
n_extendedr   rx   r   _split_to_overlap_chunks   sl   




z#HotEncoder._split_to_overlap_chunksencodedlist[EncodedSegment]valids	list[int]original_segr   seg_idxr   r   c              
   C  sZ  t }tdd |D }t|dkr2|d }|d t }t||j|j|j|jddd|f |ddS g }	t|}
t	t
||D ]Z\}\}}|t }|jddd|f }|dkrk|	|dddt|| df  q?||
d kr|	|ddt||df  q?|	|ddt||t|| |d f  q?t||j|j|jtj|	dd||
dS )a  Center-cut stitch overlapping chunks into one contiguous segment.

        Rule (deterministic, identical to pretraining pipeline):
          First chunk:  keep tokens[ 0 : valid_tok - H ]
          Middle chunk: keep tokens[ H : valid_tok - H ]
          Last chunk:   keep tokens[ H : valid_tok     ]
          Single chunk: keep tokens[ 0 : valid_tok     ]

        Where H = HALF_OVERLAP = 5 tokens (0.1s).
        c                 s  s    | ]}|j V  qd S rf   )r   rv   er   r   r   rz     s    z+HotEncoder._stitch_group.<locals>.<genexpr>r   r   N)r   r   r   r   r   r   r   dim)HALF_OVERLAPr   rT   rm   r   r   r   r   r   r   zipr   maxminr<   cat)r*   r   r   r   r   Htotal_msecvtx_partsnirw   xtokr   r   r   _stitch_group   sB   
&"0zHotEncoder._stitch_groupr   EncodedSegment | Nonec                 C  sZ  t  }|j| j}|jdkr|d}z| j|| j	j
}tj  W n0 tjjtfyU } z td|j|jt|dd  tj  t  W Y d}~dS d}~ww |du retd|j|j dS t  | d }|  j|7  _|  jd7  _ztd|j|j|j|jdd  |d	W S  ttt fy } ztd
| W Y d}~dS d}~ww )z3Encode a single chunk (any length) through XCodec2.r
   r   z.CUDA error in _encode_one (seg %.1f-%.1fs): %sNx   z,XCodec2 returned None for segment %.1f-%.1fs  r   r   r   r   r   r   r   z.Failed to extract tokens from codec output: %s)!timeperf_counterru   rV   r%   ndim	unsqueezer&   encoder$   r}   r<   r    synchronizeOutOfMemoryErrorRuntimeErrorr3   rS   r   r   r#   empty_cachegccollectr(   r)   r   r   tokensrD   AttributeError
IndexErrorKeyError)r*   r   t0rj   x_tbr   
elapsed_msr   r   r   _encode_one4  s@   

 
zHotEncoder._encode_oneNchunksxcodec_batch_size_override
int | Nonec                 C  s  g }t |p| jjd}tdt||D ]}||||  }t }g }|D ]}	|	j| j	}
|
j
dkr8|
d}
||
 q%tj|dd}| j|| jj}tj  t | d }|  j|7  _|  jt|7  _|du rtd| tt|D ]}|d q{qzD|jjd t|k rtd|jjd  d	t| t|D ]#\}}	|t|| |	j|	j|	j|j||d    |t| d
 qW q t!tt"fy } ztd| tt|D ]}|d qW Y d}~qd}~ww |S )zFEncode a batch of same-length chunks (standard 6s path). XCodec2 only.r   r   r
   r   r   Nz+Batch encode returned None (batch_start=%d)zXCodec batch output too small: z < r   z.Failed extracting tokens from batch result: %s)#r   r$   r;   rangerT   r   r   ru   rV   r%   r   r   r   r<   r   r&   r   r}   r    r   r(   r)   r3   rS   r   rl   r   r   r   r   r   r   rD   r   r   )r*   r   r   resultsxbsr   batchr   x_wavsr   rj   x_inputr   r   jr   r   r   r   _encode_batchW  sX   




zHotEncoder._encode_batchc              
     s0  |sg S  |\ }}g }g }t|D ]"\}} | jjd ttjd kr1|| q|| qdgt	  }	|r fdd|D }
z	j
|
|d}W nB tjjtfy } z2dt| v sndt| v rtdt	|
 tj  t  fd	d|
D }n W Y d}~nd}~ww t||D ]\}}||	|< q|D ]} | |	|< qtd
d |	D }|rtd|t	|	 |t	|	krg S g }t|D ]B\}\}}}|	|||  }||||  }tdd |D rtd||j|jtdd |D | qӈ||||}|| q|S )a  Encode audio segments with XCodec2. Zero data loss.

        Flow (identical to pretraining pipeline):
          1. Split >6s segments into overlapping chunks (stride 5.8s).
             Short remainders are absorbed into the previous chunk.
          2. Encode standard chunks batched; extended chunks B=1.
          3. Trim tokens to actual audio length (no padding tokens stored).
          4. Stitch overlapping chunks per parent (center-cut rule).

        Returns one EncodedSegment per original segment.
        rk   Nc                   s   g | ]} | qS r   r   )rv   r   )r   r   r   
<listcomp>  s    z.HotEncoder.encode_segments.<locals>.<listcomp>)r   zout of memoryzillegal memoryz;CUDA error on batch encode (chunks=%d), falling back to B=1c                   s   g | ]}  |qS r   )r   )rv   crg   r   r   r     s    c                 s      | ]	}|d u rdV  qd S Nr   r   r   r   r   r   rz     r{   z-HotEncoder.encode_segments.<locals>.<genexpr>z%d/%d chunks failed to encodec                 s  s    | ]}|d u V  qd S rf   r   r   r   r   r   rz     s    z5Skipping segment %d (%.1f-%.1fs): %d/%d chunks failedc                 s  r   r   r   r   r   r   r   rz     r{   )r   r   ru   rl   rq   r<   zerosSTANDARD_CHUNKr   rT   r   r    r   r   r#   lowerr3   rS   r   r   r   r   r   r   anyr   r   r   )r*   rr   r   r   r   std_indicesext_indicesr   rw   encoded_flat
std_chunksstd_resultsr   encfailed_chunksr   r   startcountorig_seg	group_encgroup_vsstitchedr   )r   r*   r   encode_segments  sn   
 


zHotEncoder.encode_segments)r    )r!   r   r"   r#   )r,   r-   )r?   r#   r,   r-   )r,   re   )r,   r   )rj   r   r,   r   )rr   rs   r,   rt   )
r   r   r   r   r   r   r   r   r,   r   )r   r   r,   r   rf   )r   rs   r   r   r,   r   )rr   rs   r   r   r,   r   )r   r   r   r   r+   r8   r7   propertyrh   ri   staticmethodrq   r   r   r   r   r   r   r   r   r   r   A   s$    

!

_
;$:r   )#r   
__future__r   r   loggingr   dataclassesr   r   pathlibr   r<   torch.nn.functionalnn
functionalrn   codecbench.codecs.baser   codecbench.pipeline.configr   codecbench.pipeline.vadr   	getLoggerr   r3   rm   TOKENS_PER_SECr~   r   OVERLAP_TOKENSr   r   r   r   r   r   r   r   r   <module>   s.    
