o
    i9                     @   sL  d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	Z	d dl
mZmZmZmZ d dlmZ ddlmZ ddlmZ d	d
lmZ dZdZdZe deeedd	fdedededededededeeee f fddZ			d,dedededee fddZG d d! d!eZG d"d# d#Z d$d%d&ed'ed(ed)e!de	j"j#j$f
d*d+Z%dS )-    N)	dataclass)DictListOptionalTuple)AudioDatasetDatasetDictload_dataset)r      )VoxCPMConfig)AudioVAE   )AudioFeatureProcessingPackertextaudio
dataset_id i>  train_manifestval_manifesttext_columnaudio_columndataset_id_columnsample_ratenum_procreturnc                    sh   d| i}|r
||d< t d|d}dtdtf fdd}	|	|d }
d|v r.|	|d nd }|
|fS )	Ntrain
validationjson)
data_filesdsr   c                    s    | j vrtd  d|  td}  tkr |  t} tkr*| t} r=| j v r=tkr;| t} | S | tdgt	|  } | S )Nz
Expected 'z' column in manifest.)sampling_rater   )
column_names
ValueErrorcast_columnr   DEFAULT_AUDIO_COLUMNrename_columnDEFAULT_TEXT_COLUMNDEFAULT_ID_COLUMN
add_columnlen)r    r   r   r   r    H/home/ubuntu/.local/lib/python3.10/site-packages/voxcpm/training/data.pyprepare$   s   
z)load_audio_text_datasets.<locals>.prepare)r
   r   )r   r   r   r   r   r   r   r   dataset_dictr.   train_dsval_dsr,   r+   r-   load_audio_text_datasets   s   
r2      r    audio_vae_fps
patch_sizec                 C   s   g }d| j v }tt| D ]>}| | }t|d }|r"t|d }n|t }	t|	d t|	d  }t|| }
t|
| }|| d }|| q|S )u  
    预估每个样本经过 packer 之后的大致序列长度（text+audio），用于过滤超长样本。

    逻辑与 AudioFeatureProcessingPacker / AudioVAE 一致：
    - 文本长度: len(text_ids)
    - 音频长度:
        duration(s) * audio_vae_fps -> 近似 VAE 帧数 t_vae
        t_seq = ceil(t_vae / patch_size)
    - 序列总长约为: text_len + t_seq + 2
    durationtext_idsarrayr!   r   )r"   ranger*   floatr%   mathceilappend)r    r4   r5   lengthshas_durationiitemtext_lenr6   r   t_vaet_seq	total_lenr,   r,   r-   compute_sample_lengths;   s   
rF   c                   @   sf   e Zd ZdZdefddZdd Zdefdd	Ze	d
e
ej defddZede
e fddZdS )HFVoxCPMDatasetzh
    Thin wrapper around a tokenized HuggingFace dataset that returns
    PyTorch-friendly samples.
    datasetc                 C   s
   || _ d S N)rH   )selfrH   r,   r,   r-   __init__h      
zHFVoxCPMDataset.__init__c                 C   s
   t | jS rI   )r*   rH   )rJ   r,   r,   r-   __len__k   rL   zHFVoxCPMDataset.__len__idxc                 C   s>   | j | }|t }|d |d |d |td|dddS )Nr7   r8   r!   r   	is_promptF)r7   audio_arrayaudio_sampling_rater   rO   )rH   r%   getr(   )rJ   rN   rA   r   r,   r,   r-   __getitem__n   s   


zHFVoxCPMDataset.__getitem__seqs	pad_valuec                 C   sv   | st dS tdd | D }g }| D ]!}|jd |k r0d||jd  f}t jjj|||d}|| qt |S )Nr   c                 s   s    | ]}|j d  V  qdS )r   N)shape).0seqr,   r,   r-   	<genexpr>}   s    z0HFVoxCPMDataset.pad_sequences.<locals>.<genexpr>)value)	torchemptymaxrV   nn
functionalpadr=   stack)rT   rU   max_lenpaddedrX   	pad_widthr,   r,   r-   pad_sequencesy   s   

zHFVoxCPMDataset.pad_sequencesbatchc           	      C   s   dd |D }dd |D }t jdd |D t jd}dd |D }| j|dd}| j|d	d}t j|d
t jd}|||||dS )Nc                 S       g | ]}t j|d  t jdqS )r7   dtype)r[   tensorint32rW   sampler,   r,   r-   
<listcomp>        z.HFVoxCPMDataset.collate_fn.<locals>.<listcomp>c                 S   rg   )rP   rh   )r[   rj   float32rl   r,   r,   r-   rn      ro   c                 S   s   g | ]}|d  qS )r   r,   rl   r,   r,   r-   rn      s    rh   c                 S   s   g | ]
}t |d dqS )rO   F)boolrR   rl   r,   r,   r-   rn      s    i)rU   g      Yr   )text_tokensaudio_tokenstask_idsdataset_ids
is_prompts)r[   rj   rk   re   onessize)	clsrf   text_tensorsaudio_tensorsru   rv   text_paddedaudio_paddedrt   r,   r,   r-   
collate_fn   s   zHFVoxCPMDataset.collate_fnN)__name__
__module____qualname____doc__r   rK   rM   intrS   staticmethodr   r[   Tensorr:   re   classmethodr   r~   r,   r,   r,   r-   rG   b   s    rG   c                   @   sR   e Zd ZdZdedededejfddZ	de
eejf d	e
eejf fd
dZdS )BatchProcessorzq
    Wraps ``AudioFeatureProcessingPacker`` so the training loop can mirror
    the minicpm-audio mechanics.
    config	audio_vaedataset_cntdevicec                C   s>   || _ || _|| _| j| t||j|j|j| jd| _d S )N)r   rb   r5   feat_dimr   )	r   r   r   tor   
max_lengthr5   r   packer)rJ   r   r   r   r   r,   r,   r-   rK      s   zBatchProcessor.__init__rf   r   c                 C   s\   |d  | j}|d  | j}|d  | j}|d  | j}| j|||||d d}|S )Nrs   rr   rt   ru   rv   )rs   rr   rt   ru   rv   )r   r   r   )rJ   rf   rs   rr   rt   ru   packedr,   r,   r-   __call__   s   zBatchProcessor.__call__N)r   r   r   r   r   r   r   r[   r   rK   r   strr   r   r,   r,   r,   r-   r      s    
*r   F)	drop_last
hf_dataset
batch_sizenum_workersr   c                C   s    t | }|j|||dt j|dS )NT)r   r   shuffler~   r   )rG   prepare_dataloaderr~   )r   acceleratorr   r   r   torch_datasetr,   r,   r-   build_dataloader   s   r   )r3   r   )&r;   dataclassesr   typingr   r   r   r   argbindr[   datasetsr   r   r	   r
   torch.utils.dataTorchDatasetmodel.voxcpmr   modules.audiovaer   packersr   r'   r%   r(   bindr   r   r2   rF   rG   r   rq   utilsdata
DataLoaderr   r,   r,   r,   r-   <module>   s|    (
'80