o
    ´©i]0  ã                   @   sF   d dl mZmZmZ d dlZd dlmZ d dlmZ G dd„ dƒZ	dS )é    )ÚDictÚListÚTupleN)Ú	rearrangec                   @   sØ   e Zd ZdZdededededejf
dd„Zed	e	j
fd
d„ƒZd	e	j
fdd„Zd	e	j
fdd„Zde	j
fdd„Zde	j
de	j
de	j
de	j
dee deee	j
f fdd„Zde	j
fdd„Zd%de	j
d e	j
d!efd"d#„Zd$S )&ÚAudioFeatureProcessingPackerz¤
    Adapted from the minicpm-audio training utilities. It converts raw text and
    audio tokens into the packed multimodal representation required by VoxCPM.
    Údataset_cntÚmax_lenÚ
patch_sizeÚfeat_dimÚ	audio_vaec                 C   s€   d| _ d| _d| _d| _d| _|| _|j| j | _|| _t	|dƒ| _
|| _|| _d| ji| _ddi| _dd	„ | j ¡ D ƒ| _d S )
Née   éf   ég   éh   é   é   Úttsc                 S   s   i | ]\}}||“qS © r   )Ú.0ÚusageÚidxr   r   úK/home/ubuntu/.local/lib/python3.10/site-packages/voxcpm/training/packers.pyÚ
<dictcomp>!   s    z9AudioFeatureProcessingPacker.__init__.<locals>.<dictcomp>)Úaudio_start_idÚaudio_end_idÚaudio_prompt_start_idÚaudio_prompt_end_idÚtext_eos_token_idr	   Ú
hop_lengthÚ	patch_lenr
   Úmaxr   r   r   Úprocess_tts_dataÚprocess_functionsÚtask_id_mapÚitemsÚ
id_to_task)Úselfr   r   r	   r
   r   r   r   r   Ú__init__   s   
z%AudioFeatureProcessingPacker.__init__Útokensc                 C   s4   | dkj dd}|d  ¡ dkrd S t|d d ƒS )NiœÿÿÿT)Úas_tupler   )ÚnonzeroÚnumelÚint)r(   Ú	positionsr   r   r   Ú_first_pad_position&   s   z0AudioFeatureProcessingPacker._first_pad_positionc                 C   ó"   |   |¡}|d u r|S |d |… S ©N©r.   ©r&   r(   Úpad_posr   r   r   Úunpad_text_tokens-   ó   
z.AudioFeatureProcessingPacker.unpad_text_tokensc                 C   r/   r0   r1   r2   r   r   r   Úunpad_audio_tokens1   r5   z/AudioFeatureProcessingPacker.unpad_audio_tokensÚwavc                 C   sœ   |  d¡}|  d¡}| d¡}|| j dkr(| j|| j  }tjj |d|f¡}t ¡  | j 	|| jj
¡}| dd¡}W d  ƒ |S 1 sGw   Y  |S )zÜ
        Encode raw waveform into latent features using AudioVAE.

        AudioVAE.encode expects shape [B, 1, T'] and returns [B, D, T].
        We then transpose to [B, T, D] to match downstream expectations.
        r   r   éÿÿÿÿr   N)Ú	unsqueezeÚsizer   ÚtorchÚnnÚ
functionalÚpadÚno_gradr   ÚencodeÚsample_rateÚ	transpose)r&   r7   Úwav_lenÚpadding_sizeÚzÚfeatr   r   r   Úencode_audio5   s   




þýz)AudioFeatureProcessingPacker.encode_audioÚaudio_tokensÚtext_tokensÚtask_idsÚdataset_idsÚ
is_promptsÚreturnc           4         s  |j }| ¡ dkrt| ¡  ¡ ƒnd}t| j|d ƒ}g }	g }
g }g }g }g }g }g }g }tj|tj|d}tj|tj|d}t	||| 
¡ | 
¡ |ƒD ]†\}}}}}|  |¡ tj¡}|  |¡}| j| }| j| |||ƒ\}}}}} }!}"}#||  |"7  < ||  |#7  < t |¡}$| j| |$|dk< t |¡}%|d |%|dk< |	 |¡ | |¡ |
 |¡ | |¡ | | ¡ | |!¡ | |$¡ | |%¡ | |jd ¡ qK|rÝt| jt|ƒƒ‰ n| j‰ ddtjdtdtjf‡ fdd	„‰dtjdtjf‡ fd
d„‰|r«tj‡fdd„|	D ƒdd}&tj‡fdd„|D ƒdd}'tj‡fdd„|
D ƒdd}(tj‡fdd„|D ƒdd})tj‡fdd„|D ƒdd}*tj‡fdd„|D ƒdd}+tj‡fdd„|D ƒdd},tj‡fdd„|D ƒdd}-g }.|D ].}/t|/ˆ ƒ}0tjd|0|d}1|0ˆ k rœtjˆ |0 |1j|d}2tj|1|2gdd}1|. |1¡ qttj|.dd}3n?tjd| jftj|d}&t |&¡}'tjd| j| j| jftj|d}(t |&¡})t |&¡}*t |&¡}+t |&¡},t |&¡}-t |&¡}3| tj¡}| tj¡}|&|(|'|)|*|3|+|,|-||dœS )zÞ
        Padding-based batching: each sample in the input batch is processed
        independently and then padded to a common length (capped by ``max_len``).
        The result tensors all have shape [B, T, ...].
        r   r8   r   ©ÚdtypeÚdeviceÚxÚ	pad_valuerM   c                    sN   |   d¡ˆ kr| d ˆ … S tjˆ |   d¡ f|| j| jd}tj| |gddS )Nr   rN   ©Údim)r:   r;   ÚfullrO   rP   Úcat)rQ   rR   r>   ©r   r   r   Úpad_1d”   s   "z5AudioFeatureProcessingPacker.__call__.<locals>.pad_1dc                    sZ   |   d¡ˆ kr| d ˆ … S tjˆ |   d¡ f| jdd …  | j| jd}tj| |gddS )Nr   r   rN   rS   )r:   r;   ÚzerosÚshaperO   rP   rV   )rQ   r>   rW   r   r   Úpad_3dš   s   $ÿz5AudioFeatureProcessingPacker.__call__.<locals>.pad_3dc                    ó   g | ]}ˆ |d d‘qS ©r   )rR   r   ©r   Út©rX   r   r   Ú
<listcomp>£   ó    z9AudioFeatureProcessingPacker.__call__.<locals>.<listcomp>rS   c                    r\   r]   r   ©r   Úmr`   r   r   ra   ¤   rb   c                    s   g | ]}ˆ |ƒ‘qS r   r   )r   Úf)r[   r   r   ra   ¥   s    c                    r\   r]   r   rc   r`   r   r   ra   ¦   rb   c                    r\   r]   r   rc   r`   r   r   ra   §   rb   c                    r\   r]   r   )r   Úlr`   r   r   ra   ¨   rb   c                    r\   r]   r   r^   r`   r   r   ra   ª   rb   c                    r\   r]   r   )r   Údr`   r   r   ra   ­   rb   )rP   )rI   Úaudio_featsÚ	text_maskÚ
audio_maskÚ	loss_maskÚposition_idsÚlabelsÚaudio_task_idsÚaudio_dataset_idsÚaudio_duration_consumedÚtext_token_consumedN)r   )rP   r+   r,   r    Úitemr   r;   rY   Úfloat32ÚzipÚtolistr6   Útor4   r%   r"   Ú
zeros_liker#   ÚappendrZ   Úminr   ÚTensorÚstackÚarangerO   rV   Úint32r	   r
   Úlong)4r&   rH   rI   rJ   rK   rL   rP   Úmax_dataset_idr   Útext_tokens_listÚaudio_feats_listÚtext_mask_listÚaudio_mask_listÚloss_mask_listÚlabels_listÚaudio_task_ids_listÚaudio_dataset_ids_listÚlengthsrp   rq   Úaudio_tokenÚ
text_tokenÚtask_idÚdataset_idxÚ	is_promptÚunpad_audio_tokenÚunpad_text_tokenr   Úpacked_textÚ
audio_featri   rj   rk   rm   Úaudio_durationÚtext_token_countÚaudio_task_idÚaudio_dataset_idÚtext_tokens_batchÚtext_mask_batchÚaudio_feats_batchÚaudio_mask_batchÚloss_mask_batchÚlabels_batchÚaudio_task_ids_batchÚaudio_dataset_ids_batchÚposition_ids_listÚLÚL_clipÚposr>   rl   r   )r   rX   r[   r   Ú__call__K   sÂ    ÿ

÷









 ÿÿ


ÿ





õz%AudioFeatureProcessingPacker.__call__Ú
audio_datac                 C   s€   |   |¡}| d¡| j dkr-| dd¡}tj |d| j| d¡| j  f¡}| dd¡}| d¡d }t|d| jd}||fS )Nr   r   r   é   zb (t p) c -> b t p c)Úp)rG   r:   r	   rB   r<   r=   r>   r   )r&   r£   rh   Úaudio_feats_Úpaddingr’   r   r   r   Úextract_audio_featsÜ   s   
$z0AudioFeatureProcessingPacker.extract_audio_featsFr‰   rŠ   r   c              	   C   sÊ  t j|t j|r
| jn| jgt j|jdgdd}t|ƒ}|jd }|  	|¡\}}| 
d¡}|jd }	t j|	t j|jd}
t  ||
t j|rG| jn| jgt j|jdg¡}t j|| j| d¡ft j|jd}t j|||dd…df gdd}t  t  |¡t  |	¡t  d¡g¡ t j¡ |j¡}t  t  |¡t  |	¡t  d¡g¡ t j¡ |j¡}t  t  |¡|r³t  |	¡nt  |	¡t  d¡g¡ t j¡ |j¡}t  ||	 d ¡ t j¡ |j¡}d|d< ||||||||fS )NrN   r8   rS   r   r   .éþÿÿÿ)r;   rV   Útensorr   r   r}   rP   ÚlenrZ   r¨   ÚsqueezerY   r   r   r	   r:   rs   ÚonesÚtyperv   )r&   r‰   rŠ   r   Útext_token_infor“   Útext_lengthÚaudio_feat_infor’   Úaudio_lengthÚtext_pad_tokenÚaudio_pad_featri   rj   rk   rm   r   r   r   r!   ç   sh   ýþ÷


ýýÿý *ÿ"ÿþ@"øz-AudioFeatureProcessingPacker.process_tts_dataN)F)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r,   r<   ÚModuler'   Ústaticmethodr;   rz   r.   r4   r6   rG   r   Úboolr   Ústrr¢   r¨   r!   r   r   r   r   r   	   s0     þýüûú
ù  r   )
Útypingr   r   r   r;   Útorch.nnr<   Úeinopsr   r   r   r   r   r   Ú<module>   s
   