o
    kݱi?Q                     @   sT  d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
 d dlZd dlZd dlZd dlZzd dlZW n ey=   dZY nw zd dlZW n eyO   dZY nw dejdededejfdd	ZG d
d dZddddddejdedededededejfddZdddde
eejejeeef f dededejfddZeG dd dZdS )    N)	dataclass)AnyDictListOptionalSequenceTupleUnionwavorig_sr	target_srreturnc                 C   sr   ||kr| j tjddS td urt|  tj||S td ur+tj|  tj||dS tdt | j tjddS )NFcopy)yr   r   zcNo resampler available; treating audio as target_sr without resampling. Install resampy or librosa.)	astypenpfloat32resampyresamplelibrosawarningswarnRuntimeWarning)r
   r   r    r   7/home/ubuntu/VibeVoice-finetuning/src/data_vibevoice.py_resample_if_needed   s   r   c                   @   s^   e Zd Z			ddedededee dd	f
d
dZdefddZdede	eef fddZ
d	S )VibeVoiceDatasettextaudiovoice_promptsdatasettext_columnaudio_columnvoice_prompts_columnr   Nc                 C   s   || _ || _|| _|| _d S N)r!   r"   r#   r$   )selfr!   r"   r#   r$   r   r   r   __init__%   s   
zVibeVoiceDataset.__init__c                 C   s
   t | jS r%   )lenr!   )r&   r   r   r   __len__1   s   
zVibeVoiceDataset.__len__idxc              
   C   sp  | j | }i }|| j |d< || j |d< d }| jr$| j|v r$|| j }|r8t|ts2|g|d< |S ||d< |S z]d}t|| j |d}t|| }td|d }td|d	 }	||	kr^|	}t|	|}	|	d
krt	
||	}
t|
| }t|| }t	d|}||||  }|g|d< W |S d |d< W |S  ty } ztd| d|  d |d< W Y d }~|S d }~ww )Nr   r   r    ]  )r   g      @g      @g      .@g       @g?r   z'Could not create voice prompt for item z: )r!   r"   r#   r$   
isinstancelist_load_audio_to_24kr(   minrandomuniformintrandint	Exceptionr   r   )r&   r*   itemdatauser_provided_promptr   	wav_arrayaudio_len_secondsmin_len_secmax_len_secprompt_len_secprompt_len_samplesmax_start_samplestart_sampleprompt_croper   r   r   __getitem__4   sL   



!

zVibeVoiceDataset.__getitem__)r   r   r    )__name__
__module____qualname__r   strr   r'   r2   r)   r   rB   r   r   r   r   r   $   s"    
r   g      ?g      ?)pre_silence_secpre_crossfade_secpost_crossfade_secpost_silence_secsample_raterG   rH   rI   rJ   c                C   s  t j| t jdd} tt|| }tt|| }tt|| }tt|| }	| jd }
|
dkr]g }|dkrE|t j|t jd |dkrT|t j|t jd |r[t 	|S | S t
||
}t|
| d}t
|	|}|
| }| d| }| || }| |d }dtdtdtdt jfd	d
}|||dd }|||jd dd }g }|dkr|t j|t jd |jdkr||jt jdd |jdkr||jt jdd |jdkr||jt jdd |dkr|t j|t jd t 	|S )zPad audio with leading/trailing silence and apply crossfades.

    Structure: [pre_silence][pre_crossfade][audio_body][post_crossfade][post_silence]
    Crossfades blend the audio with silence linearly to avoid hard edges.
    dtyper   Nnum_samplesstartendr   c                 S   s.   | dkrt jdt jdS t j||| dt jdS )Nr   )r   rL   T)endpointrM   )r   zerosr   linspace)rO   rP   rQ   r   r   r   _linear_fade   s   z3_apply_silence_with_crossfade.<locals>._linear_fade              ?Fr   )r   asarrayr   reshaper2   roundshapeappendrS   concatenater/   maxfloatndarraysizer   )r
   rK   rG   rH   rI   rJ   start_sil_samplesend_sil_samplespre_crossfade_samplespost_crossfade_samples	total_lenpieces	start_lenremaining_after_startend_lenmiddle_end_idxstart_segmentmiddle_segmentend_segmentrU   start_crossfadeend_crossfader   r   r   _apply_silence_with_crossfadee   sF   






rq   r+   Fr   augment_with_silencer   rs   c                C   s   t | tjr| tj}n^t | tjr|   	 
 }nMt | tr>td u r+tdtj| d dd\}}t|t||}n-t | trbd| v rbd| v rbtj| d tjd}t| d }t|||}n	tdt|  tj|tjd}|r{t||d}|S )	NzIlibrosa is required to load audio file paths. Please pip install librosa.T)srmonoarraysampling_raterL   zUnsupported audio type: )rK   )r,   r   r`   r   r   torchTensordetachcpur_   numpyrF   r   RuntimeErrorloadr   r2   dictrX   
ValueErrortyperq   )r   r   rs   wav_outr
   rt   arrr   r   r   r.      s$   
r.   c                   @   s   e Zd ZU eed< dZee ed< dZeed< dZ	eed< dZ
eed	< dZeed
< dZeed< dZeed< dZeed< dZeed< deeeef  deeef fddZdS )VibeVoiceCollator	processorN
max_lengthi  speech_compress_ratio   semantic_vae_dimFcompute_semanticsdebug_checksr   
text_fieldr   audio_fieldr    voice_prompts_fieldrV   voice_prompt_drop_ratefeaturesr   c           L   	   C   s	  t |}g }g }g }g }g }g }g }	|D ]R}
|
| jd}|
| j}|
| j}| j}|dk r4d}n|dkr:d}| j|g|d urKt |krK|gnd dd| jdd}|d d 	 }|d	t
|d d 	 }|d
}|d u r~t
j|d t
jd}|d 	 }t|ddd}d }z|t| jdd }|d urt|dr||}d }zIt|drt t|dg dkrt|jd }n0|}tdD ]}t|ttfrt |dkr|d }qt|drt t|dg dkrt|jd }W n ty   d }Y nw |d ur|dkr|}W n ty   d }Y nw |d u r,tdttt |t| j }| jjj}|g| }|| }|dg|  }|dg|  }dgt | dg|  } | jjj }!|!|! |!d |!d | !d t| jjdd }"|"d u rt| jjdd }"|"d ur|"dkr|!|" |!d |!d | !d | jd urt || jkrt |t| j }#d}$|D ]}%|%r n|$d7 }$q|#|$krt"d| j d|# d|$ d||#d  }||#d  }||#d  }| |#d  } |!| |!| |!| |!|  g }&g }'|dd urD|d # $ }(|d # $ %t})t|(jd D ]}*|&!|(|*  |'!t|)|* &  q.|'|& |'|' |	'dgt |&  |!| |!| |	!d qtdd |D }+g },g }-g }.g }/| jj}0t|0dd }1|1d u s|1dk rt|0dd }1|1d u s|1dk rt"dt(||||D ]5\}}}2}3|+t | }4|,!||1g|4   |-!|dg|4   |.!|2dg|4   |/!|3dg|4   qt
j)|,t
j*d}5t
j)|-t
j*d}6t
j)|.t
jd}7t
j)|/t
jd}8|rQtd d |D }9t+j,t ||9ft+j-d}:t.|D ]\};}<|<jd }=|<|:|;d |=f< q|r4t|nd}>t+j,t ||>ft+j/d}?t.|D ]\};}@d|?|;d |@f< qFt
j)|:t
j-d}At
j)|?t
jd}Bt+j|?t+j/d}Ct.|	D ]\};}D|Dr}|?|; |C|;< qpt
j)|Ct
jd}E| j0rMt| jd!rM| jj1d urMg }F|D ]}<z| jj1|<}Gt+j2|Gt+j-d}GW n ty   t+j,d| j3ft+j-d}GY nw |Gj4dkrt5d"|Gj d#|Gjd }=|Gjd }H|H| j3kr|H| j3k rt+j,|=| j3|H ft+j-d}It+j6|G|Igdd$}Gn|Gd d d | j3f }G|=|>k r)t+j,|>|= | j3ft+j-d}Jt+j6|G|Jgdd$}Gn|=|>kr4|Gd |> }G|F!|G%t+j- qt
j)t+j7|Fdd$t
j-d}Knt5d%d }Ad }Bd }Ed }K| j8rx|5dk9 shJ d&|Ad urx|A: dksxJ d'|5|6|A|B|K|7|8|Ed(S ))N rV   rW   Fpt)r   voice_samplespadding
truncationr   return_tensors	input_idsr   attention_maskspeech_input_maskrL   r+   Trr   acoustic_tokenizerencoder[         eos_ideos_token_idz--max_length=z1 would truncate into acoustic tokens. Needed cut=z, but only zb leading non-acoustic tokens available. Increase max_length or shorten text/voice-prompt preamble.speech_tensorsspeech_masksc                 s   s    | ]}t |V  qd S r%   )r(   ).0xr   r   r   	<genexpr>X  s    z-VibeVoiceCollator.__call__.<locals>.<genexpr>pad_token_idzUTokenizer has no pad_token_id or eos_token_id; please set one or pass a valid pad id.c                 s   s    | ]}|j d  V  qdS )r   N)r[   )r   wr   r   r   r   t  s    semantic_tokenizerz-Semantic tokenizer returned unexpected shape z. Expect [T, D].)axiszSemantic features are required but could not be computed. Ensure processor.semantic_tokenizer is available or precompute and provide features.z#input_ids contains negative indicesz.Expected speech_tensors 2D [segments, samples])r   r   r   r   speech_semantic_tensorsacoustic_input_maskacoustic_loss_maskspeeches_loss_input);r(   getr   r   r   r   r   r0   r   tolistrx   	ones_like
zeros_likeboolr.   getattrhasattrr   r2   r[   ranger,   r-   tupler4   r^   mathceilr_   r   	tokenizerspeech_diffusion_idspeech_end_idr\   r   r{   r|   r   sumextendziptensorlongr   rS   r   	enumeratebool_r   r   rX   r   ndimr}   r]   stackr   alldim)Lr&   r   
batch_sizesample_input_idssample_attention_maskssample_acoustic_input_maskssample_acoustic_loss_masksall_speech_waveformsall_speech_latent_lengthsper_segment_is_targetexr   r    target_audio
_drop_rateprocidsattnr   speech_input_mask_list
wav_targettarget_latent_lenacoustic_tokenc_outTcand_speech_diff_idtarget_placeholdersids_extendedattn_extendedr   r   r   r   cutleading_non_acousticvvoice_speechesvoice_latent_lengthsvoice_npvoice_masksseg_idxmax_seq_lenpadded_input_idspadded_attention_maskspadded_acoustic_input_maskspadded_acoustic_loss_maskstokr   ain_mask
aloss_maskpad_leninput_ids_tensorattention_mask_tensoracoustic_input_mask_tensoracoustic_loss_mask_tensormax_wave_lenpadded_speechesir   Lmax_latent_lenspeech_masks_npL_latspeech_tensors_tensorspeech_masks_tensorspeeches_loss_input_np	is_targetspeeches_loss_input_tensor	sem_featssemDpad_dpadr   r   r   r   __call__   s  
	


"






















$




zVibeVoiceCollator.__call__)rC   rD   rE   r   __annotations__r   r   r2   r   r   r   r   r   r   rF   r   r   r   r_   r   r   r  r   r   r   r   r      s   
 *r   )r   dataclassesr   typingr   r   r   r   r   r   r	   r|   r   rx   r   r0   r   r4   r   r`   r2   r   r   r_   rq   rF   ry   r   r.   r   r   r   r   r   <module>   sf    $E
B
