o
    聱i                     @   s  d dl Z d dlZd dlmZmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlmZ d dlm  mZ d dlmZmZmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZ d dl m!Z! d d	l"m#Z# d d
l$m%Z% d dl&m'Z'm(Z( e )e*Z+d dl,Z,d dlZd dlmZ G dd deZ-eG dd dZ.eG dd dZ/eG dd deZ0de.defddZ1de.defddZ2d'dej3dej3dej3de4dej3f
d d!Z5d"d# Z6d(d$d%Z7e*d&kre7  dS dS ))    N)	dataclassfield)AnyDictListOptionalTuple)load_datasetDatasetDictVerificationMode)HfArgumentParserTrainerset_seedTrainerCallback)TrainingArguments)
LoraConfigget_peft_modelTaskType)!VibeVoiceForConditionalGeneration)VibeVoiceConfig)VibeVoiceProcessor)VibeVoiceDatasetVibeVoiceCollator)r   c                   @   st   e Zd ZdddZdd Zdd	d
ZdddZdd Zdd ZdddZ	dddZ
dddZdddZdddZdS )EmaCallbackmodel.prediction_head+?cpuc                 C   s,   || _ t|| _t|| _d| _d| _dS )z
        attr_path: where the head lives under self.model (Trainer wraps your VibeVoiceForConditionalGeneration)
        decay:     EMA decay (0.999 ~ stable, 0.9999 ~ very smooth, slower to adapt)
        N)	attr_pathfloatdecaytorchdeviceshadow_orig)selfr   r   r!    r%   @/home/ubuntu/VibeVoice-finetuning/src/finetune_vibevoice_lora.py__init__%   s
   

zEmaCallback.__init__c                 C   s$   |}| j dD ]}t||}q|S )N.)r   splitgetattr)r$   modelmodnamer%   r%   r&   _get_module0   s   zEmaCallback._get_moduleNc                    s*     |} fdd|  D  _d S )Nc                    s&   i | ]\}}||   j qS r%   )detachtor!   clone).0kpr$   r%   r&   
<dictcomp>9   s    z.EmaCallback.on_train_begin.<locals>.<dictcomp>)r.   
state_dictitemsr"   )r$   argsstatecontrolr+   kwargsheadr%   r5   r&   on_train_begin7   s   


zEmaCallback.on_train_beginc           	      K   s   | j d u rd S | |}t + |  D ]\}}| j | | jj|	 
| jd| j d qW d    d S 1 s>w   Y  d S )N      ?)alpha)r"   r.   r    no_gradr7   r8   mul_r   add_r/   r0   r!   )	r$   r9   r:   r;   r+   r<   r=   r3   vr%   r%   r&   on_step_end<   s   

0"zEmaCallback.on_step_endc                 C   s.   |  |}t| | _|j| jdd d S NF)strict)r.   copydeepcopyr7   r#   load_state_dictr"   r$   r+   r=   r%   r%   r&   _swap_in_emaD   s   
zEmaCallback._swap_in_emac                 C   s2   | j d u rd S | |}|j| j dd d | _ d S rF   )r#   r.   rJ   rK   r%   r%   r&   
_swap_backI   s   

zEmaCallback._swap_backc                 K      |  | d S NrL   r$   r9   r:   r;   r+   r<   r%   r%   r&   on_evaluateO      zEmaCallback.on_evaluatec                 K   rN   rO   rM   rQ   r%   r%   r&   on_evaluate_endS      zEmaCallback.on_evaluate_endc                 K   rN   rO   rP   rQ   r%   r%   r&   on_saveV   rS   zEmaCallback.on_savec                 K   rN   rO   rT   rQ   r%   r%   r&   on_save_endZ   rV   zEmaCallback.on_save_endc                 K   rN   rO   rP   rQ   r%   r%   r&   on_train_end]   rS   zEmaCallback.on_train_end)r   r   r   rO   )__name__
__module____qualname__r'   r.   r>   rE   rL   rM   rR   rU   rW   rX   rY   r%   r%   r%   r&   r   $   s    






r   c                   @   s2  e Zd ZU edddidZee ed< edddidZee ed< eddZ	ee ed	< ed
dZ
eed< ed
dZeed< eddZeed< eddZeed< eddZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZee ed< dS )ModelArgumentsNhelpz-Path to VibeVoice base model with config.jsondefaultmetadatamodel_name_or_pathzIPath to processor dir (preprocessor_config.json). Defaults to model path.processor_name_or_pathr`   	cache_dirTfreeze_acoustic_tokenizerfreeze_semantic_tokenizer   lora_r    
lora_alphag?lora_dropoutz7q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_projz=Comma-separated list of target module names in the LLM blockslora_target_modulesFz"Wrap diffusion head with PEFT LoRAlora_wrap_diffusion_headz0Train diffusion prediction head (full fine-tune)train_diffusion_headz3Train acoustic/semantic connectors (full fine-tune)train_connectorszOComma-separated indices of diffusion head layers to freeze (e.g., '0,1,5,7,8').layers_to_freeze)rZ   r[   r\   r   rb   r   str__annotations__rc   re   rf   boolrg   ri   intrk   rl   r   rm   rn   ro   rp   rq   r%   r%   r%   r&   r]   b   s0   
 r]   c                   @   s,  e Zd ZU edddidZee ed< eddZee ed< eddZ	eed	< ed
dZ
ee ed< eddZeed< eddZeed< eddZee ed< eddZeed< eddZeed< eddZee ed< edddidZee ed< edddidZee ed< edddidZeed< dS )DataArgumentsNr^   z<HF dataset name or 'json' with --train_jsonl for local filesr_   dataset_namerd   dataset_config_nametraintrain_split_name
validationeval_split_nametexttext_column_nameaudioaudio_column_namevoice_promptsvoice_prompts_column_name        eval_split_sizeFignore_verifications
max_lengthz=Path to local train JSONL with {text, audio, [voice_prompts]}train_jsonlz'Optional path to local validation JSONLvalidation_jsonlzaProbability to drop conditioning voice prompt during training (0.0 keep always, 1.0 drop always).voice_prompt_drop_rate)rZ   r[   r\   r   rw   r   rr   rs   rx   rz   r|   r~   r   r   r   r   r   rt   r   ru   r   r   r   r%   r%   r%   r&   rv   |   s"   
 rv   c                   @   s   e Zd ZU eddZeed< eddZeed< eddZ	eed< eddZ
eed< ed	dZeed
< eddZeed< eddZeed< edddidZeed< edddidZeed< dS )CustomTrainingArguments   rd   ddpm_batch_mulr?   ce_loss_weightdiffusion_loss_weightFdebug_ce_details   debug_ce_topkdebug_ce_max_examples   debug_ce_every_n_stepsr^   zEnable gradient clipping using max_grad_norm (set via --max_grad_norm, default 1.0). When False, disables clipping by forcing max_grad_norm=0.0.r_   gradient_clippingzUIf set, saves model components BEFORE training starts, into output_dir/debug_initial.
debug_saveN)rZ   r[   r\   r   r   ru   rs   r   r   r   r   rt   r   r   r   r   r   r%   r%   r%   r&   r      s    
 r   r9   returnc                 C   s2   dd | j dD }t| j| j| jdtj|dS )Nc                 S   s   g | ]
}|  r|  qS r%   )stripr2   sr%   r%   r&   
<listcomp>   s    z%build_lora_config.<locals>.<listcomp>,nonerrk   rl   bias	task_typetarget_modules)rm   r)   r   ri   rk   rl   r   	CAUSAL_LMr9   r   r%   r%   r&   build_lora_config   s   r   c                 C   s$   g d}t | j| j| jdtj|dS )N)noisy_images_proj	cond_proj	gate_projup_proj	down_projlinearr   r   )r   ri   rk   rl   r   FEATURE_EXTRACTIONr   r%   r%   r&   build_head_lora_config   s   r   labelsattention_maskacoustic_input_maskpad_idc           	      C   s   | d d dd f   }|d ur%| dkr%|d d dd f   dntj|tjd}|d d dd f   }|| @ }| }||| < |S )Nr   r   dtype)
contiguousnumeleqr    	ones_likert   r1   )	r   r   r   r   shifted	base_masklabel_is_acoustic
final_maskoutr%   r%   r&   mask_for_ce   s   B

r   c              
      s   z/t t | d| dd }|d u st|ds|d W d S |j  fdd}||_|d W d S  tyJ } z|d|  W Y d }~d S d }~ww )	Nr+   acoustic_tokenizerencodez.No acoustic_tokenizer.encode() found to patch.c                     s    | i |}z	|d d }|W S  t y   Y nw t|tr@dD ]}||v r/|| gg  S q!t|dkr@tt| ggS dD ]}t||rRt||gg  S qBzt|t	j
r_|ggW S W |ggS  t yo   Y |ggS w )Nr   )framescodestokenslatentshidden_states)	Exception
isinstancedictlennextitervalueshasattrr*   r    Tensor)r9   r<   r   _r3   attrbase_encoder%   r&   encode_wrapped   s6   


zB_patch_acoustic_encode_for_legacy_indexing.<locals>.encode_wrappedzJPatched acoustic_tokenizer.encode() to return [[...]] for legacy indexing.z-Failed to patch acoustic_tokenizer.encode(): )r*   r   warningr   infor   )	model_objlogger_acousticr   er%   r   r&   *_patch_acoustic_encode_for_legacy_indexing   s   
r   c            Y         s  t tttf} |  \}} tjdd jdv rtjntj	d t
d  t j t dds>t dr=d	 _t
d
 nt drM jd u sM jdkrPd _t
d j  |jp^|j}|d u rgtdt|}|j}dD ]}t||rt||d u rtd| qq|jd u rtdtj} jrtj}n	t ddrtj}tj|j|d}t|t
 t|jdd |_ z|! }	|" }
t|	dd }t|
dd }t#|d uo|d uo|$ |$ k}d}|d ur	|d ur	|j%|j%kr	z
t#t&||}W n t'y   d}Y nw ztt|j(d|j(dd }W n t'y(   t|j(dd }Y nw t
d| d| d|  |d urGt
dt#|j)  W n t'yc } zt
*d|  W Y d }~nd }~ww zH|! }|" }t|drt|dr|j+j%|j+j%kr|j+$ |j+$ krt,  |j+|_+W d    n	1 sw   Y  t
d W n t'y } zt
*d|  W Y d }~nd }~ww zg d}zt-t|j(j.dd}W n t'y   d}Y nw |! }	|" }
t|	dd }t|
dd }|D ]}t||d }|d u}|ot/|t-od|  ko|k n  }d }|rW|d urW|d urW|j%|j%krW|0d|krWzt#t&|| || }W n t'yV   d}Y nw d }|rt/|t-rz|1|g}W n t'y   z|2|}W n t'y   d }Y nw Y nw t
d!| d"| d#| d$| d%| d&|  qW n t'y } zt
*d'|  W Y d }~nd }~ww zt
d( t
d)t3|j4  t
d*|j5  t, i d+}tj6|j7|d,d-g|j8d.}t9|}|! |}|j||d,d/}|:|j;} | d d d d0d d f < }!|d d d1d f < }"t=j>|!?d0|!0d0|"?d0d2d3}#t
d4|#@ d5 W d    n	1 sJw   Y  W n t'yk } zt
*d6|  W Y d }~nd }~ww t|j(d7r{ jAr{d|j(_B|jCrt|jd8r|jjDE D ]}$d|$_)q|jFrt|jdr|jj E D ]}$d|$_)qtG|}%d9d: |jHId;D }&tJ|&dkptKd<d= |&D }'|'stL|jjM|%|j_Mnt
d> z|N  W n
 t'y   Y nw |O D ]\}(}$d|$_)qz|jjMO D ]\})}$d?|)v sd@|)v rd,|$_)qW n t'y$   t
*dA Y nw t|dBdrt|jdCrG dDdE dEtPjQ}*z*|*|jjR}+tL|+tS||j_R|jjRO D ]\})}$d?|)v s`d@|)v rcd,|$_)qRW n t'y } zt
*dF|  W Y d }~nd }~ww t|dGdrt|jdCr|jjRE D ]}$d,|$_)q|jTd urt|jdCrtU|jjRO },z;dHdI |jTId;D }-d}.tV|,D ]\}/\}}0|/|-v rd|0_)|.d17 }.t
dJ|/ dK|  qt
dL|. dM W n t'y } z	t
WdN|   d }~ww t|dOdr4t|jdPr|jjXE D ]}$d,|$_)qt|jdQr3|jjYE D ]}$d,|$_)q,n(t|jdPrH|jjXE D ]}$d|$_)qAt|jdQr\|jjYE D ]}$d|$_)qUz'|! }1t|1drm|1j+Zd |" }2|2d urt|2dr|2j+Zd W n
 t'y   Y nw dRdS }3znt|jdTr|3|jjMO nd}4t|jdCr|3|jjRO nd}5t|jdPr|3|jjXO nd}6t|jdQr|3|jjYO nd}7t[dUd= |E D }8t
dV|4d;dW|5d;dX|6d;dY|7d; t
dZ|8d; W n
 t'y
   Y nw |j\rt]j^nt]j_}9|j`d ur5d[|j`i}:|jad ur+|ja|:d\< tbd]|:|9|jcd^};n|jdd u r?td_tb|jd|je|9|jcd`};|;|jf }<d }= jgr|jhre|jh|;v re|;|jh }=n#|jir|jidkrtJ|<d1kr|<jj|ji jda}>|>d[ |>db }<}=tk|<|jl|jm|jndc}?d }@|=d urtk|=|jl|jm|jndc}@t|ddde}At|j(dfd }B|Bd u rzt-t|j(jodgdh}BW n t'y   dh}BY nw t|do|j d u}Ctp||jq|A|B|Cd|jrdi}DG djdk dkts}EG  fdldmdmtt}Ftudndodpdq}G|F| |?|@|D|G|Et-t drdspdsdtgdu}Ht dvdrzgtvjwx jydw}Itvjwx|Idx}Jtvjz|Jd,dy t
dz|I  zt|jjMd{rP|jjM{|J W n t'yl }K zt
*d||K  W Y d }K~Knd }K~Kww zt|jdCrt|jjRd{r|jjR{tvjwx|Jd} W n t'y }L zt
*d~|L  W Y d }L~Lnd }L~Lww z;t|jdCd }M|Md urt|Mdr|M| }Nt}|Ntvjwx|Jd tvjztvjwx|Jd}d,dy t}|Ntvjwx|Jd}d W n t'y } zt
*d|  W Y d }~nd }~ww z)t|jdPd }O|Od ur%tvjwx|JdP}Ptvjz|Pd,dy t}|O| tvjwx|Pd W n t'yA }Q zt
*d|Q  W Y d }Q~Qnd }Q~Qww z)t|jdQd }R|Rd urjtvjwx|JdQ}Stvjz|Sd,dy t}|R| tvjwx|Sd W n t'y }T zt
*d|T  W Y d }T~Tnd }T~Tww W n t'y } zt
*d|  W Y d }~nd }~ww t ddrz|~  W n t'y   t
*d Y nw  jA	r|Hj jd tvjwx jydx}Jtvjz|Jd,dy t|jdTd }Ut|Ud{r|U{|J t|jdCd }Mt|Md{	rtvjwx|Jd}}Vtvjz|Vd,dy |M{|V z5|Md u	rAt|Md	rA|M| }Nt}|Ntvjwx|Jd tvjwx|Jd}}Vtvjz|Vd,dy t}|Ntvjwx|Vd W n t'	y] } zt
*d|  W Y d }~nd }~ww z)t|jdPd }W|Wd u	rtvjwx|JdP}Ptvjz|Pd,dy t}|W| tvjwx|Pd W n t'	y } zt
*d|  W Y d }~nd }~ww z)t|jdQd }X|Xd u	rtvjwx|JdQ}Stvjz|Sd,dy t}|X| tvjwx|Sd W n t'	y } zt
*d|  W Y d }~nd }~ww  jg	r|@d u	r|H  d S d S d S )Nz4%(asctime)s - %(levelname)s - %(name)s - %(message)sz%m/%d/%Y %H:%M:%S)r   )formatdatefmtlevelz!Training/evaluation parameters %sr   Fmax_grad_normr   zVGradient clipping disabled (set max_grad_norm=0.0). Use --gradient_clipping to enable.r   r?   z)Gradient clipping enabled: max_grad_norm=zC--model_name_or_path (or --processor_name_or_path) must be provided)speech_start_idspeech_diffusion_idspeech_end_idz'Tokenizer missing required special id: z=--model_name_or_path is required to load VibeVoice base modelfp16)torch_dtypesemantic_tokenizerweightdecoder_configtie_word_embeddingsz%LM head diagnostics -> shared_params=z, values_equal=z, tie_word_embeddings=z%LM head requires_grad before freeze: z LM head tie diagnostics failed: z>Force-tied LM head weight to input embeddings (pointer share).zForce-tie of LM head failed: 
vocab_sizez<decode_failed>zSpecial token check -> =z, decoded='z
', exists=z, in_vocab_range=z, emb_vs_head_row_equal=z(Special token ID/row validation failed: z=== TOKENIZER DIAGNOSTICS ===zTokenizer class: zTokenizer vocab_size: zThe cat sat on the mat.T)add_special_tokensr!   )inputs_embedsr   return_dictr   r   mean)	reductionzSimple text CE loss: .4fzTokenizer diagnostics failed: 	use_cacher   c                 S   s    g | ]}|  r|   qS r%   )r   lowerr   r%   r%   r&   r   z       zmain.<locals>.<listcomp>r   c                 s   s    | ]}|d v V  qdS ))r   offdisabledisabledNr%   )r2   tr%   r%   r&   	<genexpr>{  s    zmain.<locals>.<genexpr>z@Skipping LLM LoRA wrapping (lora_target_modules indicates none).lora_Alora_Bz2Could not re-enable LoRA params on language_model.rn   prediction_headc                       s,   e Zd Zdejf fddZdd Z  ZS )zmain.<locals>._HeadForwardShimbasec                    s   t    || _d S rO   )superr'   r   )r$   r   	__class__r%   r&   r'     s    z'main.<locals>._HeadForwardShim.__init__c                 _   sL   t |dkr|d d \}}}n|d}|d}|d}| |||S )N   noisy_images	timesteps	condition)r   getr   )r$   r9   r<   r  r  r  r%   r%   r&   forward  s   


z&main.<locals>._HeadForwardShim.forward)rZ   r[   r\   nnModuler'   r  __classcell__r%   r%   r   r&   _HeadForwardShim  s    r
  z$Could not LoRA-wrap diffusion head: ro   c                 S   s    h | ]}|  rt|  qS r%   )r   ru   r2   xr%   r%   r&   	<setcomp>  r   zmain.<locals>.<setcomp>zFroze layer [z]: zSuccessfully froze z( parameter groups in the diffusion head.z$Could not parse --layers_to_freeze: rp   acoustic_connectorsemantic_connectorc                 S   s   t dd | D S )Nc                 s   s"    | ]\}}|j r| V  qd S rO   requires_gradr   )r2   r   r4   r%   r%   r&   r     s     z,main.<locals>._sum_params.<locals>.<genexpr>)sum)
named_iterr%   r%   r&   _sum_params  s   zmain.<locals>._sum_paramslanguage_modelc                 s   s    | ]
}|j r| V  qd S rO   r  )r2   r4   r%   r%   r&   r         z Trainable by block -> LLM-LoRA: z | diff_head: z | ac_conn: z | se_conn: zTOTAL trainable: %sry   r{   json)
data_filesverification_modere   z]Provide --dataset_name (HF datasets) or use --train_jsonl/--validation_jsonl for local files.)r  re   )	test_sizeseedtest)text_columnaudio_columnvoice_prompts_columnspeech_tok_compress_ratioi  semantic_vae_dimvae_dim   )	processorr   speech_compress_ratior!  compute_semanticsdebug_checksr   c                   @   s0   e Zd Zd
defddZdddZddd	ZdS )zmain.<locals>.LoRADebugCallback2   log_every_n_stepsc                 S   s    t dt|| _i | _g | _d S )Nr   )maxru   r)  prev_param_normslora_param_names)r$   r)  r%   r%   r&   r'   "  s   
z(main.<locals>.LoRADebugCallback.__init__Nc                    sN  z|d u rW d S t |  dd   D | _| jD ]} | }t|j  | j|< qt	| j}t
 fdd| jD }	t
dd | jD }
t
dd | jD }t
 fdd| jD }td| d	|
 d
| d|	 d| d |dkr}td |	|krtd W d S W d S  ty } ztd|  W Y d }~d S d }~ww )Nc                 S   s    g | ]}d |v sd|v r|qS )r   r   r%   r2   nr%   r%   r&   r   ,  r   zBmain.<locals>.LoRADebugCallback.on_train_begin.<locals>.<listcomp>c                 3   s    | ]
} | j rd V  qdS )r   N)r  r-  namedr%   r&   r   1  r  zAmain.<locals>.LoRADebugCallback.on_train_begin.<locals>.<genexpr>c                 s       | ]	}d |v rdV  qdS r   r   Nr%   r-  r%   r%   r&   r   2      c                 s   r1  r   r   Nr%   r-  r%   r%   r&   r   3  r3  c                 3   s6    | ]}d |v rt  | j  dkrdV  qdS )r   r   r   N)r   datanormitemr-  r/  r%   r&   r   4  s   4 zLoRA debug: found z LoRA params (A=z, B=z); trainable=z. Initial lora_B_zero=r(   r   z@LoRA debug: No LoRA parameters found. Check lora_target_modules.zBLoRA debug: Some LoRA params are frozen. They should be trainable.z$LoRA debug (on_train_begin) failed: )r   named_parameterskeysr,  r   r5  r6  r7  r+  r   r  loggerr   r   r   )r$   r9   r:   r;   r+   r<   r.  r4   totalreq_gradnum_Anum_Bzero_Br   r%   r/  r&   r>   '  s.   

*
z.main.<locals>.LoRADebugCallback.on_train_beginc                 [   s  z|d u st | jdkrW d S tt|ddpd}|| j dkr'|dkr'W d S t| }d}d}	d}
d}| jD ]J}||d }|d u rEq8| j|d}t	|j
  }d|v ret|| |kre|d7 }d|v r}t|| |kru|	d7 }	|dkr}|
d7 }
|| j|< q8tdd	 | jD }td
d	 | jD }td| d| d| d|	 d| d|
 d W d S  ty } ztd|  W Y d }~d S d }~ww )Nr   global_stepr   g-q=r   r   r   c                 s   r1  r2  r%   r-  r%   r%   r&   r   W  r3  z>main.<locals>.LoRADebugCallback.on_step_end.<locals>.<genexpr>c                 s   r1  r4  r%   r-  r%   r%   r&   r   X  r3  zLoRA debug step z: changed A /z, changed B z, lora_B_zero_now=r(   z!LoRA debug (on_step_end) failed: )r   r,  ru   r*   r)  r   r8  r  r+  r   r5  r6  r7  absr  r:  r   r   r   )r$   r9   r:   r;   r+   r<   stepr0  	changed_A	changed_Br?  epsr.  r4   prevcurrtotal_Atotal_Br   r%   r%   r&   rE   =  s@   
6z+main.<locals>.LoRADebugCallback.on_step_end)r(  rO   )rZ   r[   r\   ru   r'   r>   rE   r%   r%   r%   r&   LoRADebugCallback!  s    
rK  c                	       sz   e Zd Zddedeeef dee f fddZ	de
jd	e
jd
ee
j dee
j f fddZddee ddfddZdS )zmain.<locals>.VibeVoiceTrainerFNr+   inputsnum_items_in_batchc                    s  | d}| d}| d}| dd }zt|jj j}	W n ty/   | jj}	Y nw |d u rX| d}
|
d urWt	j
|
d|
dt|jdd	|	|
jd
}||d< nt|t	jrf|j|	d|d< || d|| d| d| d|| d| d jd	}z| d}| d}| d}|d urt|  nd}|d urt|  nd}|d urt|  nd}|d ur|d urt||@   nd}| t|t|t|t|d |d ur|d ur|d ur||krtd| d|  W n
 ty   Y nw |j}t|||dd}|d d d dd d f  }tjdd}||d|d|d}z
|  |||| W n tye } ztd|  W Y d }~nd }~ww |j!d uro|j!nt	j"d|jd} j#|  j$|  }zY|j%rdnd}| | d|&  | dt|t	jr|&  nt|i t'| dr| j(d urt)| j(j*dkr| j(j*d  dd }|d ur| d t|i W n
 ty   Y nw |r||fS |S )!N	input_idsr   r   speech_semantic_tensorsspeech_masksr   r   r!  r#  )r   r!   r   speech_tensorsacoustic_loss_maskspeeches_loss_input)	rN  r   rQ  rP  rO  r   rR  rS  r   )zdebug/num_tok_totalzdebug/num_tok_losszdebug/num_lat_totalzdebug/num_lat_lossz,Loss selection mismatch: acoustic_loss_mask=z vs speeches_loss_input=r   )r   r   )ignore_indexzFailed invoking CE debug: r   r   ry   evalz/ce_lossz/diffusion_loss	optimizerlrztrain/learning_rate_real)+r  r   r+   r  
parametersr   r   get_input_embeddingsr   r    zerossizer*   configr!   r   r   r0   r   ru   r  r7  logr   r:  r   logitsr   r   r  CrossEntropyLossview	_debug_cediffusion_losstensorr   r   trainingr/   r   rV  r   param_groups)r$   r+   rL  return_outputsrM  r   r   r   semtarget_dtypesmrZ  outputsal_masksp_maskssp_loss_selnum_tok_totalnum_tok_lossnum_lat_totalnum_lat_lossr^  	ce_labelsshift_logitsloss_fctce_lossr   rb  r;  prefixlr_valtraining_argsr%   r&   compute_loss^  s   






(
"(*
z+main.<locals>.VibeVoiceTrainer.compute_lossrs  rr  r   r   c              
      s  zt  dds
W d S tt | jddpd}tdtt  ddpd}|dks/|| dks/W d S t  |d}tj|	d||	dd	d
d
|}|d
}	t|	  }
|
dkrgt||	   ntd}g }tdtt  ddpwd}|d}tt||D ]'}|	| }t|  dkr|t|| |    q|td qtd|
 d|dddd |D   W d    W d S 1 sw   Y  W d S  ty } ztd|  W Y d }~d S d }~ww )Nr   Fr@  r   r   r   r   r   r   r   )r   rT  nanr   zCE debug: tokens_in_loss=z, avg_loss=r   z, per_example_avgs=c                 S   s"   g | ]}||krt |d ndqS )   N)roundr  r%   r%   r&   r     s   " z<main.<locals>.VibeVoiceTrainer._debug_ce.<locals>.<listcomp>zCE detailed debug failed: )r*   ru   r:   r*  r    rA   r[  Fcross_entropyr`  view_asner  r7  r   r   rangeminappendr:  r   r   r   )r$   rs  rr  r   r   rC  every_nvocabper_token_loss
valid_mask	num_validavg_lossper_ex_avgsmax_examplesBbvbr   rx  r%   r&   ra    sD   



$
 *&z(main.<locals>.VibeVoiceTrainer._debug_ce
output_dirr   c              
   S   s  z|p| j j}tj|d}tj|dd t| jjdd }t|dr'|	| t| jjdd }t|drGtj|d}tj|dd |	| |d urxt|drx|
 }t|tj|d	 tj|d}tj|dd t|tj|d	 t| jjd
d }	|	d urtj|d
}
tj|
dd t|	
 tj|
d t| jjdd }|d urtj|d}tj|dd t|
 tj|d W d S W d S  ty } ztd|  W Y d }~d S d }~ww )NloraTexist_okr  save_pretrainedr   diffusion_headr7   diffusion_head_full.binr  pytorch_model.binr  zFailed to save LoRA assets: )r9   r  ospathjoinmakedirsr*   r+   r   r  r7   r    saver   r:  r   )r$   r  r7   
target_dirlora_outr  	pred_headph_dirsdacac_dirsese_dirr   r%   r%   r&   _save  sB   



 z$main.<locals>.VibeVoiceTrainer._save)FN)NN)rZ   r[   r\   r   r   rr   r   r   ru   rz  r    r   ra  r  r%   rx  r%   r&   VibeVoiceTrainer]  s    (.\&r  r   r   r   )r   r   r!   logging_stepsr(  )r)  )r+   r9   train_dataseteval_datasetdata_collator	callbacksr   debug_initialr  r  z@[debug_save] Saving initial (pre-training) model components to: r  z,[debug_save] Failed to save language_model: r  z-[debug_save] Failed to save prediction_head: r7   r  z1[debug_save] Failed to save FULL diffusion head: r  z0[debug_save] Failed to save acoustic_connector: z0[debug_save] Failed to save semantic_connector: z;[debug_save] Unexpected failure saving initial components: gradient_checkpointingz5Failed to enable gradient checkpointing on the model.)resume_from_checkpointz+Failed to save FULL diffusion head at end: z#Failed to save acoustic_connector: z#Failed to save semantic_connector: )r   r]   rv   r   parse_args_into_dataclassesloggingbasicConfig
local_rankINFOWARNr:  r   r   r  r*   r   r   rc   rb   
ValueErrorr   from_pretrained	tokenizerRuntimeErrorr    float32bf16bfloat16float16r   r   r+   r   rY  get_output_embeddingsrt   data_ptrshapeallcloser   r\  r  r   r   rA   ru   r   r   r[  decodeconvert_ids_to_tokenstyperZ   r   rc  r   r!   r   lm_headlast_hidden_stater   r~  r  r`  r7  do_trainr   rf   r   rX  rg   r   rm   r)   r   allr   r  tie_weightsr8  r  r  r   r   rq   list	enumerateerrorr  r  requires_grad_r  r   r   	NO_CHECKSBASIC_CHECKSr   r   r	   re   rw   rx   rz   do_evalr|   r   train_test_splitr   r~   r   r   semantic_tokenizer_configr   r   r   r   r   r   r  r  r  r  r  r  r7   r  gradient_checkpointing_enablery   r  evaluate)Yparser
model_args	data_argsprocessor_pathr$  tokrequiredr   r+   
in_emb_modout_emb_modin_wout_w
shared_ptrvalues_equaltie_cfgr   
emb_modulehead_modulespecial_namesr   r-   valexistsin_range	equal_rowdecoded_strsimple_text
simple_idssimple_maskr  rj  r^  rs  shift_labelsru  r4   lora_cfgtm_lowerskip_lm_lorar   r.  r
  shimhead_paramsindices_to_freezefrozen_countiparamembr=   r  lm_lorapred_head_trainac_conn_trainse_conn_traintotal_trainabler  r  rawtrain_dseval_dsr)   r  r  r%  semantic_dimcompute_semantics_flagr  rK  r  ema_cbtrainer	debug_dirr  e_lme_headphr  ac_connr  e_acse_connr  e_selmr  r  r  r%   rx  r&   main   sN  





$"
(


,82


$

 








""""*


"


< 3






r  __main__)r   )r   N)8r  r  dataclassesr   r   typingr   r   r   r   r   r    torch.nnr  torch.nn.functional
functionalr~  datasetsr	   r
   r   transformersr   r   r   r   r   HfTrainingArgumentspeftr   r   r   $vibevoice.modular.modeling_vibevoicer   )vibevoice.modular.configuration_vibevoicer   'vibevoice.processor.vibevoice_processorr   data_vibevoicer   r   	getLoggerrZ   r:  rH   r   r]   rv   r   r   r   r   ru   r   r   r  r%   r%   r%   r&   <module>   sN   
>(	
"     (
