o
    }oi52                    @   s|  d dl Z d dlZd dlZd dlZd dlmZ d dlZd dlZd dl	Z
d dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d dlm  mZ d d
lm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 dddZ4dd Z5G dd de/Z6G dd de6Z7G dd de6Z8dS )     N)List)instantiate)
DictConfig	open_dict)Trainer)TensorBoardLogger)nn)get_worker_info)AutoTokenizerT5Tokenizer)word_error_rate)AggregatedTTSTokenizer)ForwardSumLoss)AudioCodecModel)transformer_2501)get_mask_from_lengthsplot_alignment_to_numpy)stack_tensors)ModelPT)PretrainedModelInfo)loggingtrainc                 C   s   g }g }| D ]?}| | }|j dkrt|j}n#i }d|v r%t|j|d< t|fi |}|dkr;t|dr;|d || || qt	||}	d }
|rTt
d}
|	|
fS )Nr
   g2ptestset_phone_prob      ?zgoogle-t5/t5-small)_target_r
   from_pretrainedpretrained_modelr   r   hasattrr   appendr   r   )all_tokenizers_configuse_text_conditioning_tokenizermode
tokenizerstokenizer_namestokenizer_nametokenizer_config	tokenizertext_tokenizer_kwargsaggregated_tokenizertext_conditioning_tokenizer r,   Y/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/models/magpietts.pysetup_tokenizers-   s&   




r.   c                 C   sF   t d|  d t }|j}t|j|j|jd\}}||_||_	d S )NzWorker z initializing...r#   )
r   infor	   datasetr.   r'   r"   dataset_typetext_tokenizerr+   )	worker_idworker_infor1   r(   r+   r,   r,   r-   worker_init_fnK   s   

r6   c                       s^  e Zd ZdZdTdeddf fddZdd	 ZdU fdd	ZdV fdd	ZdWddZ	e
dd ZdXddZdd Zdd Zdd Zdd  Zd!d" Zd#d$ ZdYd'd(ZdZd*d+Z		d[d,d-Zd.d/ Zd\d0d1Zd2d3 Zd4d5 Zd]d7d8Zd9d: Zd;d< Zd^d?d@ZdAdB ZdCdD ZdEdF Z dGdH Z!dIdJ Z"dKdL Z#dMdN Z$dOdP Z%e&dQe'e( fdRdSZ)  Z*S )_MagpieTTS_Modela[  
    Magpie-TTS Model Base Class used for training a TTS model that can generate audio codes from transcript and a context
    audio/text

    Supports multiple model types:

    - single_encoder_sv_tts: Transcript goes into the encoder and target audio goes to the decoder. Additionally,
    speaker_embedding of target audio (or context audio if provided) from TitaNet gets added to encoder
    output(all timesteps).

    - multi_encoder_context_tts: Transcript and context audio go to different encoders. Transcript encoding feeds to
    layers given by cfg.model.transcript_decoder_layers and the context encoding feeds into the layers given by
    context_decoder_layers .Also supports text context which gets encoded by the same encoder as context audio.
    Only one of context audio or contex text is supported.

    - decoder_context_tts: Text goes into the encoder; context & target audio go to the decoder. Also supports text
    context. Supports fixed sized context so we set context_duration_min and context_duration_max to the same
    value (5 seconds). Text context, which is usually shorter than number of codec frames of 5 second of audio, is
    padded to the max context duration in this model.

    - decoder_pretrain_synthesizer: This is the model type used for pretraining the decoder only on audio data using
    next frame prediction loss.
    Ncfgtrainerr   c                    s  d| _ |d ur|j|j | _ t|dr0t| d|ji|_|d= W d    n1 s+w   Y  |dd| _| 	|\}}|| _
|| _t| j
j}|d }|d | _|d | _|jd | _|jd | _|jd | _|jd | _|dd| _| jd	kr|jd
 | _|jd | _d | _| jd	k| _|dd| _t j||d g }t|jD ]}|t|j|j  qt!|| _"| jdkrt||j | _#t$j%d$i t&|j'| _'t$j%d$i t&|j(| _(t)|j(j*|j|j | _+t,j-|ddd}	|	`.|	/  | 0|	 |	| _1| jdkr5t2j3j4j5dd}
|
/  | 0|
 |
| _6t)|j7|j | _8dd t|j(j9D | _:nu| jdkr~|dg d| _:|dg d| _;dd t|j(j9D }| j:D ]}d||< q[| j;D ]}d||< qf|| _<t$j%d$i t&|j=| _=n,| jd	krdd t|j(j9D | _:n| jdkr|j>dksJ dnt?d| j | jrt| jj@|j | _AtjBd d!| _C|d"d}|dkrtD|d#| _Ed S d S )%N   r3   english_phonemeuse_text_conditioning_encoderF   
model_typesingle_encoder_sv_ttsdecoder_context_tts      use_kv_cache_for_inference)r8   r9   decoder_pretrain_synthesizercodecmodel_pathstricttitanet_large
model_namec                 S      g | ]}|qS r,   r,   .0idxr,   r,   r-   
<listcomp>       z,MagpieTTS_Model.__init__.<locals>.<listcomp>multi_encoder_context_ttstranscript_decoder_layers)rB   rA               context_decoder_layers)r   r:   r=   	   
      c                 S      g | ]}d qS Nr,   rM   _r,   r,   r-   rO          r   c                 S   rK   r,   r,   rL   r,   r,   r-   rO      rP           z@Alignment loss is not supported for decoder pretrain synthesizerUnsupported model type none)	reductionalignment_loss_scale)
loss_scaler,   )F
world_size	num_nodesnum_devicesr   r   r3   text_tokenizersgetr<   _setup_tokenizersr(   r+   lentokensbos_ideos_idnum_audio_tokens_per_codebookaudio_bos_idaudio_eos_idcontext_audio_bos_idcontext_audio_eos_idr>   
_tb_logger pad_context_text_to_max_durationrC   super__init__rangenum_audio_codebooksr    r   	Embeddingembedding_dim
ModuleListaudio_embeddingstext_embeddingr   TransformerdictencoderdecoderLineard_model
final_projr   restore_fromdiscriminatorevalfreeze_model_codec_modelnemo_asrmodelsEncDecSpeakerLabelModelr   _speaker_verification_modelspeaker_emb_dimspeaker_projection_layern_layersrR   rW   multi_encoder_mappingcontext_encoderrd   
ValueError
vocab_sizecontext_text_embeddingCrossEntropyLosscross_entropy_lossr   alignment_loss)selfr8   r9   r(   r+   num_tokens_tokenizer
num_tokensr~   r^   codec_modelspeaker_verification_modelr   layerrd   	__class__r,   r-   rx   q   s   














zMagpieTTS_Model.__init__c                 C   s   |  D ]}d|_qd S )NF)
parametersrequires_grad)r   modelparamr,   r,   r-   r      s   zMagpieTTS_Model.freeze_model Fc                    s^   t | dr
| jr
i S t |||}ddg}t| D ] t fdd|D r,| = q|S )N_no_state_dictr   r   c                       g | ]}| v qS r,   r,   rM   	substringkeyr,   r-   rO          z.MagpieTTS_Model.state_dict.<locals>.<listcomp>)r   r   rw   
state_dictlistkeysanyr   destinationprefix	keep_varsr   keys_substrings_to_excluder   r   r-   r      s   zMagpieTTS_Model.state_dictTc                    s   t  j|dd d S )NFrF   )rw   load_state_dict)r   r   rG   r   r,   r-   r      s   zMagpieTTS_Model.load_state_dictr   c                 C   s   t |j|j|d\}}||fS )Nr/   )r.   ri   r<   )r   r8   r#   r(   r+   r,   r,   r-   rk      s   

z!MagpieTTS_Model._setup_tokenizersc                 C   sZ   | j d u r*| jd u r| jjd u rd S | jj}| jjD ]}t|tr&|j} nq|| _ | j S r\   )ru   logger
experimentr9   loggers
isinstancer   )r   	tb_loggerr   r,   r,   r-   r      s   

zMagpieTTS_Model.tb_loggertargetc                 C   s0  |dkr| j }| j}n|dkr| j}| j}ntd| d| j  t g | jj	||d\}}tj
|d|ddf||j|jd}tj
|d|ddfd|j|jd}	tj|||	gd	d
}t|dD ]}
|||
d d ||
 d f< qk|d }| | fW  d    S 1 sw   Y  d S )Nr   contextzReceived audio_type of z. Must be `target` or `context`)audio	audio_lenr   r:   dtypedevicedimr=   )rr   rq   rt   rs   r   r   r   torchno_gradencodefullsizer   r   catry   long)r   r   r   
audio_typerr   rq   codes	codes_len
bos_tensor
pad_tensorrN   r,   r,   r-   audio_to_codes   s,   

$zMagpieTTS_Model.audio_to_codesc                 C   sn   | j   t $ d||| jk< d||| jk< | j j||d\}}||fW  d    S 1 s0w   Y  d S )Nr   )rm   
tokens_len)r   r   r   r   rq   rr   decode)r   r   r   r   r   r,   r,   r-   codes_to_audio  s   

$zMagpieTTS_Model.codes_to_audioc                 C   s`   d }t |dD ]}| j| |d d |d d f }|d u r"|}q	|| }q	||d }|S )Nr:   )ry   r   r~   )r   audio_tokensaudio_embeddingc	embeddingr,   r,   r-   embed_audio_tokens*  s    
z"MagpieTTS_Model.embed_audio_tokensc                 C   sN   | j   t  | j j||d\}}|W  d    S 1 s w   Y  d S )Ninput_signalinput_signal_length)r   r   r   r   forward)r   audio_16khzaudio_len_16khzr^   speaker_embeddingsr,   r,   r-   get_speaker_embeddings7  s   


$z&MagpieTTS_Model.get_speaker_embeddingsc                 C   s   t |}d }t|dD ]E}|| jj }|| jj }|d d d d ||f }	|d d |f }
| |	ddd|
}|| }| |  }|d u rN|}q|| }q||d }||fS )Nr:   r   r=   )r   ry   r   r8   rp   r   permutesum)r   logitsaudio_codesaudio_codes_lens	loss_masktotal_codebook_losscodebooksieicodebook_logitscodebook_targetscodebook_lossr,   r,   r-   compute_lossA  s"   
zMagpieTTS_Model.compute_lossc           
      C   s4   | j ||||||d}|d }| |d }	|	|fS )N)cond	cond_mask
attn_priorr   attn_probabilitiesoutput)r   r   )
r   dec_input_embeddeddec_input_maskr   r   r   r   decoder_outr   all_code_logitsr,   r,   r-   r   Y  s   zMagpieTTS_Model.forwardc                 C   s   g }t | jjD ].}|| jj }|| jj }|d d d d ||f }tj|dd}tj|dd}	||	 qtj|dd}t	|}
||

d }|S )Nr   r   r:   )ry   r8   rz   rp   r   softmaxargmaxr    stackr   	unsqueeze)r   r   r   	all_predsrN   r   r   r   codebook_probscodebook_preds
audio_maskr,   r,   r-   logits_to_audio_codesf  s   z%MagpieTTS_Model.logits_to_audio_codesffffff?P   c                 C   s   g }t | jjD ]M}|| jj }|| jj }|d d ||f }tj||ddd }	||	d d df dk }
| }td||
< tj	|| dd}t
|d}|| qtj|dd }|S )Nr   r   r   z-infr:   )ry   r8   rz   rp   r   topkr   clonefloatr   multinomialr    r   r   )r   all_code_logits_ttemperaturer  r   rN   r   r   r   codebook_logits_topkindices_to_removecodebook_logits_rescoredr  r  r,   r,   r-   sample_codes_from_logitsy  s    z(MagpieTTS_Model.sample_codes_from_logitsr   c           
      C   s   t  R t j|dd}|jdd}ttd|dD ]1}|| ||||  d || f }|  	 }t
|j}	| jj| d| |	| jdd qW d    d S 1 sYw   Y  d S )Nr:   r   rB   r   attention_matrix_HWC)global_stepdataformats)r   r   r   meanry   minr   detachcpunumpyr   Tr   	add_imager  )
r   attention_prob_matrixr   	text_lensr   dec_context_sizeattention_prob_matrix_meanrN   item_attn_matrixattn_npr,   r,   r-   log_attention_probs  s"   

"z#MagpieTTS_Model.log_attention_probsc                 C   sX  |  ||}| ||\}}| ||\}	}
d\}}|d ur-|jd dkr-| ||\}}ttd|dD ]r}||    	 }|	|    	 }|d ||  }|d |
|  }| j
jd| || j| jjd | j
jd| || j| jjd |d ur||    	 }|d ||  }| j
jd| || j| jjd q7d S )	NNNr=   rB   r   pred_audio_r  sample_ratetarget_audio_context_audio_)r  r   shapery   r  r   r	  r  r  r  r   	add_audior  r8   r&  )r   r   target_audio_codesaudio_codes_lens_targetcontext_audio_codescontext_audio_codes_lenspred_audio_codes
pred_audiopred_audio_lenstarget_audiotarget_audio_lenscontext_audiocontext_audio_lensrN   pred_audio_nptarget_audio_npcontext_audio_npr,   r,   r-   log_train_val_example  sD   z%MagpieTTS_Model.log_train_val_examplec                 C   s   |d u rd S | j j}| j j}||k r|S ||krd S t  d| }||||  ||   }|W  d    S 1 s;w   Y  d S )Nr   )r8   prior_end_stepprior_scaledown_start_stepr   r   )r   priorr  r:  r;  residual	new_priorr,   r,   r-   scale_prior  s&   
$zMagpieTTS_Model.scale_priorc                 C   sP   t j|dd}|jddd}|d d d d |d d d f }| j|||d}|S )Nr:   r   T)r   keepdim)attn_logprobin_lensout_lens)r   r   r  r   )r   attention_scoresr  
audio_lensr  attention_scores_combinedattention_scores_meanr   r,   r,   r-   compute_alignment_loss  s   z&MagpieTTS_Model.compute_alignment_lossc                  C   sD  d}d }d }d }d }d }d }d }	d }
d }d }d }| j dkrF|d }|d }| |}t|}| j||d d dd }|dd }| || j}| j dkrm|d	 }|d
 }| ||}| |}||	d }	|}
d }|}n'| j dv rd|v r|d }|d }n| j
|d |d dd\}}| |}| jr$|d }|d }| |}|d|dk rtj|d|d|d |d|jd}tj||gdd}n+|d|dkrtj|d|d|d |d|jd}tj||gdd}|d 	d	d }|| d| |  }|d  | d|d   |  }n|}|}t|}| j dkrM| j||d d dd }||g}	||g}
| j}|d g}nG| j dkr|d}|}|}|d urztj|d||d|jd}tj||gdd}|}	|}
d }|}|}n| j dkrntd| j  |	|
|||||||||dS )Nr   rD   textr  )r   r   r   align_prior_matrixr?   r   audio_lens_16khzr:   )rQ   r@   r-  r.  r4  r5  r   )r   context_text_tokenscontext_text_tokens_lensr=   r   r   has_text_contextr   rQ   r@   ra   )r   r   r   r   additional_decoder_inputaddtional_decoder_maskr  rI  r  r-  r.  )r>   r   r   r   rj   r?  r  r   r   r   r   r   r<   r   r   r   zerosr   r   r	  r   r   r   ) r   batchr  rP  rQ  r-  r.  _attn_priorr   r   r   r   rI  r  text_embedded	text_masktext_encoder_outtarget_audio_16khztarget_audio_lens_16khzr   speaker_embeddings_projectedcontext_audio_embeddedrL  context_text_lenscontext_text_embeddedpaddingrO  context_input_embeddedcontext_input_lenscontext_maskcontext_embeddingspadding_zerosr,   r,   r-   prepare_context_tensors  s   










z'MagpieTTS_Model.prepare_context_tensorsc                 C   s   d }d }|d urt |}t |}t|tr?dd |D }dd |D }g }	|D ]}
t |
}d|d d df< |	| q)n$t|t jrZt |}t |}	d|	d d df< d }n	tdt| ||	|||fS )Nc                 S   s   g | ]}t |qS r,   )r   
zeros_like)rM   	cond_itemr,   r,   r-   rO   o      z>MagpieTTS_Model.prepare_dummy_cond_for_cfg.<locals>.<listcomp>c                 S   r[   r\   r,   r]   r,   r,   r-   rO   p  r_   r:   r   zUnsupported type for cond )	r   re  	ones_liker   r   r    Tensorr   type)r   r   r   rP  additional_dec_maskdummy_additional_decoder_inputdummy_additional_dec_mask
dummy_condr   
dummy_mask	mask_itemmaskr,   r,   r-   prepare_dummy_cond_for_cfge  s*   





z*MagpieTTS_Model.prepare_dummy_cond_for_cfgr   c           !         s    |}d}d|vr |d |d \}}n|d }|d }|d d d d d df }|d d d d dd f }|d  }	}
t|	} jdd	d	koV|d
koV|d d u}|r|td  jjk r| 	|d |d |d |d \}}}}}d}na|d }|d }|d }|d }|d }|d
kr݈ jdd	d	krtd dk r݈ jd jj
}tjd|| |jd}||d }tjdd|df|jd jjk}|| ||   } |}|d d urtj||gdd}tj||gdd}n|}|} j||||||d d\}}|d }|d d |d d d f } |||
\}}d } jjd	krP|sP|d } fddt|D } |||
|}|| } n|} ||| |||||
|d |d |d |d  |d!S )"NFr   r   rE  r   r   r:   cfg_unconditional_probr`   r   r   r   rP  rQ  Tr   decoder_input_dropout_probg      ?dec_random_input_maxr   rN  r=   r   r   r   r   r   r   r   r   r  r  c                    &   g | ]\}}| j v r|d  d qS )cross_attn_probabilitiesr:   rR   rM   	layer_idxattnr   r,   r-   rO     
    

z1MagpieTTS_Model.process_batch.<locals>.<listcomp>rI  r-  r.  )r   	attn_infolossr   r   r   audio_codes_targetr,  rI  r  r-  r.  r  )rd  r   r   r8   rj   r   randitemrs  rr  rp   randintr   r   r   rt  r   r   r   r   rd   	enumeraterH  )!r   rS  r#   context_tensorsdisable_alignment_lossr   r   audio_codes_inputr  audio_codes_lens_inputr,  audio_codes_maskuse_cfgr   r   rP  additional_decoder_maskr   max_codebook_valrandom_audio_tokensdec_dropout_maskaudio_codes_embeddedr   r   r   r  r  r   r   r   r  cross_attention_scoresr  r,   r}  r-   process_batch  s   



	

zMagpieTTS_Model.process_batchc                 C   sv   |  |}|d }|d }| jd|ddd | jdddkr0|d }|d ur0| jd	|ddd | jd
|ddd |S )Nr  r   train_codebook_lossTprog_bar	sync_distrs  r`   r   train_alignment_loss
train_loss)r  logr8   rj   )r   rS  	batch_idxbatch_outputr  r   r   r,   r,   r-   training_step  s   
zMagpieTTS_Model.training_stepc                    s   j |dd}|d }|d }|d }|d }|d }|d }	|d	 }
|d
 }|d }|d }|d }|d u r?tjd|jd}|dkrx jdkrx |||	|
|  jdkrxt| jd  d dkrx fddt	|D } j
||	|d|d |||d} j| |S )Nvalr/   r  r   r   r   r  r,  r-  r.  r  r  r  r`   rN  r   rD   rx  r:   c                    rw  )rx  r   ry  rz  r}  r,   r-   rO     r~  z3MagpieTTS_Model.validation_step.<locals>.<listcomp>val_)r   r  )val_lossval_codebook_lossval_alignment_loss)r  r   tensorr   global_rankr9  r>   rl   rR   r  r"  validation_step_outputsr    )r   rS  r  r  r  r   r   r   r  r,  r-  r.  r  r  r  cross_attention_probs
val_outputr,   r}  r-   validation_step  sH   


	zMagpieTTS_Model.validation_step  r   c           +   
      s  t   | jj| jd | |}|d }t j|d| jj	df| j
|jd }	t j|dfd|jd }
|	}t|
}g }i  |r[| |d |d |d |d	 \}}}}}tD ]X}|d
 dkrotd|  | |}|d d urt j|d |gdd}t j|d	 |gdd}n|}|}|r*|d}t|d trdd t|d |D }dd t|d |D }nt j|d |gdd}t j|d |gdd}t j||gdd}t j||gdd}|d ur|||d d |df< |||d d |df< | j||||d |d d\}}|d | }||d  }d| | ||  }n| j|||d |d d |d d\}}|d d dd d f } | j| ||d}!| j| dd}"t|"dD ]0}#|# vr|"|# d  }$|!|# d  }%|$| jks|%| jkrtd|#| | |#< q^||! t j||!dgdd}|
d }
t|
}t |dkrtd  nq_t j|dd}& fddt|dD }'t j|'|jd }(|  |&|(\})}*t j!"  |)|*|&|(fW  d    S 1 sw   Y  d S )N)	use_cacherI  r   r:   rN  r   r   rP  rQ     zDecoding timestep r   c                 S   "   g | ]\}}t j||gd dqS r   r   r   r   )rM   rf  dummy_cond_itemr,   r,   r-   rO   Y      z/MagpieTTS_Model.infer_batch.<locals>.<listcomp>c                 S   r  r  r  )rM   cond_mask_itemdummy_cond_mask_itemr,   r,   r-   rO   ]  r  r   rv  r   )r  r  {Gz?)r  z'End detected for item {} at timestep {}zAll ends reachedc                    s   g | ]}  |qS r,   )rj   rL   end_indicesmax_decoder_stepsr,   r-   rO     s    )#r   r   r   reset_cacherC   rd  r   r   r8   rz   rq   r   r   r   rr  ry   printr   r   r   r   zipr   r  r  rr   formatr    r   rl   r   r  r   cudaempty_cache)+r   rS  r  r  r  r  	cfg_scaler  rI  audio_codes_bosr   r  r  all_predictionsrn  dummy_cond_maskrl  dummy_addition_dec_maskr^   rN   r  _audio_codes_embedded_audio_codes_mask
batch_sizecfg_condcfg_cond_maskcfg_audio_codes_embeddedcfg_audio_codes_maskcombined_logitscond_logitsuncond_logitsr   r  audio_codes_nextall_codes_next_argmaxitem_idx
pred_tokenpred_token_multinomialpredicted_codespredicted_lenspredicted_codes_lenspredicted_audiopredicted_audio_lensr,   r  r-   infer_batch.  s   
	



	




&zMagpieTTS_Model.infer_batchc              
   C   sR  t   | jj}| jdd}| jdd}| jdd}| jdd}| j|| jd	d
||||d\}}	}
}t|dD ]V}|| 	 
   }|d |	|  }|| | }| jjd||| jjd | jj}tj|d}tj|s~t| tj|d| j d| d}t||| jj q@W d    d S 1 sw   Y  d S )Ninference_temperaturer  inference_topkr  inference_use_cfgFinference_cfg_scaler   r  r  r  r  r  r  r  r   r  r%  audiospredicted_audioRankr^   .wav)r   r   _test_dlr  r8   rj   r  ry   r   r	  r  r  r  r   r*  r&  r   log_dirospathjoinexistsmakedirsr  sfwrite)r   rS  r  test_dl_batch_sizer  r  r  r  r  r  r  r  rN   predicted_audio_npr  r  	audio_dir
audio_pathr,   r,   r-   	test_step  s@   

"zMagpieTTS_Model.test_stepc                    sh    fdd}|d}|d}|d} j d|ddd  j d|ddd  j d|ddd  j  d S )Nc                    s   t  fddjD  S )Nc                    s   g | ]}|  qS r,   r,   )rM   xr   r,   r-   rO     r   zMMagpieTTS_Model.on_validation_epoch_end.<locals>.<lambda>.<locals>.<listcomp>)r   r   r  r  r   r}  r   r-   <lambda>  s    z9MagpieTTS_Model.on_validation_epoch_end.<locals>.<lambda>r  r  r  Tr  r  r  clear)r   collectr  r  r  r,   r}  r-   on_validation_epoch_end  s   z'MagpieTTS_Model.on_validation_epoch_endc                 C   sr   t |jf| j| j| j| j| j| j| jj	| jj
| jj| jj|| jj| j| jj| jjd}| jdk|_| jj|_|S )N)rn   ro   rq   rr   rs   rt   rz   codec_model_downsample_factorprior_scaling_factorload_cached_codes_if_availabler2   r"   rv   context_duration_mincontext_duration_maxr?   )r   r1   rn   ro   rq   rr   rs   rt   r8   rz   r  r  r  r<   rv   r  r  r>   load_16khz_audiori   r'   )r   r8   r2   r1   r,   r,   r-   get_dataset  s.   zMagpieTTS_Model.get_datasetc                 C   sz   | j |dd}|j|jj| jjd}d}|jjdkr&d}| | j\|_	|_
tjjj|f|j|d|jt|d}|S )	Nr   r2   )rf   Tr   F)
collate_fnsamplerr6   persistent_workers)r  get_samplerdataloader_paramsr  r9   rf   num_workersrk   r8   r3   r+   r   utilsdata
DataLoaderr  r6   )r   r8   r1   r  r  data_loaderr,   r,   r-   _setup_train_dataloader  s$   
z'MagpieTTS_Model._setup_train_dataloaderc                 C   sf   | j |dd}d}|jjdkrd}| j| jdd\|_|_tjj	j
|fd|ji|jt|d}|S )	Nr   r  Tr   Fr/   r  r  )r  r  r   rk   r8   r3   r+   r   r  r  r  r  r6   )r   r8   r1   r  r  r,   r,   r-   _setup_test_dataloader  s    
z&MagpieTTS_Model._setup_test_dataloaderc                 C      |  || _d S r\   )r  	_train_dlr   r8   r,   r,   r-   setup_training_data     z#MagpieTTS_Model.setup_training_datac                 C   r  r\   )r  _validation_dlr	  r,   r,   r-   setup_validation_data  r  z%MagpieTTS_Model.setup_validation_datac                 C   r  r\   )r  r  r	  r,   r,   r-   setup_test_data  r  zMagpieTTS_Model.setup_test_datareturnc                 C   s   g S r\   r,   )clsr,   r,   r-   list_available_models  s   z%MagpieTTS_Model.list_available_modelsr\   Nr   F)T)r   )r   )r  r  )r   r   r#  )r   r   )r  r  r  Fr   )+__name__
__module____qualname____doc__r   rx   r   r   r   rk   propertyr   r   r   r   r   r   r   r  r  r"  r9  r?  rH  rd  rr  r  r  r  r  r  r  r  r  r  r
  r  r  classmethodr   r   r  __classcell__r,   r,   r   r-   r7   X   sN    f






*
z
m
/z!
r7   c                       sL   e Zd ZdZddeddf fddZdd	 Zd
d Zdd Zdd Z	  Z
S )MagpieTTS_ModelInferencea1  Small override of MagpieTTS_Model for parallel multi-GPU inference and metrics calculation.
    This class is used in 'test' mode and leverages trainer.test() for multi-GPU/multi-node inference.
    Saves the predicted audio files and logs the CER/WER metrics as individual json files for each audio.
    Nr8   r9   r   c                    s   t  || |dddkr"tjjjdd| _| j  | j	  tjj
jdd| _| j  | j	  |ddrVdd	lm}m} |d
| _|d
| _| j	  d S d S )Npref_set_languageenznvidia/parakeet-tdt-1.1brI   rH   load_whisper_modelFr   )WhisperForConditionalGenerationWhisperProcessorzopenai/whisper-large-v3)rw   rx   rj   r   r   EncDecRNNTBPEModelr   eval_asr_modelfreezer   r   eval_speaker_verification_modeltransformersr  r   whisper_processorwhisper_model)r   r8   r9   r  r   r   r,   r-   rx   #  s$   



z!MagpieTTS_ModelInference.__init__c           
      C   s   t j|dd\}}|r| jj|dnd }| j||ddj}|| j}t  | j	j
||d}W d    n1 s:w   Y  | jj|dd}|d	 }	|	S )
N>  )sr)languagept)sampling_ratereturn_tensors)forced_decoder_idsT)skip_special_tokensr   )librosaloadr&  get_decoder_prompt_idsinput_featurestor   r   r   r'  generatebatch_decode)
r   audio_filepathr*  speech_arrayr,  r.  inputspredicted_idstranscriptionresultr,   r,   r-   transcribe_with_whisper9  s   
z0MagpieTTS_ModelInference.transcribe_with_whisperc                 C   s   |  }|dd}|dd}|dd}|dd}|dd}d| }|tddtj}|dd	 |d
d |S )zc
        Normalizes text for CER/WER calculation.
        Taken from hallucination_eval.py
        ,r   - ';.zh t t phttpzw w wwww)	lowerreplacer  split	translatestr	maketransstringpunctuation)r   
input_textlower_case_textno_comma_textno_dash_textsingle_space_textr,   r,   r-   process_textD  s   z%MagpieTTS_ModelInference.process_textc                 C   s   g }g }|D ]-}t |\}}|dkrtjj||dd}tj|tj| jd}|	| |	|
d qtj|| jd }t|  }	t||	gd}| jj||d\}
}|S )Nr(  )orig_sr	target_srr   r   rN  )max_lensr   )r  readr0  coreresampler   r  float32r   r    r   r   intmaxr  r   r$  r   )r   	filepathsaudio_batchaudio_lengthsfilepathr   r)  audio_tensorbatch_audio_lensmax_audio_lenr^   r   r,   r,   r-   %get_speaker_embeddings_from_filepaths_  s    

z>MagpieTTS_ModelInference.get_speaker_embeddings_from_filepathsc           #         s   t     jj} jdd} jdd} jdd} jdd} j| jd	d
||||d\}}	}
}g }g }d}t|dD ]}|| 	 
   }|d |	|  }|| | } jj}tj|d}tj|szt| tj|d j d| d}|t| jj  t|| jj |
|  t j}|d d d || f }t |tj|d j d| d || |s[t   z3 jdddkr jj|t|dd } fdd|D }n fdd|D } fdd|D }W n8 ty> } z+|	| dk   sJ d|	 t!"d|  t!"d d}W Y d }~W d    qGd }~ww  #|} #|d }W d    n	1 sVw   Y  qGt|dD ]}|s|| | }|| } $|d | }t%|g|gdd }t%|g|gdd }||   }||   }t&'||t&j()|t&j()|  } nd}d}d!} d"} $|d | }t	|t	||| t	| ||d#}!t*tj|d j d| d$d%}"t+,|!|" W d    n	1 sw   Y  qcW d    d S 1 s	w   Y  d S )&Nr  r  r  r  r  Fr  r   r  r  r  r   r  r  r^   r  z	_codes.ptr  r  )r  c                       g | ]}  |qS r,   rS  rM   
transcriptr}  r,   r-   rO     rg  z6MagpieTTS_ModelInference.test_step.<locals>.<listcomp>c                    s   g | ]
}  | jjqS r,   )r=  r8   r  )rM   r  r}  r,   r-   rO     s    c                    re  r,   rf  rg  r}  r,   r-   rO     rg  i  zYExpected short audio file to be the only cause of ASR errors, but got error with lengths z$Exception during ASR transcription: zkSkipping processing of the batch; generating metrics indicating a WER of 100% and Speaker Similarity of 0.0Taudio_filepaths	raw_texts)use_cerr`   z	<INVALID>)cer_gtwer_gtdurationspk_similaritypred_transcriptgt_transcriptz_metrics.jsonw)-r   r   r  r  r8   rj   r  ry   r   r	  r  r  r  r   r  r  r  r  r  r  r  r    rl   r&  r  r  rj  int16saver"  
transcribe	Exceptionr   r   warningrd  rS  r   npdotlinalgnormopenjsondump)#r   rS  r  r  r  r  r  r  r  r  r  r  predicted_audio_pathsaudio_durationsbatch_invalidrN   r  r  r  r  r  predicted_codes_torchpred_transcriptsepred_speaker_embeddingsgt_speaker_embeddingsrp  rq  rl  rm  spk_embedding_predspk_embedding_gtro  item_metricsfr,   r}  r-   r  t  s   







	$z"MagpieTTS_ModelInference.test_stepr\   )r  r  r  r  r   rx   r=  rS  rd  r  r  r,   r,   r   r-   r    s    r  c                       s|   e Zd ZdZddeddf fddZd fd
d	ZdddZ								dddZdd Z	dd Z
dd Zdd Z  ZS ) MagpieTTS_ModelDPOa4  Extends MagpieTTS_Model to support Direct Preference Optimization (DPO) training.
    This class is used for training the model with preference-based losses, including DPO, RPO, and IPO losses.
    It maintains a frozen reference model to compare log probabilities between policy and reference outputs.

    Nr8   r9   r   c                    s   t  || t|}t| d|_d|_W d   n1 s!w   Y  t|d| _t	d | j
tj|jddd  | | j | j  d| j_t	d dS )	zInitialize the MagpieTTS_ModelDPO class.

        Args:
            cfg (DictConfig): Configuration object containing model hyperparameters.
            trainer (Trainer, optional): Trainer instance for model training.
        N)r8   z'Loading reference model from checkpointr  )map_locationr   Tz!Reference model loaded and frozen)rw   rx   copydeepcopyr   train_dsvalidation_dsr7   _reference_modelr  r   r   r1  reference_model_ckpt_pathr   r   r   )r   r8   r9   ref_model_cfgr   r,   r-   rx     s   


zMagpieTTS_ModelDPO.__init__r   Fc                    sJ   t  |||}g d}t| D ] t fdd|D r"| = q|S )a  Return the state dictionary excluding non-trainable components.

        Excludes state keys related to `_speaker_verification_model`, `_codec_model`, and `_reference_model`.

        Args:
            destination (dict, optional): The destination dictionary for the state_dict.
            prefix (str, optional): Prefix to prepend to keys.
            keep_vars (bool, optional): If True, tensors in the returned dictionary will not be detached.

        Returns:
            dict: Filtered state dictionary.
        )r   r   r  c                    r   r,   r,   r   r   r,   r-   rO     r   z1MagpieTTS_ModelDPO.state_dict.<locals>.<listcomp>)rw   r   r   r   r   r   r   r   r-   r     s   zMagpieTTS_ModelDPO.state_dictc                 C   sL   t j|dd|ddd}|r|| d|d S || dS )a  Compute the log probabilities of the given labels under the given logits.

        Args:
            logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
            labels: Labels for which to compute the log probabilities. Label tokens with a value of -100 are ignored.
                Shape: (batch_size, sequence_length)
            average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return
                the sum of the log probabilities of the (non-masked) tokens.

        Returns:
            A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under
            the given logits.
        r   r=   )r   index)r   gatherlog_softmaxr   squeezer   )r   r   labelsr   average_log_probper_token_logpsr,   r,   r-   _get_batch_logps  s   "z#MagpieTTS_ModelDPO._get_batch_logps皙?r   r   dpoc                 C   sX  || }|| }|rd}|| }|
dkr|dd|   d }nx|
dkr^t jj|| }t jj| | }|||  }t jj|}t jj| }t |||  t |||   }n9|
dkrq|||  }|| | d }n&|
dkrt jj}|||  d|	  || | |	  }ntd|
|||   }|||   }|||fS )	a  Compute the DPO loss for a batch of policy and reference model log probabilities.

        Args:
            policy_chosen_logps: Log probabilities of the policy model for the chosen responses.
                Shape: (batch_size,)
            policy_rejected_logps: Log probabilities of the policy model for the rejected responses.
                Shape: (batch_size,)
            reference_chosen_logps: Log probabilities of the reference model for the chosen responses.
                Shape: (batch_size,)
            reference_rejected_logps: Log probabilities of the reference model for the rejected responses.
                Shape: (batch_size,)
            beta: Temperature parameter for the DPO loss, typically something in the range of 0.1 to 0.5. We ignore
                the reference model as beta -> 0.
            label_smoothing: conservativeness for DPO loss, which assumes that preferences are noisy (flipped with
                probability label_smoothing)
            ipo: If True, use the IPO loss instead of the DPO loss.
            reference_free: If True, we ignore the _provided_ reference model and implicitly use a reference model
                that assigns equal probability to all responses.

        Returns:
            A tuple of three tensors: (losses, chosen_rewards, rejected_rewards).
            The losses tensor contains the DPO loss for each example in the batch.
            The chosen_rewards and rejected_rewards tensors contain the rewards for the chosen and rejected
            responses, respectively.
        r   ipor:   r=   rporpo_sqr  zloss type {} is not implemented)r   r   
functional
logsigmoidexpNotImplementedErrorr  r  )r   policy_chosen_logpspolicy_rejected_logpsreference_chosen_logpsreference_rejected_logpschosen_gt_rewardsrejected_gt_rewardsbetagt_reward_scalelabel_smoothing	loss_typereference_freepi_logratiosref_logratiosr   losseslogbeta_hat_chosenlogbeta_hat_rejectedgt_rewards_deltalogalpha_hat_chosenlogalpha_hat_rejectedFchosen_rewardsrejected_rewardsr,   r,   r-   preference_loss&  s:   '*
z"MagpieTTS_ModelDPO.preference_lossc           #      C   s  |d }|d }|  |}|  |}t  | j |}| j |}W d   n1 s-w   Y  d}d}	d}
d}t| jjD ]}|| jj }|| jj }|d dddd||f }|d dddd||f }|d dddd||f }|d dddd||f }|d dd|f }|d dd|f }| |||d }| |||d }t  | |||d }| |||d }W d   n1 sw   Y  |du r|}|}	|}
|}q@||7 }|	|7 }	|
|7 }
||7 }q@|d }|d }t	|dksJ t	|dk sJ | j
||	|
|||| jd	d
| jddd\}}}| }|  }| jdd}| jdd} || ||   }!|d }"|"durV|!|"7 }!|!|||"dS )a  Process a batch for Direct Preference Optimization (DPO) training.

        This method computes the preference loss by comparing the model's policy outputs with a frozen reference model.
        It processes chosen and rejected samples, extracts log probabilities for each codebook, and calculates the
        preference loss based on the difference in likelihoods between chosen and rejected responses.

        Args:
            batch_chosen_rejected (dict): A dictionary containing two keys:
                - 'chosen': The batch of chosen responses.
                - 'rejected': The batch of rejected responses.

        Returns:
            dict: A dictionary containing:
                - 'loss': The total computed loss.
                - 'pref_loss': The preference loss.
                - 'sft_loss': The supervised fine-tuning loss.
                - 'alignment_loss': The alignment loss, if applicable.
        chosenrejectedNr   r  r   rewardsr:   dpo_betar  dpo_loss_typer  )r  r  r  r  dpo_pref_loss_weightr   dpo_sft_loss_weightr`   r   )r  	pref_losssft_lossr   )r  r   r   r  ry   r8   rz   rp   r  allr  rj   r  )#r   batch_chosen_rejectedbatch_chosenbatch_rejectedmodel_output_chosenmodel_output_rejectedreference_model_output_chosenreference_model_output_rejectedchosen_policy_logprobsrejected_policy_logprobschosen_ref_logprobsrejected_ref_logprobscodebook_idxr   r   codebook_logits_chosencodebook_logits_rejectedref_codebook_logits_chosenref_codebook_logits_rejectedcodebook_labels_chosencodebook_labels_rejectedcodebook_log_probs_chosencodebook_log_probs_rejectedref_codebook_log_probs_chosenref_codebook_log_probs_rejectedrewards_chosenrewards_rejectedr  r  r  r  pref_loss_weightsft_loss_weightr  r   r,   r,   r-   process_batch_dpow  s   










z$MagpieTTS_ModelDPO.process_batch_dpoc                 C   sT   |  |}| jd|d ddd | jd|d ddd | jd|d ddd |d S )	zPerform a training step using DPO loss.

        Args:
            batch (dict): Batch data containing chosen and rejected samples.
            batch_idx (int): Index of the batch.

        Returns:
            Tensor: Training loss.
        r  r  Tr  train_pref_lossr  train_sft_lossr  )r  r  )r   rS  r  dpo_outputsr,   r,   r-   r    s
   

z MagpieTTS_ModelDPO.training_stepc                 C   sD   |  |}|d }|d }|d }|d }| j||||d dS )zPerform a validation step using DPO loss.

        Args:
            batch (dict): Validation batch data.
            batch_idx (int): Batch index.
        r  r  r  r   )r  val_pref_lossval_sft_lossr  N)r  r  r    )r   rS  r  r  r  r  r  r  r,   r,   r-   r    s   
z"MagpieTTS_ModelDPO.validation_stepc                    s    fdd}|d}|d}|d}|d} j d|ddd  j d|ddd  j d|ddd |d	ur> j d|ddd  j  d	S )
z?Aggregate validation losses at the end of the validation epoch.c                    sT   g } j D ]}||  d ur|||   q|tjd jd qt|}| S )Nr`   rN  )r  r    r   r  r   r   r  )r   valuesr  stacked_valuesr}  r,   r-   r    s   

z;MagpieTTS_ModelDPO.on_validation_epoch_end.<locals>.collectr  r  r  r  Tr  Nr  )r   r  r  r  r  r  r,   r}  r-   r    s   
z*MagpieTTS_ModelDPO.on_validation_epoch_endr\   r  )F)NNr  r   r   r  F)r  r  r  r  r   rx   r   r  r  r  r  r  r  r  r,   r,   r   r-   r    s"    

Qjr  r  )9r  r}  r  rL  typingr   r0  r  rx  	soundfiler  r   hydra.utilsr   	omegaconfr   r   pytorch_lightningr   pytorch_lightning.loggersr   r   torch.utils.datar	   r%  r
   r   nemo.collections.asrcollectionsasrr    nemo.collections.asr.metrics.werr   @nemo.collections.common.tokenizers.text_to_speech.tts_tokenizersr   (nemo.collections.tts.losses.aligner_lossr   nemo.collections.tts.modelsr   nemo.collections.tts.modulesr   (nemo.collections.tts.parts.utils.helpersr   r   2nemo.collections.tts.parts.utils.tts_dataset_utilsr   nemo.core.classesr   nemo.core.classes.commonr   
nemo.utilsr   r.   r6   r7   r  r  r,   r,   r,   r-   <module>   sP   
       L A