o
    ߥig                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dl	Z
d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZm Z  d dl!m"Z" e" Z#dd Z$				dddZ%dddZ&G dd dZ'dS )    N)OrderedDict)Lock)get_am_datasetsget_voc_datasets)model_builder)criterion_builder)GAN_TrainerSambert_Trainerdistributed_init)KanTtsLinguisticUnit)
DataLoader)TtsCustomParams)TtsModelConfigurationExceptionTtsModelNotExistsException)
get_loggerc                 C   s   t dd |  D S )Nc                 s   s    | ]
}|j r| V  qd S )N)requires_gradnumel).0p r   U/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/tts/voice.py	<genexpr>   s    z#count_parameters.<locals>.<genexpr>)sum
parameters)modelr   r   r   count_parameters   s   r      333333?mean_stdc           	      C   s(  |dkrP|}| d d df }| d d df }d|||k < d|||k< ||dd d d f  |ddd d f  }||||k < || d d df< || d d df< | S |}| d d df }| d d df }d|||k < d|||k< ||d |d   |d  }||||k < || d d df< || d d df< | S )Nr                 ?   r   r   )	melf0_thresholduv_threshold	norm_type
f0_featuref0_mvnf0uvf0_global_max_minr   r   r   	denorm_f0    s2   ,r-   c                 C   sN   |   }t| d d df |k d }d|d d df< d|d d df |< |S )Nr    r   r"   r!   )clonetorchwhere)r$   	thresholdres_melindexr   r   r   binarizeF   s
   r4   c                   @   sn   e Zd Zdi ddfddZdd Zdd	 Zd
d Zdd Zdd Zde	 fddZ
de	 fddZdd ZdS )VoiceNTFc                 C   s  || _ || _|| _|| _tj std| _d| _ndtj	j
_t \| _| _| _| _t|dkr|tj | _|tj | _tj| jsMtj|| j| _tj| js]tj|| j| _|tj }|tj }tj|sttj||}tj|stj||}| || _| || _|tjd| _ tj| j stj|| j | _ |tj!d| _"tj| j"stj|| j"| _"|tj#d| _$tj| j$stj|| j$| _$|tj%d| _&tj| j&stj|| j&| _&nNtj|d| _$tj|d	d
| _tj|dd
| _tj|d	d| _ | tj|d	d| _| tj|dd| _tj|d	d| _&tj|ddd| _"t'(d| j d| j  t'(d| j$  t'(d| j  t'(d| j  t'(d| j  d| j"  t'(d| j&  tj)| jst*dtj)| jst*dt| jdkrt+dt| jdkrt+dt,| jd}t-j.|t-j/d| _0W d    n	1 sw   Y  t,| jd}t-j.|t-j/d| _1W d    n	1 sw   Y  d| j0vrt*d| j0d dd| _2d| _3t4 | _5t6| j0| _7| j78 | _9| jr0t:d }	| j9; D ]\}
}|
|	v r.|d! | j9|
< q| j0d" d# d$ <| j9 | j0d" d# d$ d%d| _=| j=rk| jsktj)| j sdt*d&| j  d't>.| j | _?| j0d" d# d$ d(d| _@| j@r| js| j0d" d# d$ d)d*| _A| jAd*krtj)| j&st+d+| j& d't>.| j&| _Bd S | j0d" d# d$ d,d-}| j0d" d# d$ d.d/}||g| _Bd S d S d S )0NcpuFTr   zse.npyzse.onnxzaudio_config.yamlzmvn.npyamconfig.yamlvocckptsez
am_config=z voc_config=zaudio_config=z	am_ckpts=z
voc_ckpts=zse_path=z se_model_path=z	mvn_path=z,modelscope error: am configuration not foundz-modelscope error: voc configuration not foundz)modelscope error: am model file not foundz*modelscope error: voc model file not foundrLoaderlinguistic_unitzno linguistic_unit in am configlanguagePinYin)sytonesyllable_flagword_segmentemotionspeakerr#   ModelKanTtsSAMBERTparamsSEzse enabled but se_file: not existsNSFnsf_norm_typer   zf0_mvn_file: nsf_f0_global_minimumg      >@nsf_f0_global_maximumg     І@)C
voice_name
voice_pathignore_maskis_trainr/   cudais_availabledevicedistributedbackendscudnn	benchmarkr
   
local_rank
world_sizelenr   	AM_CONFIGam_config_path
VOC_CONFIGvoc_config_pathospathisabsjoinAM_CKPTVOC_CKPT	scan_ckptam_ckpts	voc_ckptsgetSE_FILEse_pathSE_MODELse_model_pathAUIDO_CONFIGaudio_configMVN_FILEmvn_pathloggerinfoexistsr   r   openyamlloadr>   	am_config
voc_config	lang_typemodel_loadedr   lockr   	ling_unitget_unit_sizeling_unit_sizesetitemsupdate	se_enablenpr;   
nsf_enablerN   r(   )selfrQ   rR   custom_ckptrS   rT   am_ckptvoc_ckptf
target_setkvrO   rP   r   r   r   __init__Q   sL  







zVoice.__init__c                 C   s   |}d}t j|sd}t j|}t |}t|dkri S i }|D ]=}t|d dkr.q#|dd  dkr`|dd dkr`|d	d }t|d
d }t j||}	|r\|	|kr\q#|	||< q#t	t
| }
|
S )NFTr      .pth
   
checkpoint._r    )rc   rd   isdirdirnamelistdirr^   splitintrf   r   sortedr   )r   	ckpt_pathselect_targetinput_not_dirfilelistckptsfilenamefilename_prefixidxrd   odr   r   r   ri      s,   
 zVoice.scan_ckptc                 C   sd   t | j| j\| _}}| jd | _tj| jtt	| j | jd}| jj
|d dd | j  d S )NrI   map_locationr   F)strict)r   r{   rW   am_modelr7   r/   rz   rj   nextreversedload_state_dicteval)r   r   
state_dictr   r   r   load_am   s   zVoice.load_amc                 C   s   ddl m} |di | jd d d | _tj| jtt| j | j	d}| j
|d d  | jd d d d	 d
krFddlm} | | _| j  | j | j	 d S )Nr   )	GeneratorrH   r   rJ   r   r   	generatorout_channelsr#   )PQMFr   )kantts.models.hifigan.hifiganr   r|   	voc_modelr/   rz   rk   r   r   rW   r   kantts.models.pqmfr   remove_weight_normr   to)r   r   statesr   r   r   r   load_vocoder   s   
zVoice.load_vocoderc              	   C   s  | j J t 3 | j|}d}| j r1t||  | j	}tj
|gddd}nNt||  | j	}|d }t||  | j	}|d }t||  | j	}|d }t||  | j	}	tj
||||	gddd}|d }t||  | j	d}
|d }| jrt| jjt|| dd | j	dd d d dd d f }nt||  | j	dd d d df }td| j	 |
d d }| |d d d dd d f |
d d d df ||}|d }|d }t|d  }|dd |d d f  }| jr-t|| j| jd}|W  d    W  d    S 1 sAw   Y  W d    d S 1 sRw   Y  d S )	Nr   r    )dimr#   )axispostnet_outputsLR_length_rounded)r'   r(   )r   r/   no_gradr   encode_symbol_sequence
using_byte
from_numpylongr   rW   stack	unsqueezer   r;   repeatr^   floatzerossizer7   r   itemr6   r   r-   rN   r(   )r   
symbol_seqinputs_feat_lstinputs_feat_indexinputs_byte_indexinputs_ling	inputs_syinputs_toneinputs_syllable	inputs_ws
inputs_emo
inputs_spk
inputs_lenresr   r   valid_lengthmel_postr   r   r   
am_forward  s   



.$zVoice.am_forwardc                 C   s   t  ; || j}| jjrt|}|ddd}| |}t	| jdr-| j
|}|d  }|W  d    S 1 sBw   Y  d S )Nr#   r   pqmfr    )r/   r   r   rW   r   r   r4   	transposer   hasattr	synthesisviewr6   numpy)r   melspecxyr   r   r   vocoder_forwardQ  s   

$zVoice.vocoder_forwardc           !         s  t d t| jdkrtd|dd}|dk r!|dd}n|dd}|d	d}	t| jd
}
tj	|
tj
d}W d    n1 sFw   Y  t|d
}
|tj	|
tj
d || W d    n1 skw   Y  d }|rtt| j}| j| }tj|std| dn|| jvrtd| | j| }|	dkr|	| }||d< t d|  tdt |d< ddlm} ||d< ttj|dd}
tj||
tjd d W d    n1 sw   Y  | D ]\}}t | d|  q| jrtj |d< d|d< | jrd}d}nd}d}|d d d d d  fd!d"|D }t||||d# d$| d%\}}t d&t| d' t d(t| d' d d d)}| jr}dd*lm } ||| j!dd+|d,< |ry||| j!dd+nd |d-< t"|| jrdnd|j#|d. |d/ |d, |d0 d1}|rt"|| jrdnd|j#|d. |d/ |d- |d0 d1nd }|j$% }|d d d | t&|| j'| j(| j\}}}t)|| j'}t*|||||| j'||||||d2 |d3 |d4 |d5 d6}|d ur
|+|dd t d7| d' z|,  W d S  t-t.fyL }  z,t j/| dd8 |0tjtj|d9d:|j1 d; t d<|j1 d= W Y d } ~ d S d } ~ ww )>NzTRAIN SAMBERT....r   "resume pretrain but model is emptyresume_from_stepsr    resume_from_latestTFtrain_stepsr<   r=   latest model:rL   no such model from steps:train_max_stepsTRAINING steps: %Y-%m-%d %H:%M:%Screate_time__version__modelscope_versionr8   wDumperdefault_flow_style = rankrX   r!   g{Gz?rH   rI   rJ   FPc                    s"   g | ]}t j| sd ndqS )zraw_metafile.txtzfprm_metafile.txt)rc   rd   rf   )r   d	fp_enabler   r   
<listcomp>  s    
z'Voice.train_sambert.<locals>.<listcomp>allow_cacher"   )split_ratioThe number of training files = r   !The number of validation files = trainvalidDistributedSamplerdatasetnum_replicasshuffler  r  
batch_sizenum_workers
pin_memoryr  
collate_fnr  r  samplerr  save_interval_stepseval_interval_stepslog_interval	grad_norm)configr   	optimizer	scheduler	criterionrW   r  train_loadervalid_loader	max_stepssave_dirsave_intervalvalid_intervalr  	grad_clipSuccessfully resumed from exc_infor:   checkpoint-r    Successfully saved checkpoint @ steps.)2ru   rv   r^   rj    TtsTrainingInvalidModelExceptionrl   rx   rr   ry   rz   r>   r   r   r   rc   rd   rw   timestrftime	localtime
modelscoper   rf   dumpr   r   rX   r/   get_rankr   r   torch.utils.data.distributedr  r]   r   r  r   r   r   rW   r\   r   r	   load_checkpointr  	ExceptionKeyboardInterrupterrorsave_checkpointsteps)!r   work_dir	stage_dirdata_dirconfig_pathignore_pretrainhparams
from_stepsfrom_latestr   r   r  resume_fromr   r   keyvaluevalid_enablevalid_split_ratio	meta_filetrain_datasetvalid_datasetr  r  train_dataloadervalid_dataloaderr   r   r  r  r  trainerer   r   r   train_sambert]  s"  








	



zVoice.train_sambertc                 C   s  t d t| jdkrtd|dd}|dk r!|dd}n|dd}|d	d}	t| jd
}
tj	|
tj
d}W d    n1 sFw   Y  t|d
}
|tj	|
tj
d || W d    n1 skw   Y  d }|rtt| j}| j| }tj|std| dn|| jvrtd| | j| }|	dkr|	}||d< t d|  t d|  tdt |d< ddlm} ||d< ttj|dd}
tj||
tjd d W d    n1 sw   Y  | D ]\}}t | d|  qt||\}}t dt| d t dt| d d d d}| jrEddlm} ||| jdd|d < ||| jdd|d!< t|| jrMdnd|j |d" |d# |d  |d$ d%}t|| jrgdnd|j |d" |d# |d! |d$ d%}t!|| j"| j#| j\}}}t$|| j"}t%|||||| j"||||||d& |d' |d( d)}|d ur|&| t d*| d z|'  W d S  t(t)fy } z,t j*|dd+ |+tjtj|d,d-|j, d. t d/|j, d0 W Y d }~d S d }~ww )1NzTRAIN HIFIGAN....r   r   r   r    r   TFr   r<   r=   r   rL   r   r   r   zresume from: r   r   r   r   r8   r   r   r   r  r   r  r  r  r  r  r  r  r  r  r  r  r  log_interval_steps)r  r   r  r  r  rW   r  r  r  r  r  r  r  r  r!  r"  r:   r$  r   r%  r&  )-ru   rv   r^   rk   r'  rl   rx   rr   ry   rz   r>   r   r   r   rc   rd   rw   r(  r)  r*  r+  r   rf   r,  r   r   r   rX   r.  r  r]   r   r  r   rW   r\   r   r   r/  r  r0  r1  r2  r3  r4  )r   r5  r6  r7  r8  r9  r:  r;  r<  r   r   r  r=  r   r   r>  r?  rC  rD  r  r  rE  rF  r   r  r  r  rG  rH  r   r   r   train_hifigan   s   












zVoice.train_hifiganc                 C   sR   | j  | js|   |   d| _W d    n1 sw   Y  | | |S )NT)r   r~   r   r   r   r   )r   r   r   r   r   forward  s   zVoice.forward)__name__
__module____qualname__r   ri   r   r   r   r   dictrI  rK  rL  r   r   r   r   r5   O   s*    
 	E
 )
 r5   )r   r   r   N)r   )(rc   picklepklr(  collectionsr   	threadingr   jsonr   r   r/   ry   kantts.datasets.datasetr   r   kantts.modelsr   kantts.train.lossr   kantts.train.trainerr   r	   r
    kantts.utils.ling_unit.ling_unitr   torch.utils.datar   "modelscope.utils.audio.audio_utilsr   %modelscope.utils.audio.tts_exceptionsr   r   modelscope.utils.loggerr   ru   r   r-   r4   r5   r   r   r   r   <module>   s6   

&	