o
    wia                     @   s   d dl Z d dlm  mZ d dl mZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZmZ d dlmZmZmZ d dlmZmZ d dlmZmZmZmZmZ d dlmZ e jj d	d
 Z!e jj dd Z"dddZ#G dd dej$Z%G dd deeZ&dS )    N)nn)get_attribute_prediction_model)AffineTransformationLayerBiLSTMConvAttentionExponentialClassInvertible1x1ConvInvertible1x1ConvLUS
LinearNormget_radtts_encoder)get_mask_from_lengths
mas_width1regulate_len)
ExportableNeuralModule)IndexLengthsTypeMelSpectrogramTypeTokenDurationType
TokenIndex)
NeuralTypec                 C   s<   | j d |j d k r|j d | j d  }t| d|g} | S )Nr   )shapeFpad)durtxt_encto_pad r   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/modules/radtts.pypad_dur#   s   r    c                 C   s   t || jd  }|dkr%t|d  d|gd }t| d  d|gd } t ||jd  }|dkr>t|d  d|gd }| |fS )N   r   )intr   r   r   )
energy_avgf0max_out_lenr   r   r   r   pad_energy_avg_and_f0+   s   r&   Tc                 C   s   |dkrO|r!| |   | |  }}|| }| |  |9  < | S t| | \}}| | | j| jd} |dkr;|n|}| | | j| jd} | | d} | S )N        dtyper   )meanstdtorchstd_meantor)   masked_fill)r$   f0_meanf0_std
vmask_boolmusical_scalingf0_muf0_sigma	f0_factorr   r   r   	adjust_f07   s   r7   c                       s2   e Zd Z					d fdd	Zdd	d
Z  ZS )FlowStepsimple_convexp softplusFc	           	   	      sH   t t|   |dkrt|| _nt|| _t|||||||d| _d S )NLUS)affine_model
scaling_fnaffine_activationuse_partial_padding)superr8   __init__r	   invtbl_convr   r   
affine_tfn)	selfn_mel_channelsn_context_dimn_layersr>   r?   matrix_decompositionr@   rA   	__class__r   r   rC   G   s   
zFlowStep.__init__Nc                 C   sR   |r| j ||||d}| ||}|S | |\}}| j |||d\}}|||fS )Nseq_lens)rE   rD   )rF   zcontextinverserN   	log_det_Wlog_sr   r   r   forwardb   s   
zFlowStep.forward)r9   r:   r;   r<   F)FN)__name__
__module____qualname__rC   rT   __classcell__r   r   rK   r   r8   F   s    r8   c                       s   e Zd ZdZ																	d. fd
d	Zdd Zdd Zd/ddZdd Zd/ddZ	dd Z
d0ddZdd Z					d1ddZ					 				!	!			d2d"d#Zd3d$d%Zd&d' Zd(d) Zed*d+ Zed,d- Z  ZS )4RadTTSModuleaD  
    Takes model parameters (modelConfig) from config file to initialize radtts module.
    Specify the type of training in the include_modules parameter. "decatnvpred" for decoder training. and "decatnunvbiasdpmvpredapm" for feature training
    n_speakers (int): Number of speakers
    n_speaker_dim (int): number of speakers dimension
    n_text (int): Symbols embedding size
    n_text_dim (int):
    n_flows (int):
    n_conv_layers_per_step (int): number of convolution layers per step
    dummy_speaker_embedding (bool):
    include_modules (string): A string that describes what to train. "decatnvpred" for decoder training. and "decatnunvbiasdpmvpredapm" for feature training.
    scaling_fn (string): scaling function
    decoder_use_partial_padding (Bool): Set this to True to add partial padding
    learn_alignments (Bool): set this to true to learn alignments
    attn_use_CTC (Bool): set True to use CTC
    n_f0_dims (int): number of Pitch dimension
    n_early_size (int):
    n_early_every (int):
    n_group_size (int):
    decoder_use_unvoiced_bias (bool):
    context_lstm_w_f0_and_energy (bool):
    use_first_order_features (bool):
    ap_pred_log_f0 (bool):
    dur_model_config: model configuration for duration
    f0_model_config: model configuration for Pitch
    energy_model_config: model configuration for energy
    Ndecr:   r;   Fr<   Tr   c           %         sl  t t|   |	d dksJ || _|| _|| _|d | _|| _| jd dks)J tj	
|| j| _tj	
||| _tj	 | _t|d| _|| _|| _|| _|| _t|| _t|| _|| _|| _t|| _|d | _|| _|d | _d|v s}d|v r"| jrt|| j|| _|| _ || _!| j|| | |  } | jr| j||  }!t"| j||  d }"| jr|| | }!|!|9 }!|!| j7 }!| j||  } t#|!|"d	d
| _$| j!d	kr|d	f|dd	d| _%t	j&di | j%| _'g | _(|	| _)|| }t*| j D ]*}#|#dkr|#|
 dkr|| j)8 }| j(+|# | j+t,|| |||||| jd qd|v r2||d d< t-|| _.d| _/d| _0|d | _1| jsE| jrp|dv sLJ d| _/|dkrYt	2 }$n|dkrbt3 }$nt4d	 t	5t6|d	|$| _7| j1s}| j/s}d|v rd| _0| j0r||d d< t-|| _8| j1rtj	
d|| _9d| _:d|v r4||d d< ||d d< | jrd|d d< d|d d< d|d v r|d d d urd|d d d< d|d v r|d d d urd|d d d< n8d|d v r|d d d ur|d d |d d d< d|d v r(|d d d ur(|d d |d d d< t-|| _;t-|| _<d S d S )N   r   decoder_use_partial_padding)encoder_embedding_dimdecoder_use_unvoiced_biasap_use_unvoiced_biasatnrZ   r!   )
input_sizehidden_size
num_layers)kernel_sizestridepaddingdilation)r@   rA   dpmhparamsn_speaker_dimFap_use_voiced_embeddings>   r:   reluTrl   r:   vpred         ?apmn_in_dimspline_flow_paramsn_in_channelsr   )=rB   rY   rC   rG   	n_f0_dimsn_energy_avg_dimsr\   rj   r,   r   	Embeddingspeaker_embedding	embedding
ModuleListflowsr   encoderdummy_speaker_embeddinglearn_alignmentsr@   include_modulesboolattn_use_CTCuse_context_lstmcontext_lstm_normcontext_lstm_w_f0_and_energyuse_first_order_featuresr^   ap_pred_log_f0r_   r   	attentionn_flowsn_group_sizer"   r   context_lstmunfold_paramsUnfold
unfold_mod
exit_stepsn_early_sizerangeappendr8   r   dur_pred_layeruse_unvoiced_biasuse_vpred_modulerk   ReLUr   exit
Sequentialr
   unvoiced_bias_modulev_pred_modulev_embeddingsv_pred_thresholdf0_pred_moduleenergy_pred_module)%rF   
n_speakersrj   n_text
n_text_dimr   n_conv_layers_per_steprG   r|   r   n_early_everyr   r>   dur_model_configf0_model_configenergy_model_configv_model_configr~   r?   rJ   r}   r@   r   r   r   rt   ru   r   r   unvoiced_bias_activationr   kwargsn_flowstep_cond_dimsn_in_context_lstmn_context_lstm_hiddeniunvbias_nonlinrK   r   r   rC      s   "

















zRadTTSModule.__init__c                 C   s    | j r|d n|}| |}|S )Nr   )r|   rw   )rF   spk_idsspk_vecsr   r   r   encode_speaker=  s   
zRadTTSModule.encode_speakerc                 C   s.   |  |dd}| ||dd}||fS )Nr!   r[   )rx   	transposer{   )rF   textin_lenstext_embeddingstext_encr   r   r   encode_textB  s   zRadTTSModule.encode_textc           
      C   s0  | j dkr4| j||d}|d ur | j|d d d d d f |d}|d ur4| j|d d d d d f |d}|d dd|jd }t||fd}| jr{| jrf|d urZt||fd}|d urft||fd}|| j  }| |	dd|}	|		dd}| js|d urt||fd}|d urt||fd}|S )Nr!   assume_padded.Nr   r[   )
r   unfoldexpandr   r,   catr   r   r   r   )
rF   rP   speaker_vecsout_lensr$   r#   r   context_w_spkvecunfolded_out_lenscontext_lstm_padded_outputr   r   r   preprocess_contextI  s.   
  
zRadTTSModule.preprocess_contextc                 C   s:   |j \}}}||d| j|dd}||d|| j S )zInverse of the self.unfold() operation used for the
        grouping or "squeeze" operation on input

        Args:
            mel: B x C x T tensor of temporal data
        r   r[      )r   reshaper   r   )rF   melbdtr   r   r   foldi  s   zRadTTSModule.foldc                 C   sN   |r|j \}}}|||d| jdd}|||| j dS | |dS )zoperation used for the
        grouping or "squeeze" operation on input

        Args:
            mel: B x C x T tensor of temporal data
        r   r[   r   )r   r   r   r   r   	unsqueeze)rF   r   r   r   r   r   r   r   r   r   t  s
   zRadTTSModule.unfoldc           	   
   C   s   |j d }t E |j  }t|}t|D ]+}t||dd|| d|| f }tj	||
 d||dd|| d|| f< qW d   |S 1 sQw   Y  |S )zFor training purposes only. Binarizes attention with MAS. These will
        no longer receive a gradient
        Args:
            attn: B x 1 x max_mel_len x max_text_len
        r   Ndevice)r   r,   no_graddatacpunumpy
zeros_liker   r   tensor
get_device)	rF   attnr   r   b_sizeattn_cpuattn_outind	hard_attnr   r   r   binarize_attention  s   


$0
zRadTTSModule.binarize_attentionr!   c                 C   s   t j|t |ddd|f fdd}t jt |ddd|f |fdd}|dd|df | }||ddd| f  }|| d S )z?
        feats: b x max_length
        out_lens: b-dim
        Nr   r!   dimro   )r,   r   r   )rF   featsrg   feats_extended_Rfeats_extended_Ldfeats_Rdfeats_Lr   r   r   get_first_order_features  s
   ((z%RadTTSModule.get_first_order_featuresc           	      C   s   | d}| jjdddddf }| jjdddddf }| jjdddddf }| jjdddddf }t|| |d|   }dt|| |d|    }|| | S )z@
        text_enc: b x C x N
        voiced_mask: b x N
        r!   r   Nr[   r   rn   g?)r   r   weightr,   sigmoidtanh)	rF   r   voiced_maskvoiced_embedding_sunvoiced_embedding_svoiced_embedding_bunvoiced_embedding_bscalebiasr   r   r   apply_voice_mask_to_text  s   
z%RadTTSModule.apply_voice_mask_to_textc           ,      C   s  |  |}| ||\}}g g g }}}d }d| jv s!d| jv rQt|d  }| j||||||d\}}|rA| |||}|}n|}t||d	dd}nt
dd}|
 }| jrs| |ddd}|d	  }||d
 d| jv r| jdkr| |}| |||||
 | |	}g g g }}}|| j }t| jD ]8\}}|| jv r|d d d | jf }|| |d d | jd f }||||d\}}} ||  || q|| t|d}d }!d| jv r|d u r| |||}|dd d dd d f }"| t|t|t|" |}!d }#d }$d }%d| jv r|d u r1| |||}|r9| }&nt||d	dd}&| jre| t|&t|t|
|}%| j re| !|&|
}&| j"rst||
 | }'nt|}'t#|'| |'|< |'d }'|	d d }	| j$r| %|'}(| %|	})tj|'d d d f |(d d d f fdd}*tj|	d d d f |)d d d f fdd}	|*d }*|	d }	n|'d }*|	d }	| &|&t||*|}#| '|&t||	|}$||||!|#|$|%||||d}+|+S )Nr`   rZ   r   )key_lens
attn_priorr!   r[   ztSomething unexpected happened. Both 'atn' and 'dec' are not included in 'self.include_modules'. Please double-check.r   .r   r'   rM   rh   rp      r   r   ffffff?)z_mellog_det_W_list
log_s_listduration_model_outputsf0_model_outputsenergy_model_outputsvpred_model_outputs	attn_softr   r   attn_logprob)(r   r   r~   r   r   r   r,   bmmsqueezer   
ValueErrorr   r   r   permutemasked_fill_r   r   r   	enumeraterz   r   r   r   r   sumr   detachfloatcloner   r   rk   r   r_   logr   r   r   r   ),rF   r   speaker_idsr   r   r   r   r   r$   r#   r   r   r   r   r   r   r   	attn_hard	attn_maskr   r   r   rP   f0_biasvoiced_mask_boolr   z_outunfolded_seq_lensr   	flow_steprO   rR   rS   r   attn_hard_reducedr   r   r   text_enc_time_expanded	f0_targetdf0denergy_avg	f0_voicedoutputsr   r   r   rT     s   















**
zRadTTSModule.forwardffffff?d   r'   c           (      C   s  |j d }|d u r|j|ftjd|j d  }|j d }nt|}|d d d |f }| |}|d u r7|}|d u r=|}| |}| |}| ||\}}|	d u ro| jj|||d}	t	|	|}	|	d d df }	|	
d|}	|d u r{|||f}n
|d d d |f }t|	|dd|| j|d\}}tj|| jdd}t|}|dd |d u r| jr| jj|||d}t|d d df | jk}||	j}nd }n| }|}| jr| ||}d}| jr| |ddd}|d	  }|
d u r| ||||d d df }
t|
|||d
d}
|d u r"| |||d d df }t||
|\}}
|d urMt|	|d d d |f  d|| j|d\}}|!d| }| j"||||
| | |dd}|#|d| j t|} |dkrut$| | } t%| j&}!|!| j' }"| d d |"d f }#| d d d |"f } t(t)| j*D ]H\}$}%| j+|$ d }&|%|#|d|d}#|!dkr|&| j&|!d  kr|!d }!|!| j' }"| d d |"d f }'| d d d |"f } t,|'|#fd}#q| jdkr| -|#}#|#||	|
|dS )Nr   r(   r!   )lensr[   )
group_sizedur_lensfloor)rounding_moder   F)r3   r   Tr   P   r'   )rQ   rN   )r   r   r   r$   r#   ).r   new_onesr,   int64maxr   r   r   inferr    clampr   r   r   div
transpose_r   r   r   r   r.   r)   r   rk   r   r   r   r   infer_f0r7   infer_energyr&   r   r   r   	new_zerosnormallenr   r   r   reversedrz   r   r   r   )(rF   
speaker_idr   sigmaspeaker_id_textspeaker_id_attributespacetoken_duration_maxr   r   r$   r0   r1   r#   r   pitch_shift
batch_sizetxt_len_pad_removedspk_vecspk_vec_textspk_vec_attributesr   _txt_enc_time_expandedr   n_groupsr%   r
  ap_txt_enc_time_expandedr	  pitch_shift_spec_lenr   residualnum_steps_to_exitsplitr   r   r  	curr_stepresidual_to_addr   r   r   r   I  s   

















zRadTTSModule.inferc                 C   s   | j |||}| jr'| jr|d d ddd d f d }n|d }|d }n|d }|d }|d u r8|dk}nt|jdkrW|d d d f }|d d d d d |jd f }| jrdt|j|j	d	}|
| d |S )
Nr   r!   r   r[   r   i  r'   r   r(   )r   r   r   r   r(  r   r,   r:   r.   r)   r   )rF   r7  r3  r   r  r$   r   r   r   r$    s"    

 zRadTTSModule.infer_f0c                 C   s8   | j |||}| jr|d }n|d }|d d }|S )Nr   r   r!   r[   )r   r   r   )rF   r7  r3  r  energyr   r   r   r%    s   
zRadTTSModule.infer_energyc                 C   s   t |  j}|  D ]D\}}ztjj|dd td| W n   Y ztjj|dd td| W n   Y ztj	| td| W q   Y q| j
|d dS )zKRemoves spectral and weightnorms from model. Call before inference
        weight_hh_l0)namezRemoved spectral norm from {}weight_hh_l0_reversezRemoved wnorm from {}r   N)next
parametersr   named_modulesr   utilsremove_spectral_normprintformatremove_weight_normr.   )rF   devrB  moduler   r   r   remove_norms  s$   zRadTTSModule.remove_normsc                 C   s<   t dt t dt ddt dt t dt t dt dS )NBT_textrP  T)optional)r   r  r*  r,  r-  )r   r   r   r   rF   r   r   r   input_types
  s   



zRadTTSModule.input_typesc                 C   s$   t dt t dt t dt dS )N)rP  DT_specrP  rO  )spect
num_framesdurs_predicted)r   r   r   rS  r   r   r   output_types  s   


zRadTTSModule.output_types)NrZ   r:   r;   Fr<   TFNr   r   TFr;   F)F)r!   )FNNNN)r  NNNr  NNNr'   r'   NNN)NN)rU   rV   rW   __doc__rC   r   r   r   r   r   r   r   r   rT   r   r$  r%  rN  propertyrT  rZ  rX   r   r   rK   r   rY   m   sp    - 4
 


 !
 

	rY   )T)'r,   torch.nn.functionalr   
functionalr   7nemo.collections.tts.modules.attribute_prediction_modelr   #nemo.collections.tts.modules.commonr   r   r   r   r   r	   r
   r   (nemo.collections.tts.parts.utils.helpersr   r   r   nemo.core.classesr   r   nemo.core.neural_types.elementsr   r   r   r   r   "nemo.core.neural_types.neural_typer   jitscriptr    r&   r7   Moduler8   rY   r   r   r   r   <module>   s    (



'