o
    wiX                     @   s   d dl Z d dlmZmZ d dlmZmZ d dlmZm	Z	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZ d dlmZ dd Zd	d
 ZG dd de jje	jZG dd deZG dd dee	jZG dd deZ dS )    N)ConditionalInputConditionalLayerNorm)binarize_attention_parallelregulate_len)NeuralModuleadapter_mixins	typecheck)
EncodedRepresentationIndexLengthsTypeLogprobsTypeMelSpectrogramType	ProbsTypeRegressionValuesTypeTokenDurationType
TokenIndexTokenLogDurationType)
NeuralTypec                 C   s  t j|dd }t jj|d d d df d}t jjt j| dkddd}t jjt j| ddd}| \}}| d}|d d d d d f |||}	|d d d d d f |||}
t |d|
t |d|	 	 }t |d|
t |d|	 	 }t 
|dk||| }|S )N   )dim)r   r              )torchcumsumlongnn
functionalpadsizeexpandgatherfloatwhere)pitchdursdurs_cums_endsdurs_cums_startspitch_nonzero_cums
pitch_cumsbsl
n_formantsdcsdce
pitch_sumspitch_nelems	pitch_avg r2   c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/modules/fastpitch.pyaverage_featuresA   s    
    r4   c                 C   s*   t t | d ||}||d9 }|S )N      ?r   )r   clampexpsqueeze)log_durmin_durmax_durmaskdurr2   r2   r3   log_to_durationS   s   r>   c                       s0   e Zd Zdddg f fdd	Zd	ddZ  ZS )
ConvReLUNormr   r   i  c                    sJ   t t|   tjj||||d d| _t|||d| _tj	|| _
d S )Nr   kernel_sizepadding)condition_dimcondition_types)superr?   __init__r   r   Conv1dconvr   normDropoutdropout)selfin_channelsout_channelsrA   rK   rC   rD   	__class__r2   r3   rF   Z   s   zConvReLUNorm.__init__Nc                 C   s`   t jj| |}| |dd|dd}| |}|  r.| 	|dddd}|S )Nr   r   )
r   r   r   relurH   rI   	transposerK   is_adapter_availableforward_enabled_adapters)rL   signalconditioningoutr2   r2   r3   forward`   s   
zConvReLUNorm.forwardN)__name__
__module____qualname__rF   rX   __classcell__r2   r2   rO   r3   r?   Y   s    r?   c                       sH   e Zd ZdZdg f fdd	Zedd Zedd Zdd
dZ  Z	S )TemporalPredictorz2Predicts a single float per each temporal locationr   c                    s|   t t|   t|||| _tj | _t	|D ]}| j
t|dkr$|n||||||d qtjj|ddd| _|| _d S )Nr   )rA   rK   rC   rD   r   Tbias)rE   r^   rF   r   
cond_inputr   r   
ModuleListlayersrangeappendr?   Linearfcfilter_size)rL   
input_sizerh   rA   rK   n_layersrD   irO   r2   r3   rF   n   s    

zTemporalPredictor.__init__c                 C   s(   t dt t dt t dt dddS )NBTD)rm   rn   r   Toptional)encenc_maskrV   )r   r	   r   rL   r2   r2   r3   input_types   s   

zTemporalPredictor.input_typesc                 C   s   dt dt iS )NrW   )rm   rn   )r   r	   rt   r2   r2   r3   output_types   s   zTemporalPredictor.output_typesNc                 C   s\   |  ||}|| }|dd}| jD ]}|||d}q|dd}| || }|dS )Nr   r   rV   r   )ra   rR   rc   rg   r8   )rL   rr   rs   rV   rW   layerr2   r2   r3   rX      s   

zTemporalPredictor.forwardrY   )
rZ   r[   r\   __doc__rF   propertyru   rv   rX   r]   r2   r2   rO   r3   r^   k   s    

r^   c                       s   e Zd Z				d$dedededed	ed
edededededededededef fddZedd Zedd Z	dd Z
e dddddddddddddd Zdddddddd!d"d#Z  ZS )%FastPitchModuleP   r   K   Tencoder_moduledecoder_moduleduration_predictorpitch_predictorenergy_predictoralignerspeaker_encoder
n_speakerssymbols_embedding_dimpitch_embedding_kernel_sizeenergy_embedding_kernel_sizen_mel_channelsmin_token_durationmax_token_durationuse_log_energyc                    s  t    || _|| _|| _|| _|| _|| _|| _|d u| _	d| _
d| _|| _|dkr9|d u r9tj||	| _nd | _|| _|| _tjjd|	|
t|
d d d| _| jd uritjjd|	|t|d d d| _| dtd | dtd tjj| jj|dd| _d S )	NTFr   r   r@   
pitch_mean	pitch_stdr_   )rE   rF   encoderdecoderr   r   r   r   r   learn_alignmentuse_duration_predictorbinarizer   r   r   	Embeddingspeaker_embr   r   rG   int	pitch_emb
energy_embregister_bufferzerosrf   d_modelproj)rL   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rO   r2   r3   rF      sB   


zFastPitchModule.__init__c                 C   s   t dt t dt t dt t dt ddt dt ddt ddt dt ddt dt ddt dt ddt dt ddt dt ddt dt dddS )	Nrm   T_textrm   T_audioTrp   rm   rm   ro   T_spec)rm   r   r   )textr%   r$   energyspeakerpacespec
attn_priormel_lens
input_lensreference_specreference_spec_lens)r   r   r   r   r
   r   r   r   rt   r2   r2   r3   ru      s   


zFastPitchModule.input_typesc                 C   s~   t dt t dt t dt t dt t dt t dt t dt t dt t dt t dt t dt t dt dS )Nr   rm   r   )rm   Sr   r   r   )spect
num_framesdurs_predictedlog_durs_predictedpitch_predicted	attn_softattn_logprob	attn_hardattn_hard_durr$   energy_pred
energy_tgt)r   r   r   r   r   r   r   rt   r2   r2   r3   rv      s   











zFastPitchModule.output_typesc                 C   sZ   | j dur|  ||||d}|S | jdur)|du rtd| |d}|S d}|S )zspk_emb: Bx1xDNr   z7Please give speaker id to get lookup speaker embedding.)r   	unsqueezer   
ValueError)rL   
batch_sizer   r   r   spk_embr2   r2   r3   get_speaker_embedding   s   

z%FastPitchModule.get_speaker_embeddingNr5   )r%   r$   r   r   r   r   r   r   r   r   r   c          !      C   s  | j s| jr|d usJ |d usJ | j|jd |||d}| j||d\}}| j|||d}t|| j| j|d}d\}}}}| j rs|d urs| j	|}| j
||ddd|dk||d\}}t||
|	}|dd d dd d f }| j|||d}|d ur| j r|jd	 |jd	 krt|d|d}n| j st|d|d}| |d}n| |d}||dd }| jd ur| j|||dd	}|d ur| j rt|d|}nt|d|}| jrtd
| }| |}|d}n
| |d}d }||dd }nd }d }| j r&|d ur&t|||\}}n%|d u r9|d ur9t|||\}}n|d u rGt|||\}}ntd| j|||d\}}| |dd} | |||||||||||fS )Nr   r   r   r   r   inputrV   rw   r9   r:   r;   r<   )NNNNr   r   r   r5   zZSomething unexpected happened when 'spec' is not None and 'self.learn_alignment' is False.r   seq_lensrV   )r   trainingr   shaper   r   r>   r   r   word_embr   permuter   sumr   r4   r   r8   r   rR   r   r   r   logr   r   r   r   r   )!rL   r   r%   r$   r   r   r   r   r   r   r   r   r   r   enc_outrs   r   r   r   r   r   r   text_embr   r   r   r   r   len_regulateddec_lensdec_out_r   r2   r2   r3   rX     s   


zFastPitchModule.forward)r$   r   r   r   volumer   r   c                C   s~  | j |jd |||d}	| j||	d\}
}| j|
||	d}t|| j| j|d}| j|
||	d| }| |	d}|
|
dd }
| jd ur|d ure|jd |jd ks_J d	|jd  d
| |}n| j|
||	dd}| |	d}|
|
dd }
t||
|\}}d }|d urt||	d|\}}|d }| j|||	d\}}| |
dd}|tj|||||fS )Nr   r   r   rw   r   r   r   r   zenergy.shape[-1]: z != len(text)r   )r   r   r   r   r>   r   r   r   r   r   rR   r   r   r8   r   r"   r   r   tor   )rL   r   r$   r   r   r   r   r   r   r   r   rs   r   r   r   r   r   r   r   r   volume_extendedr   r   r   r2   r2   r3   infer  sF   
*
zFastPitchModule.infer)r|   r   r}   T)rZ   r[   r\   r   r   boolrF   rz   ru   rv   r   r   rX   r   r]   r2   r2   rO   r3   r{      s|    	
@

wr{   c                       s~   e Zd Z			ddededededed	ed
ededef fddZedd Zedd Ze	 ddddddddZ
  ZS )FastPitchSSLModuler|   r   r}   r~   r   r   r   r   r   r   r   r   c
           
         s   t    || _|| _|| _|| _|| _|	| _| jd ur-tj	j
d||t|d d d| _| dtd | dtd tj	j| jj|dd| _d S )Nr   r   r@   r   r   Tr_   )rE   rF   r   r   r   r   r   r   r   r   rG   r   r   r   r   rf   r   r   )
rL   r~   r   r   r   r   r   r   r   r   rO   r2   r3   rF     s"   

zFastPitchSSLModule.__init__c                 C   s>   t dt t dt t dt ddt dt ddt dddS )Nrl   r   Trp   r   r   rs   r%   r$   r   )r   r	   r   r   rt   r2   r2   r3   ru     s   

zFastPitchSSLModule.input_typesc                 C   sB   t dt t dt t dt t dt t dt t dt dS )Nr   rm   r   r   )r   r   r   r   r   r$   )r   r   r   r   r   rt   r2   r2   r3   rv     s   





zFastPitchSSLModule.output_typesNr5   r   c                C   s   d\}}| j d ur|  ||}t|| j| j|d}d }| jd urX| ||}|d urH|jd |jd kr?t|d|d}| 	|d}	n| 	|d}	||	
dd }|d uret|||\}
}n| j d usnJ dt|||\}
}| j|
|d\}}| |
dd}||||||fS )N)NNr   r   r   r   z9Duration predictor cannot be none if durs is not provided)r   r   )r   r>   r   r   r   r   r4   r   r8   r   rR   r   r   r   )rL   r   rs   r%   r$   r   r   r   r   r   r   r   r   r   r   r2   r2   r3   rX     s>   

zFastPitchSSLModule.forward)r|   r   r}   )rZ   r[   r\   r   r   rF   rz   ru   rv   r   rX   r]   r2   r2   rO   r3   r     s:    		
$
	

 r   )!r   'nemo.collections.tts.modules.submodulesr   r   (nemo.collections.tts.parts.utils.helpersr   r   nemo.core.classesr   r   r   nemo.core.neural_types.elementsr	   r
   r   r   r   r   r   r   r   r   "nemo.core.neural_types.neural_typer   r4   r>   r   ModuleAdapterModuleMixinr?   r^   r{   r   r2   r2   r2   r3   <module>   s   ,02  #