o
    wiG                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZm	Z	m
Z
 d dlmZ d dlmZmZ d dlmZmZmZmZmZ d dlmZ d d	lmZ G d
d deZG dd deZG dd deZdS )    N)Variable)
functional)	AttentionConvNorm
LinearNormPrenet)get_mask_from_lengths)NeuralModule	typecheck)EmbeddedTextTypeLengthsType
LogitsTypeMelSpectrogramTypeSequenceToSequenceAlignmentType)
NeuralType)loggingc                       sP   e Zd Zdededef fddZedd Zedd	 Ze d
d Z	  Z
S )Encoderencoder_n_convolutionsencoder_embedding_dimencoder_kernel_sizec                    s   t    g }t|D ]!}tjt|||dt|d d dddtj|}|	| qtj
|| _tjj|t|d dddd| _dS )ax  
        Tacotron 2 Encoder. A number of convolution layers that feed into a LSTM

        Args:
            encoder_n_convolutions (int): Number of convolution layers.
            encoder_embedding_dim (int): Final output embedding size. Also used to create the convolution and LSTM layers.
            encoder_kernel_size (int): Kernel of the convolution front-end.
              relukernel_sizestridepaddingdilationw_init_gainT)batch_firstbidirectionalN)super__init__rangetorchnn
Sequentialr   intBatchNorm1dappend
ModuleListconvolutionsLSTMlstm)selfr   r   r   r+   _
conv_layer	__class__ c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/modules/tacotron2.pyr"   "   s(   

	zEncoder.__init__c                 C   s   t dt t dt dS )NBDTr6   )token_embedding	token_len)r   r   r   r.   r3   r3   r4   input_typesD   s   

zEncoder.input_typesc                 C      dt dt iS )Nencoder_embeddingr6   r8   r7   )r   r   r;   r3   r3   r4   output_typesK      zEncoder.output_typesc                C   s   | j D ]}tt||d| j}q|dd}|  }tj	j
jj||ddd}| j  | |\}}tj	j
jj|dd\}}|S )N      ?r   r   TF)r   enforce_sorted)r   )r+   Fdropoutr   training	transposecpunumpyr$   r%   utilsrnnpack_padded_sequencer-   flatten_parameterspad_packed_sequence)r.   r9   r:   convinput_lengthsoutputsr/   r3   r3   r4   forwardQ   s   


zEncoder.forward)__name__
__module____qualname__r'   r"   propertyr<   r@   r
   rR   __classcell__r3   r3   r1   r4   r   !   s    "

r   c                       s   e Zd Z	d'dededededededed	ed
ededededededef fddZedd Zedd Z	e
 dd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Z  ZS )(DecoderrB   n_mel_channelsn_frames_per_stepr   attention_dimattention_location_n_filtersattention_location_kernel_sizeattention_rnn_dimdecoder_rnn_dim
prenet_dimmax_decoder_stepsgate_thresholdp_attention_dropoutp_decoder_dropoutearly_stoppingprenet_p_dropoutc                    s   t    || _|| _|| _|| _|| _|	| _|
| _|| _	|| _
|| _|| _t|| |	|	g|| _tj|	| || _t|||||| _tj|| |d| _t|| || | _t|| dddd| _dS )a  
        Tacotron 2 Decoder. Consists of a 2 layer LSTM, one of which interfaces with the attention mechanism while the
        other is used as a regular LSTM. Includes the prenet and attention modules as well.

        Args:
            n_mel_channels (int): Number of mel channels to output
            n_frames_per_step (int): Number of spectrogram frames to predict per decoder step.
            encoder_embedding_dim (int): The size of the output from the encoder.
            attention_dim (int): The output dimension of the attention layer.
            attention_location_n_filters (int): Channel size for the convolution used the attention mechanism.
            attention_location_kernel_size (int): Kernel size for the convolution used the attention mechanism.
            attention_rnn_dim (int): The output dimension of the attention LSTM layer.
            decoder_rnn_dim (int): The output dimension of the second LSTM layer.
            prenet_dim (int): The output dimension of the prenet.
            max_decoder_steps (int): For evaluation, the max number of steps to predict.
            gate_threshold (float): At each step, tacotron 2 predicts a probability of stopping. Rather than sampling,
                this module checks if predicted probability is above the gate_threshold. Only in evaluation.
            p_attention_dropout (float): Dropout probability on the attention LSTM.
            p_decoder_dropout (float): Dropout probability on the second LSTM.
            early_stopping (bool): In evaluation mode, whether to stop when all batches hit the gate_threshold or to
                continue until max_decoder_steps.
            prenet_p_dropout (float): Dropout probability for prenet. Note, dropout is on even in eval() mode.
                Defaults to 0.5.
        r   Tsigmoid)biasr   N)r!   r"   rY   rZ   r   r^   r_   r`   ra   rb   rc   rd   re   r   prenetr$   r%   LSTMCellattention_rnnr   attention_layerdecoder_rnnr   linear_projection
gate_layer)r.   rY   rZ   r   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   r1   r3   r4   r"   h   s4   
*zDecoder.__init__c                 C   s4   t dt t dt d}| jrt dt |d< |S )Nr?   r6   )memorymemory_lengthsr5   decoder_inputs)r   r   r   rF   r   )r.   
input_dictr3   r3   r4   r<      s   

zDecoder.input_typesc                 C   s>   t dt t dt t dt d}| jst dt |d< |S )Nr5   )r6   r8   )r6   r8   r8   )mel_outputsgate_outputs
alignmentsr6   mel_lengths)r   r   r   r   rF   r   )r.   output_dictr3   r3   r4   r@      s   


zDecoder.output_typesc                 O   s&   | j r| jdi |S | jdi |S )Nr3   )rF   train_forwardinfer)r.   argskwargsr3   r3   r4   rR      s   zDecoder.forwardc                 C   s,   | d}t|j|| j| j  }|S )Nr   )sizer   datanewrY   rZ   zero_)r.   rp   r6   decoder_inputr3   r3   r4   get_go_frame   s   
zDecoder.get_go_framec                 C   s   | d}| d}t|j|| j | _t|j|| j | _t|j|| j | _	t|j|| j | _
t|j|| | _t|j|| | _t|j|| j | _|| _| j|| _|| _d S )Nr   r   )r}   r   r~   r   r^   r   attention_hiddenattention_cellr_   decoder_hiddendecoder_cellattention_weightsattention_weights_cumr   attention_contextrp   rl   memory_layerprocessed_memorymask)r.   rp   r   r6   MAX_TIMEr3   r3   r4   initialize_decoder_states   s   


z!Decoder.initialize_decoder_statesc                 C   s@   | dd}||dt|d| j d}| dd}|S )Nr   r   r   )rG   viewr}   r'   rZ   )r.   rr   r3   r3   r4   parse_decoder_inputs   s   zDecoder.parse_decoder_inputsc                 C   st   t |dd}t |ddd}| }t |dd }||dd| j}|dd}|||fS )Nr   r   r   r   )r$   stackrG   squeeze
contiguousr   r}   rY   )r.   rt   ru   rv   r3   r3   r4   parse_decoder_outputs   s   
zDecoder.parse_decoder_outputsc                 C   s  t || jfd}| || j| jf\| _| _t| j| j| j	| _t j| j
d| jdfdd}| | j| j| j|| j\| _| _
|  j| j
7  _t | j| jfd}| || j| jf\| _| _t| j| j| j	| _t j| j| jfdd}| |}| |}||| j
fS )Nr   r   dim)r$   catr   rk   r   r   rD   rE   rc   rF   r   	unsqueezer   rl   rp   r   r   rm   r   r   rd   rn   ro   )r.   r   
cell_inputattention_weights_cat decoder_hidden_attention_contextdecoder_outputgate_predictionr3   r3   r4   decode   s*   

zDecoder.decodec                C   s   |  |d}| |}tj||fdd}| |}| j|t| d g g g }}}t||	dd k rc|t| }| 
|\}}	}
||dg7 }||	g7 }||
g7 }t||	dd k s8| |||\}}}|||fS )Nr   r   r   r   )r   r   r   r$   r   ri   r   r   lenr}   r   r   r   )r.   rp   rr   rq   r   rt   ru   rv   
mel_outputgate_outputr   r3   r3   r4   ry     s   




zDecoder.train_forwardc                C   sd  |  |}|ddkrt| }nd }| j||d tj|dgtjd|j}tj	|dgtjd|j}g g g }}}	d}
	 | j
|dd}| |\}}}tt|j| jtjd}|| }||7 }| jr|t|dkr||
r|n&d}
||dg7 }||g7 }|	|g7 }	t|| jkrtd| j n|}qF| |||	\}}}	|||	|fS )	Nr   r   r   )dtypeFT)	inferencezReached max decoder steps %d.)r   r}   r   r   r$   zerosint32todeviceonesri   r   lerg   r~   rb   r   re   sumr   ra   r   warningr   )r.   rp   rq   r   r   rw   not_finishedrt   ru   rv   steppedr   r   	alignmentdecr3   r3   r4   rz   1  s8   
  $

zDecoder.inferrB   )rS   rT   rU   r'   floatboolr"   rV   r<   r@   r
   rR   r   r   r   r   r   ry   rz   rW   r3   r3   r1   r4   rX   g   s\    	
K
	



rX   c                       s\   e Zd Z	ddededededef
 fddZed	d
 Zedd Ze	 dd Z
  ZS )PostnetrB   rY   postnet_embedding_dimpostnet_kernel_sizepostnet_n_convolutions	p_dropoutc                    s   t    tj | _| jtjt|||dt	|d d dddtj
| td|d D ] }| jtjt|||dt	|d d dddtj
| q0| jtjt|||dt	|d d dddtj
| || _dS )aW  
        Tacotron 2 Postnet. A convolutional network with postnet_n_convolutions number of layers. Each layer has a
        kernel of postnet_kernel_size. Each layer apart from the last outputs postnet_embedding_dim channels, the last
        outputs n_mel_channels channels. After each layer is a dropout layer with p_dropout% drop. The last linear has
        no activation, all intermediate layers have tanh activation.

        Args:
            n_mel_channels (int): Number of mel channels to output from Posnet.
            postnet_embedding_dim (int): Number of channels to output from the intermediate layers.
            postnet_kernel_size (int): The kernel size for the convolution layers.
            postnet_n_convolutions (int): The number of convolutions layers.
            p_dropout (float): Dropout probability. Defaults to 0.5.
        r   r   tanhr   linearN)r!   r"   r$   r%   r*   r+   r)   r&   r   r'   r(   r#   r   )r.   rY   r   r   r   r   r/   r1   r3   r4   r"   ]  s\   

	
	
	
zPostnet.__init__c                 C   r=   Nmel_specr5   r   r   r;   r3   r3   r4   r<     rA   zPostnet.input_typesc                 C   r=   r   r   r;   r3   r3   r4   r@     rA   zPostnet.output_typesc                C   sb   |}t t| jd D ]}tt| j| || j| j}qt| jd || j| j}|| S )Nr   r   )	r#   r   r+   rD   rE   r$   r   r   rF   )r.   r   mel_spec_outir3   r3   r4   rR     s
   $zPostnet.forwardr   )rS   rT   rU   r'   r   r"   rV   r<   r@   r
   rR   rW   r3   r3   r1   r4   r   \  s&    G

r   )r$   torch.autogradr   torch.nnr   rD   'nemo.collections.tts.modules.submodulesr   r   r   r   (nemo.collections.tts.parts.utils.helpersr   nemo.core.classesr	   r
   nemo.core.neural_types.elementsr   r   r   r   r   "nemo.core.neural_types.neural_typer   
nemo.utilsr   r   rX   r   r3   r3   r3   r4   <module>   s   F v