o
    }o™i.  ã                   @   s   d dl mZ d dlZd dlmZ d dlm  mZ d dlm	Z	m
Z
mZ d dlmZ d dlmZmZmZ d dlmZmZmZmZ d dlmZ ddee fd	d
„ZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejejƒZ G dd„ deƒZ!G dd„ de!ƒZ"G dd„ dejƒZ#dS )é    )ÚOptionalN)ÚConditionalInputÚConditionalLayerNormÚ
LinearNorm)Úget_mask_from_lengths)ÚNeuralModuleÚadapter_mixinsÚ	typecheck)ÚEncodedRepresentationÚLengthsTypeÚMaskTypeÚ
TokenIndex)Ú
NeuralTypeÚmax_lenc                 C   s<   |d u r|   ¡ }tjd|| j| jd}t ||  d¡¡}|S )Nr   )ÚdeviceÚdtypeé   )ÚmaxÚtorchÚaranger   r   ÚltÚ	unsqueeze)Úlensr   ÚidsÚmask© r   ú\/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/modules/transformer.pyÚmask_from_lens   s
   r   c                       s&   e Zd Z‡ fdd„Zddd„Z‡  ZS )ÚPositionalEmbeddingc                    s>   t t| ƒ ¡  || _ddt d|d¡|   }|  d|¡ d S )Nr   i'  ç        g       @Úinv_freq)Úsuperr   Ú__init__Údembr   r   Úregister_buffer)Úselfr#   r    ©Ú	__class__r   r   r"   $   s   zPositionalEmbedding.__init__Nc                 C   sv   t  t  |d¡t  | jd¡¡}t j| ¡ | ¡ gdd}|d ur0|d d d …d d …f  |dd¡S |d d d …d d …f S )Néÿÿÿÿr   r   ©Údim)r   Úmatmulr   r    ÚcatÚsinÚcosÚrepeat)r%   Úpos_seqÚbszÚsinusoid_inpÚpos_embr   r   r   Úforward*   s
    zPositionalEmbedding.forward©N©Ú__name__Ú
__module__Ú__qualname__r"   r4   Ú__classcell__r   r   r&   r   r   #   s    r   c                       s6   e Zd Zdg f‡ fdd„	Zd	dd„Zd	dd„Z‡  ZS )
ÚPositionwiseConvFFFc                    sž   t t| ƒ ¡  || _|| _|| _t|ƒtur||f}t 	t 
|||d d|d d ¡t ¡ t 
|||d d|d d ¡t |¡¡| _t|||d| _|| _d S )Nr   r   é   ©Úcondition_dimÚcondition_types)r!   r;   r"   Úd_modelÚd_innerÚdropoutÚtypeÚtupleÚnnÚ
SequentialÚConv1dÚReLUÚDropoutÚCoreNetr   Ú
layer_normÚ	pre_lnorm)r%   r@   rA   Úkernel_sizerB   rL   r?   r&   r   r   r"   6   s   û
zPositionwiseConvFF.__init__Nc                 C   s   |   ||¡S r5   ©Ú_forward)r%   ÚinpÚconditioningr   r   r   r4   J   s   zPositionwiseConvFF.forwardc                 C   s‚   | j r"| dd¡}|  |  ||¡ |j¡¡}| dd¡}|| }|S | dd¡}|  |¡}| dd¡}|  || |¡ |j¡}|S )Nr   r<   )rL   Ú	transposerJ   rK   Útor   )r%   rP   rQ   Úcore_outÚoutputr   r   r   rO   M   s   
ù
zPositionwiseConvFF._forwardr5   ©r7   r8   r9   r"   r4   rO   r:   r   r   r&   r   r;   5   s    
r;   c                       s8   e Zd Zddg f‡ fdd„	Zd
dd„Zd
dd	„Z‡  ZS )ÚMultiHeadAttnçš™™™™™¹?Fc                    sŒ   t t| ƒ ¡  || _|| _|| _d|d  | _|| _t 	|d| | ¡| _
t |¡| _t |¡| _tj	|| |dd| _t|||d| _d S )Nr   g      à?é   F)Úbiasr=   )r!   rW   r"   Ún_headr@   Úd_headÚscalerL   rE   ÚLinearÚqkv_netrI   ÚdropÚdropattÚo_netr   rK   )r%   r[   r@   r\   rB   ra   rL   r?   r&   r   r   r"   c   s   zMultiHeadAttn.__init__Nc                 C   s   |   |||¡S r5   rN   )r%   rP   Ú	attn_maskrQ   r   r   r   r4   r   s   zMultiHeadAttn.forwardc                 C   sÒ  |}| j r|  ||¡}| j| j}}tj|  |¡ddd\}}}	| d¡}
| d¡}|
| }| |
|||¡}| |
|||¡}|	 |
|||¡}	| 	dddd¡ 
|||¡}| 	dddd¡ 
|||¡}|	 	dddd¡ 
|||¡}t || dd¡¡}| | j¡ |d ur¡| d¡ |j¡}| || d¡d¡}| | tj¡tdƒ ¡ tj|dd}|  |¡}t ||¡}| ||
||¡}| 	dddd¡ ¡  |
||| ¡}|  |¡}|  |¡}| j rß|| }|S |  || |¡}|S )NrY   r<   r)   r   r   Úinf)rL   rK   r[   r\   r   Úchunkr_   ÚsizeÚviewÚpermuteÚreshapeÚbmmrR   Úmul_r]   r   rS   r   r/   Úmasked_fill_ÚboolÚfloatÚFÚsoftmaxra   Ú
contiguousrb   r`   )r%   rP   rc   rQ   Úresidualr[   r\   Úhead_qÚhead_kÚhead_vÚs0Ús1Ús2ÚqÚkÚvÚ
attn_scoreÚ	attn_probÚattn_vecÚattn_outrU   r   r   r   rO   u   s@   


"

þzMultiHeadAttn._forward©NNrV   r   r   r&   r   rW   b   s    
rW   c                       s*   e Zd Zg f‡ fdd„	Zddd„Z‡  ZS )ÚTransformerLayerc           	         sJ   t t| ƒ ¡  t||||fd|i|¤Ž| _t||||| d¡|d| _d S )Nr?   rL   )rL   r?   )r!   r   r"   rW   Údec_attnr;   ÚgetÚpos_ff)	r%   r[   r@   r\   rA   rM   rB   r?   Úkwargsr&   r   r   r"   ª   s
   ÿzTransformerLayer.__init__Nc                 C   sR   | j || d¡ |d}||9 }|  ||¡}||9 }|  ¡ r'|  |¡}||9 }|S )Nr<   )rc   rQ   )r‚   Úsqueezer„   Úis_adapter_availableÚforward_enabled_adapters)r%   Údec_inpr   rQ   rU   r   r   r   r4   ²   s   
zTransformerLayer.forwardr€   r6   r   r   r&   r   r   ©   s    r   c                       sT   e Zd Zddg f‡ fdd„	Zedd„ ƒZedd„ ƒZeƒ dd
d„ƒZdd„ Z	‡  Z
S )ÚFFTransformerDecoderr   Fc                    s„   t t| ƒ ¡  || _|| _|| _t| jƒ| _t 	|	¡| _
t ¡ | _t|||ƒ| _t|ƒD ]}| j t||||||||
|d	¡ q,d S )N)ra   rL   r?   )r!   rŠ   r"   r@   r[   r\   r   r3   rE   rI   r`   Ú
ModuleListÚlayersr   Ú
cond_inputÚrangeÚappendr   )r%   Ún_layerr[   r@   r\   rA   rM   rB   ra   ÚdropembrL   r?   Ú_r&   r   r   r"   À   s.   
÷ÿÿzFFTransformerDecoder.__init__c                 C   s(   t dtƒ ƒt dtƒ ƒt dtƒ dddœS )N©ÚBÚTÚDr”   T©Úoptional)ÚinputÚseq_lensrQ   )r   r
   r   ©r%   r   r   r   Úinput_typesç   s   

ýz FFTransformerDecoder.input_typesc                 C   s   t dtƒ ƒt dtƒ ƒdœS )Nr“   )Úoutr   )r   r
   r   r›   r   r   r   Úoutput_typesï   s   

þz!FFTransformerDecoder.output_typesNc                 C   s   |   |t|ƒ d¡|¡S ©Nr<   )rO   r   r   )r%   r™   rš   rQ   r   r   r   r4   ö   s   zFFTransformerDecoder.forwardc                 C   sl   t j| d¡|jd |j¡}|  |¡| }|| }|  ||¡}|  |¡}| j	D ]	}||||d}q(||fS )Nr   ©r   )r   rQ   )
r   r   rf   r   rS   r   r3   r   r`   rŒ   )r%   rP   r   rQ   r0   r3   r   Úlayerr   r   r   rO   ú   s   

zFFTransformerDecoder._forwardr5   )r7   r8   r9   r"   Úpropertyrœ   rž   r	   r4   rO   r:   r   r   r&   r   rŠ   ¿   s    ô'

rŠ   c                       s@   e Zd Zdddddg f‡ fdd„	Zedd„ ƒZdd	d
„Z‡  ZS )ÚFFTransformerEncoderr   FNr   c                    sF   t t| ƒ |||||||||	|
|¡ || _tj||p|| jd| _d S )N)Úpadding_idx)r!   r£   r"   r¤   rE   Ú	EmbeddingÚword_emb)r%   r   r[   r@   r\   rA   rM   rB   ra   r‘   rL   Ún_embedÚd_embedr¤   r?   r&   r   r   r"   	  s   
õzFFTransformerEncoder.__init__c                 C   s   t dtƒ ƒt dtƒ dddœS )N)r”   r•   r“   Tr—   )r™   rQ   )r   r   r
   r›   r   r   r   rœ   +  s   
þz FFTransformerEncoder.input_typesc                 C   s    |   |  |¡|| jk d¡|¡S rŸ   )rO   r¦   r¤   r   )r%   r™   rQ   r   r   r   r4   2  s    zFFTransformerEncoder.forward)r   )r7   r8   r9   r"   r¢   rœ   r4   r:   r   r   r&   r   r£     s    ñ"
r£   c                       s8   e Zd Z									d‡ fdd	„	Zd
d„ Z‡  ZS )ÚFFTransformerr   é   é@   é   rY   rX   r   c                    s„   t t| ƒ ¡  || _|| _|| _|| _t| jƒ| _t	 
|
¡| _t	 ¡ | _t|ƒD ]}| j t|||||||	d¡ q(t||ƒ| _d S )N)ra   )r!   r©   r"   Úin_dimÚout_dimr[   r\   r   r3   rE   rI   r`   r‹   rŒ   rŽ   r   r   r   Údense)r%   r­   r®   Ún_layersr[   r\   rA   rM   rB   ra   r‘   r’   r&   r   r   r"   8  s   
ÿzFFTransformer.__init__c           	      C   s€   |  dd¡}t|ƒd }tj| d¡|jd |j¡}|  |¡| }|  	|| ¡}| j
D ]}|||d}q,|  |¡  dd¡}|S )Nr   r<   ).Nr    )r   )rR   r   r   r   rf   r   rS   r   r3   r`   rŒ   r¯   )	r%   r‰   Úin_lensrP   r   r0   r3   r   r¡   r   r   r   r4   V  s   
zFFTransformer.forward)	r   rª   r   r«   r¬   rY   rX   rX   r   r6   r   r   r&   r   r©   7  s    õr©   r5   )$Útypingr   r   Útorch.nnrE   Útorch.nn.functionalÚ
functionalro   Ú'nemo.collections.tts.modules.submodulesr   r   r   Ú(nemo.collections.tts.parts.utils.helpersr   Únemo.core.classesr   r   r	   Únemo.core.neural_types.elementsr
   r   r   r   Ú"nemo.core.neural_types.neural_typer   Úintr   ÚModuler   r;   rW   ÚAdapterModuleMixinr   rŠ   r£   r©   r   r   r   r   Ú<module>   s"   -GI/