o
    }oi
K                     @   s(  d dl Z d dlmZ d dlmZmZ d dlZd dlZd dlm	  m
Z d dlm	Z	 d dlmZ d dlmZmZ d dlmZmZmZmZmZ d dlmZ d	gZG d
d deZG dd deZG dd deZG dd deZG dd deZddedede fddZ!G dd	 d	eZ"G dd deZ#dS )    N)partial)DictOptional)nn)Module)NeuralModule	typecheck)BoolType	FloatTypeLengthsType
NeuralTypeSpectrogramType)loggingTransformerUNetc                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	LearnedSinusoidalPosEmbz?The sinusoidal Embedding to encode time conditional informationdimc                    sD   t    |d dkrtd| d|d }tt|| _d S )N   r   zInput dimension  is not divisible by 2!)super__init__
ValueErrorr   	Parametertorchrandnweights)selfr   half_dim	__class__ k/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/audio/parts/submodules/transformerunet.pyr   9   s
   
z LearnedSinusoidalPosEmb.__init__treturnc                 C   sF   t |d}|t | jd d tj }tj| | fdd}|S )z
        Args:
          t: input time tensor, shape (B)

        Return:
          fouriered: the encoded time conditional embedding, shape (B, D)
        zb -> b 1zd -> 1 dr   r   )	einops	rearranger   mathpir   catsincos)r   r!   freqs	fourieredr   r   r    forward@   s   zLearnedSinusoidalPosEmb.forward)
__name__
__module____qualname____doc__intr   r   Tensorr.   __classcell__r   r   r   r    r   6   s    r   c                       s>   e Zd ZdZd
dededee f fddZd
dd	Z  ZS )ConvPositionEmbedzDThe Convolutional Embedding to encode time information of each frameNr   kernel_sizegroupsc              	      s\   t    |d dkrtd| d|d u r|}ttj|||||d dt | _d S )Nr   r   zKernel size z is divisible by 2!)r8   padding)r   r   r   r   
SequentialConv1dGELU	dw_conv1d)r   r   r7   r8   r   r   r    r   Q   s   

zConvPositionEmbed.__init__c                 C   sV   |dur|d }| |d}t|d}| |}t|d}|dur)| |d}|S )z
        Args:
            x: input tensor, shape (B, T, D)

        Return:
            out: output tensor with the same shape (B, T, D)
        N).N        zb n c -> b c nzb c n -> b n c)masked_fillr%   r&   r=   )r   xmaskoutr   r   r    r.   ]   s   	
zConvPositionEmbed.forwardN)	r/   r0   r1   r2   r3   r   r   r.   r5   r   r   r   r    r6   N   s     r6   c                       s0   e Zd ZdZ fddZdejfddZ  ZS )RMSNormzThe Root Mean Square Layer Normalization

    References:
      - Zhang et al., Root Mean Square Layer Normalization, 2019
    c                    s*   t    |d | _tt|| _d S Ng      ?)r   r   scaler   r   r   onesgamma)r   r   r   r   r    r   {   s   

zRMSNorm.__init__r@   c                 C   s   t j|dd| j | j S )Nr#   r$   )F	normalizerF   rH   )r   r@   r   r   r    r.      s   zRMSNorm.forward)	r/   r0   r1   r2   r   r   r4   r.   r5   r   r   r   r    rD   t   s    rD   c                       sF   e Zd ZdZddedee f fddZdejdejfd	d
Z	  Z
S )AdaptiveRMSNormz
    Adaptive Root Mean Square Layer Normalization given a conditional embedding.
    This enables the model to consider the conditional input during normalization.
    Nr   cond_dimc                    s   t    |d u r|}|d | _t||| _t||| _tj| jj	 tj
| jj tj| jj	 tj| jj d S rE   )r   r   rF   r   Linearto_gammato_betainitzeros_weightones_bias)r   r   rL   r   r   r    r      s   

zAdaptiveRMSNorm.__init__r@   condc                 C   sN   t j|dd| j }| || |}}t|d}t|d}|| | S )Nr#   r$   zB D -> B 1 D)rI   rJ   rF   rN   rO   r%   r&   )r   r@   rU   normedrH   betar   r   r    r.      s
   zAdaptiveRMSNorm.forwardrC   )r/   r0   r1   r2   r3   r   r   r   r4   r.   r5   r   r   r   r    rK      s    rK   c                   @   s    e Zd ZdZdejfddZdS )GEGLUz#The GeGLU activation implementationr@   c                 C   s    |j ddd\}}t|| S )Nr   r#   r$   )chunkrI   gelu)r   r@   gater   r   r    r.      s   zGEGLU.forwardN)r/   r0   r1   r2   r   r4   r.   r   r   r   r    rX      s    rX      r>   r   multdropoutc              	   C   s@   t | | d d }tt| |d t t|t|| S )zn
    Return a Feed-Forward layer for the Transformer Layer.
    GeGLU activation is used in this FF layer
    r      )r3   r   r:   rM   rX   Dropout)r   r]   r^   	dim_innerr   r   r    get_feedforward_layer   s   ,rb   c                       s   e Zd ZdZ									d$d	ed
edededededededee dedee f fddZdedefddZ	e
deeef fddZe
deeef fddZe d%deej fddZd ed!efd"d#Z  ZS )&r   a>  
    Implementation of the transformer Encoder Model with U-Net structure used in
    VoiceBox and AudioBox

    References:
        Le et al., Voicebox: Text-Guided Multilingual Universal Speech Generation at Scale, 2023
        Vyas et al., Audiobox: Unified Audio Generation with Natural Language Prompts, 2023
       r\   r>   p  FNTr   depthheadsff_multattn_dropout
ff_dropoutmax_positionsadaptive_rmsnormadaptive_rmsnorm_cond_dim_inuse_unet_skip_connectionskip_connect_scalec                    s  t    |d dkrtd| dtg | _| j||d |r*|	du r*td|| _|	| _| jr:t	t
|	d}nt}|du rDd	| _n|| _t|D ]8}|d
 }|
oX||d k}| jt|rht|d |nd||dtj|||dd||dt|||dg qKt|| _td| jj td| td| td||  td| td| td| td| td| td| td|	 td|
 td| j dS )a4  
        Args:
            dim: Embedding dimension
            depth: Number of Transformer Encoder Layers
            heads: Number of heads in MHA
            ff_mult: The multiplier for the feedforward dimension (ff_dim = ff_mult * dim)
            attn_dropout: dropout rate for the MHA layer
            ff_dropout: droupout rate for the feedforward layer
            max_positions: The maximum time length of the input during training and inference
            adaptive_rmsnorm: Whether to use AdaptiveRMS layer.
                Set to True if the model has a conditional embedding in forward()
            adaptive_rms_cond_dim_in: Dimension of the conditional embedding
            use_unet_skip_connection: Whether to use U-Net or not
            skip_connect_scale: The scale of the U-Net connection.
        r   r   zNumber of layers r   )rj   rf   NzIadaptive_rmsnorm_cond_dim_in must be provided if adaptive_rmsnorm is True)rL   g;f?   r$   T)	embed_dim	num_headsr^   batch_first)r   r]   r^   Initialized %s withz	embedding dim:       %sz	Number of Layer:     %sz	feedforward dim:     %sz	number of heads:     %sz	Dropout rate of MHA: %sz	Dropout rate of FF:  %sz	maximun time length: %sz	use AdaptiveRMS:     %sz	Conditional  dim:    %sz	Use UNet connection: %sz	skip connect scale:  %s)r   r   r   r   
ModuleListlayers
init_alibirk   rl   r   rK   rD   rn   rangeappendrM   MultiheadAttentionrb   
final_normr   debugr   r/   )r   r   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   rmsnorm_classindlayerhas_skipr   r   r    r      s^   

zTransformerUNet.__init__c                 C   s   dd }t | std td|  tt||d| _	dt
t
|dt
|d   }t|d	}| jd
|dd dS )zInitialize the Alibi bias parameters

        References:
          - Press et al., Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation, 2021
        c                 S   s    dd|   }|t d| d  S )Nr   iro   )r   arange)nratior   r   r    
get_slopes"  s   z.TransformerUNet.init_alibi.<locals>.get_slopeszYIt is recommend to set number of attention heads to be the power of 2 for the Alibi bias!zCurrent value of heads: z
B -> B 1 1r#   r   ro   zT1 T2 -> 1 T1 T2
pos_matrixF)
persistentN)r'   log2
is_integerr   warningr   r   r%   r&   slopesr   absr   	unsqueezefloatregister_buffer)r   rj   rf   r   r   r   r   r    rv     s   ,zTransformerUNet.init_alibir"   c                 C   s,   t dt t dt ddt dt dddS )+Returns definitions of module output ports.BTD)r   r   Toptional)r   r   r@   key_padding_maskadaptive_rmsnorm_cond)r   r
   r	   r   r   r   r    input_types4     
zTransformerUNet.input_typesc                 C   s   dt dt iS )r   outputr   )r   r
   r   r   r   r    output_types=  s   zTransformerUNet.output_typesr   c              	   C   s  |j ^}}}g }| j||d}t }	|durt|d}	| jD ]^\}
}}}}|
du r/|| n| | j }tj||fdd}|
|}||fi |	}|dur\|	 }|
|t	d}nd}|||||d|d\}}|| }||fi |	}||| }q| |S )	a  Forward pass of the model.

        Args:
            input: input tensor, shape (B, C, D, T)
            key_padding_mask: mask tensor indicating the padding parts, shape (B, T)
            adaptive_rmsnorm_cond: conditional input for the model, shape (B, D)
        )
batch_sizeseq_lenN)rU   r#   r$   z-infF)querykeyvaluer   need_weights	attn_mask)shapeget_alibi_biasdictru   rx   poprn   r   r)   r   r?   rz   )r   r@   r   r   r   r   _skip_connects
alibi_biasrmsnorm_kwargsskip_combinerattn_prenormattn
ff_prenormffskip_connect
attn_inputfloat_key_padding_maskattn_outputff_inputr   r   r    r.   D  s:   	


zTransformerUNet.forwardr   r   c                 C   s8   | j ddd|d|f }|| j }||dd}|S )zK
        Return the alibi_bias given batch size and seqence length
        Nro   )r   r   repeat)r   r   r   r   r   r   r   r    r   u  s   
zTransformerUNet.get_alibi_bias)	rc   r\   r>   r>   rd   FNTNNN)r/   r0   r1   r2   r3   r   boolr   r   rv   propertyr   strr   r   r   r   r   r4   r.   r   r5   r   r   r   r    r      s\    	
W
0c                       s   e Zd ZdZ												
		
	d(dededededededededededee dedee dee f fddZe	de
eef fddZe	de
eef fd d!Zed"ejd#efd$d%Ze d)d&d'Z  ZS )*SpectrogramTransformerUNeta:  This model handles complex-valued inputs by stacking real and imaginary components.
    Stacked tensor is processed using TransformerUNet and the output is projected to generate real
    and imaginary components of the output channels.

    Convolutional Positional Embedding is applied for the input sequence
    ro               r\   r>   rd   N   Tin_channelsout_channelsfreq_dimr   re   rf   rg   ri   rh   rj   time_hidden_dimconv_pos_embed_kernel_sizeconv_pos_embed_groupsrk   c                    s   t    || _|| _|| d }|d u r|d }t||| _|r2tt|t||t	 | _
t|||d| _t||||||	|
||dd
| _|| d }t||| _td| jj td| j td| j td	| d S )
Nr   r\   )r   r7   r8   T)
r   re   rf   rg   ri   rh   rj   rk   rl   rm   rs   z	in_channels:  %sz	out_channels: %sz	Input frequency dimension: %s)r   r   r   r   r   rM   proj_inr:   r   SiLUsinu_pos_embr6   
conv_embedr   transformerunetproj_outr   r{   r   r/   )r   r   r   r   r   re   rf   rg   ri   rh   rj   r   r   r   rk   dim_indim_outr   r   r    r     s<   
 z#SpectrogramTransformerUNet.__init__r"   c                 C   s,   t dt t dt ddt dt dddS )r   r   Cr   r   r   Tr   )inputinput_length	condition)r   r   r   r
   r   r   r   r    r     r   z&SpectrogramTransformerUNet.input_typesc                 C   s   t dt t dt dddS )r   r   r   Tr   )r   output_length)r   r   r   r   r   r   r    r     s   
z'SpectrogramTransformerUNet.output_typesr   
max_lengthc                 C   s0   t |t| || j}|| dk}|S )af  
        Return the self_attention masking according to the input length.
        0 indicates the frame is in the valid range, while 1 indicates the frame is a padding frame.
        Args:
          input_length: shape (B)
          max_length (int): The maximum length of the input sequence

        return:
          key_padding_mask: shape (B, T)
        ro   )r   r   expandlentodevicer   )r   r   r   r   r   r    _get_key_padding_mask  s   z0SpectrogramTransformerUNet._get_key_padding_maskc                 C   s   |j \}}}}|| jkrtd| d| j tj|j|jgdd}t|d}| 	|}	| j
||d}
| j|	|
d|	 }	|du rDd}n| |}| j|	|
|d	}	| |	}tj|d
| jd|d}t| }||fS )a4  Forward pass of the model.

        Args:
            input: input tensor, shape (B, C, D, T)
            input_length: length of the valid time steps for each example in the batch, shape (B,)
            condition: scalar condition (time) for the model, will be embedded using `self.time_embedding`
        zUnexpected input channel size z, expected r   r$   zB C RI D T -> B T (C RI D))r   )rA   Nr   zB T (C RI D) -> B C D T RI)r   RIr   )r   r   RuntimeErrorr   stackrealimagr%   r&   r   r   r   r   r   r   r   view_as_complex
contiguous)r   r   r   r   r   C_inr   r   input_real_imagr@   r   time_embr   r   r   r    r.     s    




z"SpectrogramTransformerUNet.forward)ro   ro   r   r   r   r   r\   r>   r>   rd   Nr   NTr   )r/   r0   r1   r2   r3   r   r   r   r   r   r   r   r   r   r   staticmethodr   r4   r   r   r.   r5   r   r   r   r    r     sj    		
8r   )r\   r>   )$r'   	functoolsr   typingr   r   r%   r   torch.nn.functionalr   
functionalrI   torch.nnr   nemo.core.classesr   r   nemo.core.neural_typesr	   r
   r   r   r   
nemo.utilsr   __all__r   r6   rD   rK   rX   r3   r   rb   r   r   r   r   r   r    <module>   s*   $&!	 K