import math
import random
from typing import Dict, List, Tuple

import torch
from omegaconf import DictConfig
from torch import nn
from torch.nn import functional as F

from nemo.collections.common.parts import form_attention_mask, transformer_weights_init
from nemo.collections.nlp.modules.common.transformer import TransformerEncoder
from nemo.core.classes.module import NeuralModule
from nemo.core.neural_types import (
    AcousticEncodedRepresentation,
    AudioSignal,
    LengthsType,
    NeuralType,
    SpectrogramType,
)


class TransposeLast(nn.Module):
    """
    Transposes last dimension. Useful for adding to a sequential block.
    """

    def forward(self, x):
        return x.transpose(-2, -1)


class SamePad(nn.Module):
    def __init__(self, kernel_size):
        super().__init__()
        # An even kernel produces one extra frame of right padding; drop it in forward.
        self.remove = kernel_size % 2 == 0

    def forward(self, x):
        if self.remove:
            x = x[:, :, :-1]
        return x


class ConvFeatureEncoder(NeuralModule):
    """
    Encoder used to isolate features in raw audio for Wav2Vec style training.
    Treated as preprocessor module in NeMo ASR training. Default values are
    for the base model found in Baevski et al. (https://arxiv.org/abs/2006.11477),
    save for use of layer normalization as the default schema. (Chosen for stability.)
    """

    @property
    def input_types(self):
        """Returns definitions of module input ports.
        input_signal:
            0: AxisType(BatchTag)
            1: AxisType(TimeTag)
        input_signal_length:
            0: AxisType(BatchTag)
        Note: length is in number of samples, not seconds
        """
        return {
            "input_signal": NeuralType(('B', 'T'), AudioSignal(freq=self._sample_rate)),
            "length": NeuralType(tuple('B'), LengthsType()),
        }

    @property
    def output_types(self):
        """Returns definitions of module output ports.
        For compatibility, processed features are treated as Spectrogram types
        processed_signal:
            0: AxisType(BatchTag)
            1: AxisType(ChannelTag)
            2: AxisType(ProcessedTimeTag)
        processed_signal_length:
            0: AxisType(BatchTag)
        """
        return {
            "processed_signal": NeuralType(('B', 'C', 'T'), SpectrogramType()),
            "processed_signal_length": NeuralType(tuple('B'), LengthsType()),
        }

    def __init__(
        self,
        conv_layers: List[Dict[str, int]],
        extractor_mode: str = "layer_norm",
        conv_bias: bool = False,
        feature_grad_mult=0.6,
        normalize_audio=True,
        embedding_dim=768,
    ):
        super().__init__()

        def block(n_in, n_out, k, stride, is_layer_norm=False, is_group_norm=False, conv_bias=False):
            def make_conv():
                conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
                nn.init.kaiming_normal_(conv.weight)
                return conv

            assert (is_layer_norm and is_group_norm) is False, "layer norm and group norm are exclusive"

            if is_layer_norm:
                return nn.Sequential(
                    make_conv(),
                    nn.Sequential(TransposeLast(), nn.LayerNorm(n_out, elementwise_affine=True), TransposeLast()),
                    nn.GELU(),
                )
            elif is_group_norm:
                return nn.Sequential(make_conv(), nn.GroupNorm(n_out, n_out, affine=True), nn.GELU())
            else:
                return nn.Sequential(make_conv(), nn.GELU())

        self.grad_mult = feature_grad_mult
        self.normalize_input = normalize_audio
        self.layer_cfg = conv_layers
        self.conv_layers = nn.ModuleList()
        self.mode = extractor_mode

        in_d = 1
        for i, cl in enumerate(conv_layers):
            assert len(cl) == 3, "invalid conv definition: " + str(cl)
            dim, k, stride = cl['emb_dim'], cl['kernel_size'], cl['stride']

            self.conv_layers.append(
                block(
                    in_d,
                    dim,
                    k,
                    stride,
                    is_layer_norm=self.mode == "layer_norm",
                    is_group_norm=self.mode == "group_norm" and i == 0,
                    conv_bias=conv_bias,
                )
            )
            in_d = dim

        # Project convolutional features into the transformer embedding space if the dims differ
        final_conv_dim = self.layer_cfg[-1]['emb_dim']
        self.post_extract_proj = (
            nn.Linear(final_conv_dim, embedding_dim) if final_conv_dim != embedding_dim else None
        )
        self.layer_norm = nn.LayerNorm(embedding_dim)

    def apply_layers(self, x):
        for conv in self.conv_layers:
            x = conv(x)
        return x

    def normalize(self, source, lengths):
        with torch.no_grad():  # Normalize only the un-padded portion of each example
            for i in range(lengths.size(0)):
                orig = source[i, : lengths[i]]
                norm = F.layer_norm(orig, orig.shape)
                source[i, : lengths[i]] = norm
        return source

    def forward(self, input_signal, length):
        if self.normalize_input:
            input_signal = self.normalize(input_signal, length)

        # B, T -> B, C, T
        processed_signal = input_signal.unsqueeze(1)

        if self.grad_mult > 0:
            processed_signal = self.apply_layers(processed_signal)
            if self.grad_mult != 1.0:
                processed_signal = GradMultiply.apply(processed_signal, self.grad_mult)
        else:
            with torch.no_grad():  # 0 indicates a frozen feature encoder
                processed_signal = self.apply_layers(processed_signal)

        processed_signal = processed_signal.transpose(1, 2)  # B, C, T -> B, T, C

        if self.post_extract_proj is not None:
            processed_signal = self.post_extract_proj(processed_signal)

        if self.mode == "layer_norm":
            processed_signal = self.layer_norm(processed_signal)

        processed_signal = processed_signal.transpose(1, 2)  # B, T, C -> B, C, T

        processed_signal_length = self.get_lengths(audio_lengths=length)

        return processed_signal, processed_signal_length

    def get_lengths(self, audio_lengths):
        # Each conv layer downsamples time as floor((length - kernel) / stride) + 1
        for conv in self.layer_cfg:
            kernel = conv['kernel_size']
            stride = conv['stride']
            audio_lengths = torch.div(audio_lengths - kernel, stride, rounding_mode='floor') + 1
        return audio_lengths


class Wav2VecTransformerEncoder(TransformerEncoder):
    """
		Encoder module following Transformer encoder paradigm 
		as described in Vaswani et al. (https://arxiv.org/abs/1706.03762). Used for Wav2Vec
		style encoding of context vectors as described in Baevski et al. (https://arxiv.org/abs/2006.11477).
		Takes convolutional encodings of all time steps and adds to features before applying series
		of self-attention layers. 
		
		Example configs may be found at: https://github.com/NVIDIA/NeMo/tree/main/examples/asr/conf/wav2vec

		Args:
			layer_drop: Floating point value specifying the probability of dropping each transformer layer (LayerDrop; see Fan et al. https://arxiv.org/pdf/1909.11556.pdf).
				If non-zero, each layer draws from a uniform distribution on every forward call to determine whether it is applied.
				Occurs only during the training step.
			pos_embed: Config specifying parameters for contextual embedding convolutions. Module configures convolutional padding
				to maintain number of time steps
				Must contain the following:
					embedding_dim: Depth/number of channels of each time step from feature encoding 
					conv_pos: Kernel size for convolution
					conv_pos_groups: Number of groups for convolution
			transformer: Config for transformer encoder. Uses self-attention layers found in: nemo.collections.nlp.modules.common.transformer
				Must contain the following:
					num_layers: Number of attention layers 
					hidden_size: Expected input depth (embedding size between model layers)
					inner_size: Depth of embeddings within feed-forward sections of encoder layers
					num_attention_heads: Number of attention heads
					attn_score_dropout: Probability of dropout applied to attention scores
					attn_layer_dropout: Probability of dropout applied to the output of the attention layers (prior to normalization)
					ffn_dropout: Probability of dropout applied to feed-forward modules
					hidden_act: Activation function for hidden layers
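		Example config (illustrative values only, not prescribed defaults; see the example configs linked above for tested settings):
			pos_embed:
				embedding_dim: 768
				conv_pos: 128
				conv_pos_groups: 16
			transformer:
				num_layers: 12
				hidden_size: 768
				inner_size: 3072
				num_attention_heads: 12
				attn_score_dropout: 0.1
				attn_layer_dropout: 0.1
				ffn_dropout: 0.1
				hidden_act: gelu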
            	pos_embedtransformer
layer_dropc                    s   t  jdi | |j}tj|||j|jd |jd| _|| _|j	| _
tdd| j
  |j|j  }tjj| jjd|d tj| jjd tjj| jddd| _t| jt|jt | _t|| _| d	d
  d S )Nr    )r$   paddinggroups   r;   r   )meanstdrE   )namerU   c                 S   s   t | ddS )NF)xavier)r	   )r   r   r   r   <lambda>  s    z4Wav2VecTransformerEncoder.__init__.<locals>.<lambda>r   )r!   r"   rh   r   rB   conv_posconv_pos_groupspos_convr   attn_layer_dropoutdropoutmathsqrtrC   normal_rE   	constant_rA   utilsweight_normrN   r   rP   rO   r:   r}   )r   r   r   r   rY   r   r%   r   r   r"      s$    z"Wav2VecTransformerEncoder.__init__c                 C   r2   )a1  Returns definitions of module output ports. 
        We treat features as SpectrogramType for Nemo compatibility
        audio_signal:
            0: AxisType(BatchTag)
            1: AxisType(ChannelTag)
            2: AxisType(ProcessedTimeTag)
        length:
            0: AxisType(BatchTag)
        r3   r)   )audio_signalr-   r7   r0   r   r   r   r1     r9   z%Wav2VecTransformerEncoder.input_typesc                 C   r2   )a@  Returns definitions of module output ports. 
        We're using SpectrogramType for now to keep things Nemo safe
        processed_signal:
            0: AxisType(BatchTag)
            1: AxisType(ChannelTag)
            2: AxisType(ProcessedTimeTag)
        processed_length:
            0: AxisType(BatchTag)
        """
        return {
            "processed_signal": NeuralType(('B', 'C', 'T'), SpectrogramType()),
            "processed_length": NeuralType(tuple('B'), LengthsType()),
        }

    def forward(self, audio_signal, length):
        # Padding mask needed for the transformer
        padding_mask = self.create_padding_mask(length)

        # Zero out padded timesteps before applying the positional convolution
        for idx, len_ in enumerate(length):
            audio_signal[idx, :, len_:] = 0

        signal_conv = self.pos_conv(audio_signal)  # B, C, T
        audio_signal = audio_signal + signal_conv

        audio_signal = audio_signal.transpose(1, 2)  # B, C, T -> B, T, C
        audio_signal = self.layer_norm(audio_signal)

        context_emb = self.apply_transformer(audio_signal, padding_mask=padding_mask)

        context_emb = context_emb.transpose(1, 2)  # B, T, C -> B, C, T

        return context_emb, length

    def apply_transformer(self, x, padding_mask=None):
        encoder_attn_mask = form_attention_mask(padding_mask)
        if self.layer_drop and self.training:
            # Stochastic LayerDrop: each layer is skipped with probability self.layer_drop (training only)
            for _, layer in enumerate(self.layers):
                p = random.random()
                if p > self.layer_drop:
                    x = layer(x, encoder_attn_mask, x)
        else:
            for _, layer in enumerate(self.layers):
                x = layer(x, encoder_attn_mask, x)
        return x

    def create_padding_mask(self, length):
        # Broadcast to vectorize creating the padding mask
        max_len = max(length)
        padding_mask = torch.arange(max_len, device=length.device)

        # Mask is 1 for valid timesteps and 0 for padding
        padding_mask = (padding_mask.expand(len(length), max_len) < length.unsqueeze(1)).type(torch.uint8)

        return padding_mask


class GradMultiply(torch.autograd.Function):
    """Scales gradients in the backward pass without changing the forward value."""

    @staticmethod
    def forward(ctx, x, scale):
        ctx.scale = scale
        res = x.new(x)
        return res

    @staticmethod
    def backward(ctx, grad):
        return grad * ctx.scale, None
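

# Minimal usage sketch (illustrative only): the conv_layers dictionaries and the
# pos_embed/transformer values below are assumed example settings, not NeMo defaults.
# It shows how the convolutional feature encoder output feeds the transformer encoder.
if __name__ == "__main__":
    from omegaconf import OmegaConf

    # Seven conv blocks similar in spirit to the wav2vec 2.0 base feature extractor
    conv_cfg = (
        [{'emb_dim': 512, 'kernel_size': 10, 'stride': 5}]
        + [{'emb_dim': 512, 'kernel_size': 3, 'stride': 2}] * 4
        + [{'emb_dim': 512, 'kernel_size': 2, 'stride': 2}] * 2
    )
    feature_encoder = ConvFeatureEncoder(conv_layers=conv_cfg, embedding_dim=768)

    pos_embed = OmegaConf.create({'embedding_dim': 768, 'conv_pos': 128, 'conv_pos_groups': 16})
    transformer = OmegaConf.create(
        {
            'num_layers': 12,
            'hidden_size': 768,
            'inner_size': 3072,
            'num_attention_heads': 12,
            'attn_score_dropout': 0.1,
            'attn_layer_dropout': 0.1,
            'ffn_dropout': 0.1,
            'hidden_act': 'gelu',
        }
    )
    context_encoder = Wav2VecTransformerEncoder(pos_embed=pos_embed, transformer=transformer, layer_drop=0.05)

    audio = torch.randn(2, 16000)  # two one-second clips at an assumed 16 kHz sample rate
    lengths = torch.tensor([16000, 12000])

    # Raw audio -> convolutional features (B, C, T'), then features -> context embeddings (B, C, T')
    features, feature_lengths = feature_encoder(input_signal=audio, length=lengths)
    context, context_lengths = context_encoder(audio_signal=features, length=feature_lengths)
    print(features.shape, context.shape, context_lengths)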