o
    wi>                     @   s   d dl mZ d dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZmZ dgZG d	d deeZdS )
    )OrderedDictN)ConvSubsamplingStackingSubsampling)	typecheck)
Exportable)NeuralModule)AcousticEncodedRepresentationLengthsType
NeuralTypeSpectrogramType
RNNEncoderc                       s   e Zd ZdZdd Zedd Zedd Z				
				ddedededede	de
de	dededef fddZe dddZ  ZS )r   a  
    The RNN-based encoder for ASR models.
    Followed the architecture suggested in the following paper:
    'STREAMING END-TO-END SPEECH RECOGNITION FOR MOBILE DEVICES' by Yanzhang He et al.
    https://arxiv.org/pdf/1811.06621.pdf


    Args:
        feat_in (int): the size of feature channels
        n_layers (int): number of layers of RNN
        d_model (int): the hidden size of the model
        proj_size (int): the size of the output projection after each RNN layer
        rnn_type (str): the type of the RNN layers, choices=['lstm, 'gru', 'rnn']
        bidirectional (float): specifies whether RNN layers should be bidirectional or not
            Defaults to True.
        feat_out (int): the size of the output features
            Defaults to -1 (means feat_out is d_model)
        subsampling (str): the method of subsampling, choices=['stacking, 'vggnet', 'striding']
            Defaults to stacking.
        subsampling_factor (int): the subsampling factor
            Defaults to 4.
        subsampling_conv_channels (int): the size of the convolutions in the subsampling module for vggnet and striding
            Defaults to -1 which would set it to d_model.
        dropout (float): the dropout rate used between all layers
            Defaults to 0.2.
    c                 C   sJ   t d| jdt|  j}t dddt|  j}t||gS )zs
        Generates input examples for tracing etc.
        Returns:
            A tuple of input examples.
              r   )r   )	torchrandn_feat_intonext
parametersdevicerandinttuple)selfinput_exampleinput_example_length r   e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/asr/modules/rnn_encoder.pyr   :   s    zRNNEncoder.input_examplec                 C   "   t tdt ttdt dS )z3Returns definitions of module input ports.
        BDTr   )audio_signallength)r   r
   r   r   r	   r   r   r   r   input_typesD   
   
zRNNEncoder.input_typesc                 C   r   )z4Returns definitions of module output ports.
        r   r   )outputsencoded_lengths)r   r
   r   r   r	   r$   r   r   r   output_typesO   r&   zRNNEncoder.output_typeslstmTstriding   皙?feat_inn_layersd_model	proj_sizernn_typebidirectionalsubsamplingsubsampling_factorsubsampling_conv_channelsdropoutc              	      s<  t    || _|| _|	dkr|}	|r9|dkr9|dv r+t|||d|v r%dndd| _nt|||||	t d| _nt	||| _|| _
t | _tjtjtjd	}||vr^td
|  || }t|D ]5}|rn|d n|}|dkr|| j
|dd||d}| j| | jt| | jtj|
d || _
qfd S )Nr*      )stackingstacking_normnormTF)r6   r/   feat_outr<   )r5   r6   r/   r=   conv_channels
activation)r+   grurnnz'rnn_type can be one from the following:   r+   )
input_sizehidden_size
num_layersbatch_firstr4   r2   )p)super__init__r1   r   r   
pre_encoder   nnReLULinear	_feat_out
ModuleListlayersLSTMGRURNN
ValueErrorkeysrangeappend	LayerNormDropout)r   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   SUPPORTED_RNN
rnn_moduleirnn_proj_sizelayer	__class__r   r   rI   Z   sX   


	
zRNNEncoder.__init__Nc                 C   s   | d}|d u r|j| d|tj| jjd}t|dd}t| jt	j
r,| |}n| ||\}}t| jD ]\}}||}t|trJ|\}}q9t|dd}||fS )Nr*   r   )dtyper   r9   rB   )sizenew_fullr   int32	seq_ranger   	transpose
isinstancerJ   rK   rM   	enumeraterP   r   )r   r"   r#   max_audio_lengthlthr^   _r   r   r   forward   s    

zRNNEncoder.forward)r*   r+   Tr,   r-   r*   r.   )N)__name__
__module____qualname____doc__r   propertyr%   r)   intstrboolfloatrI   r   rl   __classcell__r   r   r_   r   r      sJ    



	
B)collectionsr   r   torch.distributedtorch.nnrK   1nemo.collections.asr.parts.submodules.subsamplingr   r   nemo.core.classes.commonr   nemo.core.classes.exportabler   nemo.core.classes.moduler   nemo.core.neural_typesr   r	   r
   r   __all__r   r   r   r   r   <module>   s   