o
    wiҾ                     @   s   d dl Z d dlmZmZmZmZmZ d dlZd dlm	Z	 d dl
m	  mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ zd d
lmZ d dlm	Z d dlmZ d dlmZ dZ W n e!yv   dZ Y nw eZ"d6ddZ#d6ddZ$d7dee% fddZ&dd Z'de(fddZ)dd Z*ej+j,d ejd!e(d"ejfd#d$Z-ej+j,d ejd!e(d"ejfd%d&Z.ej+j,d'ejd(e(d)ejfd*d+Z/G d,d- d-e	j0Z1G d.d/ d/e	j0Z2G d0d1 d1e	j0Z3G d2d3 d3e	j0eeZ4G d4d5 d5e	j0Z5dS )8    N)CallableIterableListOptionalTuple)Tensor)_calculate_correct_fan)_single)activation_registry)AccessMixin)AdapterModuleMixin)logging)calib)nn)quant_modules)QuantDescriptorTFfan_inc                 C   sZ   t | |}d}|t| }|}t  | | |W  d   S 1 s&w   Y  dS )av  
    Uniform Initialization from the paper [Sequence-to-Sequence Speech Recognition with Time-Depth Separable Convolutions](https://www.isca-speech.org/archive/Interspeech_2019/pdfs/2460.pdf)
    Normalized to -

    .. math::
        \text{bound} = \text{2} \times \sqrt{\frac{1}{\text{fan\_mode}}}

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
            preserves the magnitude of the variance of the weights in the
            forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
            backwards pass.
           @N)r   mathsqrttorchno_graduniform_tensormodefangainstdbound r    i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/asr/parts/submodules/jasper.pytds_uniform_+   s   

$r"   c                 C   sX   t | |}d}|t| }|}t  | d|W  d   S 1 s%w   Y  dS )au  
    Normal Initialization from the paper [Sequence-to-Sequence Speech Recognition with Time-Depth Separable Convolutions](https://www.isca-speech.org/archive/Interspeech_2019/pdfs/2460.pdf)
    Normalized to -

    .. math::
        \text{bound} = \text{2} \times \sqrt{\frac{1}{\text{fan\_mode}}}

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
            preserves the magnitude of the variance of the weights in the
            forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
            backwards pass.
    r           N)r   r   r   r   r   normal_r   r    r    r!   tds_normal_B   s   


$r%   xavier_uniformr   c                 C   sF  t | trt| j| t | tjtjfrs|d urq|dkr'tjj| j	dd d S |dkr6tjj
| j	dd d S |dkrEtjj| j	dd d S |dkrTtjj| j	dd d S |d	kr_t| j	 d S |d
krjt| j	 d S td|d S t | tjr| jr| j  | jd | j  | jrtj| j	 tj| j d S d S d S )Nr&         ?)r   xavier_normalkaiming_uniformrelu)nonlinearitykaiming_normaltds_uniform
tds_normalz Unknown Initialization mode: {0}   )
isinstanceMaskedConv1dinit_weightsconvr   Conv1dLinearinitxavier_uniform_weightxavier_normal_kaiming_uniform_kaiming_normal_r"   r%   
ValueErrorformatBatchNorm1dtrack_running_statsrunning_meanzero_running_varfill_num_batches_trackedaffineones_zeros_bias)mr   r    r    r!   r2   Y   s8   


r2   c                 C   s*   t t| | d}|d dkr|d7 }|S )Nr/      r   )maxint)kernel_sizekernel_widthnew_kernel_sizer    r    r!   compute_new_kernel_sizev   s   rP   returnc                 C   s(   |dkr|dkrt d|| d  d S )Nr/   -Only stride OR dilation may be greater than 1rJ   )r<   )rM   stridedilationr    r    r!   get_same_padding~   s   rU   c                 C   s   |dkr|dkrt d| d | }|}t| ||}| |kr2td| d| d|  d| d	 |S ||k rDtd| d	| d|   |dkrZ||  d ||  }|| }||fS ||fS )
Nr/   rR   zEFuture context window is larger than the kernel size!
Left context = z  | Right context = greater than z | Kernel size = z@
Switching to symmetric padding (left context = right context = )zFuture context window is larger than half the kernel size!
Conv layer therefore uses more future information than past to compute its output!
Left context = z | Right context = )r<   rU   r   warning)rM   rS   rT   future_contextleft_contextright_contextsymmetric_paddingr    r    r!   get_asymtric_padding   sB   r\   xcontext_windowmaskc                 C   sf   | j d }||k rtj| ddd|jddd| j }|S tj| ddd|jddd| j }|S )a  
    Calculates the masked average over padded limited context segment during inference mode.

    Args:
        x: Input tensor. Shape = [B, C, T]
        context_window: Integer context window, must be 0 or greater.
        mask: Mask tensor, 1 represents value index, 0 represents padded index. Shape = [B, 1, T].

    Returns:
        A tensor reduced via masked average pool over some limited context. Shape = [B, C, 1]
    Tdimkeepdim)shaper   sumtodtype)r]   r^   r_   	timestepsyr    r    r!   _se_pool_step_script_infer   s   
&&rj   c                 C   s   | j d }||k rtj| ddd|jddd| j }|S tjd|| dgtjdd }| dddd||| f } |dddd||| f }|jddd| j}| jddd}||d  }|S )	aO  
    Calculates the masked average over padded limited context segment during training mode.
    Randomly slices a segment of length `context_window` from signal+padded input tensor across all channels and
    uses it for computing masked limited context.

    Args:
        x: Input tensor. Shape = [B, C, T]
        context_window: Integer context window, must be 0 or greater.
        mask: Mask tensor, 1 represents value index, 0 represents padded index. Shape = [B, 1, T].

    Returns:
        A tensor reduced via masked average pool over some limited context. Shape = [B, C, 1]
    r`   Tra   r   r/   )sizerg   Ng:0yE>)rd   r   re   rf   rg   randintint32)r]   r^   r_   rh   ri   	start_idxr    r    r!   _se_pool_step_script_train   s   
&
ro   lenscurrent_maxlenoriginal_maxlenc                 C   s4   ||krt |}t |}||fS | }|}||fS N)r   aranger   )rp   rq   rr   new_lensnew_max_lensr    r    r!   _masked_conv_init_lens   s   

rw   c                       sX   e Zd Zg dZ								d fdd	Zd	d
 Zdd ZdddZdd Z  Z	S )r1   )use_conv_maskreal_out_channelsheadsr/   r   r`   FTc              
      s~  t t|   |dks||kstd|| _|dkr |}|}|}|| _t|ttfv r6t	j
|dd| _d}nd | _trL|rLtj||||||||	d| _ntsT|rTtdt	j||||||||	d| _|
| _|| _| jjd dkod	| jjd  | jjd | jjd d  k| _| jd u rd
| _n| jjd dkot| j| jjd | jjd d  k| _| jrtd| _td| _d S d S )Nr`   z)Only use heads for depthwise convolutionsr#   )valuer   )rS   paddingrT   groupsrH   ~pytorch-quantization is not installed. Install from https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization.r/   rJ   F)superr1   __init__r<   ry   _paddingtypetuplelistr   ConstantPad1d	pad_layerPYTORCH_QUANTIZATION_AVAILABLEquant_nnQuantConv1dr3   ImportErrorr4   use_maskrz   rS   r|   rT   rM   same_paddingsame_padding_asymmetricre   r   r   max_lenrp   )selfin_channelsout_channelsrM   rS   r|   rT   r}   rz   rH   r   quantize	__class__r    r!   r      sh   


*
$zMaskedConv1d.__init__c                 C   s   | j s| jr|S | jd u r3tj|d| jjd   | jjd | jjd d   d | jj	d ddd S tj|t
| j | jjd | jjd d   d | jj	d ddd S )NrJ   r   r/   trunc)rounding_mode)r   r   r   r   divr3   r|   rT   rM   rS   re   r   )r   rp   r    r    r!   get_seq_lenD  s&   
2

,
zMaskedConv1d.get_seq_lenc                 C   s   | j r|d| jkr| j|d|jd | ||}| |}| jd ur+| |}|j}| j	dkr=|
d| j	|d }| |}| j	dkrQ|
|d | jd}||fS )NrJ   devicer`   r   )r   rk   r   update_masked_lengthr   
mask_inputr   r   rd   rz   viewr3   ry   )r   r]   rp   shoutr    r    r!   forward[  s   





zMaskedConv1d.forwardNc                 C   sH   |d u rt | j|| j\| _| _| j|| _d S || _t|| _d S rs   )rw   rp   r   rf   r   r   r   r   	seq_ranger   r    r    r!   r   t  s
   z!MaskedConv1d.update_masked_lengthc                 C   sL   | d}| jd | d|j|dk }||dj|jd }|S )NrJ   r   r/   r   )rk   rp   	unsqueezerf   r   )r   r]   rp   r   r_   r    r    r!   r   |  s   
&zMaskedConv1d.mask_input)r/   r   r/   r/   r`   FTF)NN)
__name__
__module____qualname____constants__r   r   r   r   r   __classcell__r    r    r   r!   r1      s    S
r1   c                       s$   e Zd Z fddZdd Z  ZS )GroupShufflec                    s"   t t|   || _|| | _d S rs   )r   r   r   r}   channels_per_group)r   r}   channelsr   r    r!   r     s   zGroupShuffle.__init__c                 C   sN   |j }|d| j| j|d }t|dd }|d| j| j |d }|S )Nr`   r/   rJ   )rd   r   r}   r   r   	transpose
contiguous)r   r]   r   r    r    r!   r     s
   zGroupShuffle.forward)r   r   r   r   r   r   r    r    r   r!   r     s    r   c                       s~   e Zd Z				ddedededed	ee d
ef fddZdd Z	dd Z
dd ZdddZdddZdefddZ  ZS )SqueezeExciter`   nearestNFr   reduction_ratior^   interpolation_mode
activationr   c              	      s   t t|   || _|| _d| _|du rtjdd}tr7|r7t	t
j||| dd|t
j|| |dd| _n ts?|r?tdt	tj||| dd|tj|| |dd| _td| _| j|d | d	 dS )
a  
        Squeeze-and-Excitation sub-module.

        Args:
            channels: Input number of channels.
            reduction_ratio: Reduction ratio for "squeeze" layer.
            context_window: Integer number of timesteps that the context
                should be computed over, using stride 1 average pooling.
                If value < 1, then global context is computed.
            interpolation_mode: Interpolation mode of timestep dimension.
                Used only if context window is > 1.
                The modes available for resizing are: `nearest`, `linear` (3D-only),
                `bilinear`, `area`
            activation: Intermediate activation function used. Must be a
                callable activation function.
        NT)inplaceF)rH   r~   r/   )r^      )r   r   r   r   	_quantizepoolr   ReLUr   
Sequentialr   QuantLinearfcr   r5   AdaptiveAvgPool1dgapchange_context_windowset_max_len)r   r   r   r^   r   r   r   r   r    r!   r     s0   zSqueezeExcite.__init__c                 C   s   |  ||S rs   )forward_for_export)r   r]   lengthsr    r    r!   r     s   zSqueezeExcite.forwardc                 C   s  |j d }|| jkr| | |j}tjj|jjdd[ | j	|||jd}| }| j
d jjtjkr8| j
  |jtjkrB| }||d}| ||}|dd}| 
|}|dd}t|}|| }||}W d    ||fS 1 szw   Y  ||fS )Nr`   F)enabled)max_audio_lengthr   r   r#   r/   )rd   r   r   rg   r   ampautocastr   r   make_pad_maskr   r8   float32floatmasked_fill_se_pool_stepr   sigmoidrf   )r   r]   r   r   rg   r_   ri   r    r    r!   r     s.   






z SqueezeExcite.forward_for_exportc                 C   sd   | }| j dk rtj|ddd|jddd|j }|S | jr)t|| j |}|S t|| j |}|S )Nr   r`   Tra   )r^   r   re   r   rg   trainingro   rj   )r   r]   r_   ri   r    r    r!   r     s   
&	zSqueezeExcite._se_pool_stepc                 C   sV   || _ |du rt|  j}tjd| j |d}t| dr!|| _dS | jd|dd dS )zSSets maximum input length.
        Pre-calculates internal seq_range mask.
        Nr   r   r   F)
persistent)	r   next
parametersr   r   rt   hasattrr   register_bufferr   r    r    r!   r     s   

zSqueezeExcite.set_max_lenc                 C   sn   |r| j j|kr| j || _ | j j|jkr|| j j}| j d| |dd|dk }|d}|S )zMake masking for padding.Nr   r`   r/   )r   r   rf   expandrk   r   )r   seq_lensr   r   r_   r    r    r!   r     s   &
zSqueezeExcite.make_pad_maskc                 C   s,   t | drtd| j d|  || _dS )a  
        Update the context window of the SqueezeExcitation module, in-place if possible.

        Will update the pooling layer to either nn.AdaptiveAvgPool1d() (for global SE) or nn.AvgPool1d()
        (for limited context SE).

        If only the context window is changing but still a limited SE context block - then
        the earlier instance of nn.AvgPool1d() will be updated.

        Args:
            context_window: An integer representing the number of input timeframes that will be used
                to compute the context. Each timeframe corresponds to a single window stride of the
                STFT features.

                Say the window_stride = 0.01s, then a context window of 128 represents 128 * 0.01 s
                of context to compute the Squeeze step.
        r^   z0Changing Squeeze-Excitation context window from z to N)r   r   infor^   )r   r^   r    r    r!   r   !  s   

z#SqueezeExcite.change_context_window)r`   r   NFrs   )r   r   r   rL   strr   r   boolr   r   r   r   r   r   r   r   r    r    r   r!   r     s0    ;"

r   c                       s   e Zd ZdZg dZddddddddd	dd
ddddg d
d
dddd
dd
dfdedef fddZ					
			
	
dddZ					
			
			
d ddZd!ddZ	de
ee ee f de
ee ee f fddZ  ZS )"JasperBlocka  
    Constructs a single "Jasper" block. With modified parameters, also constructs other blocks for models
    such as `QuartzNet` and `Citrinet`.

    - For `Jasper`    : `separable` flag should be False
    - For `QuartzNet` : `separable` flag should be True
    - For `Citrinet`  : `separable` flag and `se` flag should be True

    Note that above are general distinctions, each model has intricate differences that expand over
    multiple such blocks.

    For further information about the differences between models which use JasperBlock, please review
    the configs for ASR models found in the ASR examples directory.

    Args:
        inplanes: Number of input channels.
        planes: Number of output channels.
        repeat: Number of repeated sub-blocks (R) for this block.
        kernel_size: Convolution kernel size across all repeated sub-blocks.
        kernel_size_factor: Floating point scale value that is multiplied with kernel size,
            then rounded down to nearest odd integer to compose the kernel size. Defaults to 1.0.
        stride: Stride of the convolutional layers.
        dilation: Integer which defined dilation factor of kernel. Note that when dilation > 1, stride must
            be equal to 1.
        padding: String representing type of padding. Currently only supports "same" padding,
            which symmetrically pads the input tensor with zeros.
        dropout: Floating point value, determins percentage of output that is zeroed out.
        activation: String representing activation functions. Valid activation functions are :
            {"hardtanh": nn.Hardtanh, "relu": nn.ReLU, "selu": nn.SELU, "swish": Swish}.
            Defaults to "relu".
        residual: Bool that determined whether a residual branch should be added or not.
            All residual branches are constructed using a pointwise convolution kernel, that may or may not
            perform strided convolution depending on the parameter `residual_mode`.
        groups: Number of groups for Grouped Convolutions. Defaults to 1.
        separable: Bool flag that describes whether Time-Channel depthwise separable convolution should be
            constructed, or ordinary convolution should be constructed.
        heads: Number of "heads" for the masked convolution. Defaults to -1, which disables it.
        normalization: String that represents type of normalization performed. Can be one of
            "batch", "group", "instance" or "layer" to compute BatchNorm1D, GroupNorm1D, InstanceNorm or
            LayerNorm (which are special cases of GroupNorm1D).
        norm_groups: Number of groups used for GroupNorm (if `normalization` == "group").
        residual_mode: String argument which describes whether the residual branch should be simply
            added ("add") or should first stride, then add ("stride_add"). Required when performing stride on
            parallel branch as well as utilizing residual add.
        residual_panes: Number of residual panes, used for Jasper-DR models. Please refer to the paper.
        conv_mask: Bool flag which determines whether to utilize masked convolutions or not. In general,
            it should be set to True.
        se: Bool flag that determines whether Squeeze-and-Excitation layer should be used.
        se_reduction_ratio: Integer value, which determines to what extend the hidden dimension of the SE
            intermediate step should be reduced. Larger values reduce number of parameters, but also limit
            the effectiveness of SE layers.
        se_context_window: Integer value determining the number of timesteps that should be utilized in order
            to compute the averaged context window. Defaults to -1, which means it uses global context - such
            that all timesteps are averaged. If any positive integer is used, it will utilize limited context
            window of that size.
        se_interpolation_mode: String used for interpolation mode of timestep dimension for SE blocks.
            Used only if context window is > 1.
            The modes available for resizing are: `nearest`, `linear` (3D-only),
            `bilinear`, `area`.
        stride_last: Bool flag that determines whether all repeated blocks should stride at once,
            (stride of S^R when this flag is False) or just the last repeated block should stride
            (stride of S when this flag is True).
        future_context: Int value that determins how many "right" / "future" context frames will be utilized
            when calculating the output of the conv kernel. All calculations are done for odd kernel sizes only.

            By default, this is -1, which is recomputed as the symmetric padding case.

            When future_context >= 0, will compute the asymmetric padding as follows :
            (left context, right context) = [K - 1 - future_context, future_context]

            Determining an exact formula to limit future context is dependent on global layout of the model.
            As such, we provide both "local" and "global" guidelines below.

            Local context limit (should always be enforced)
            - future context should be <= half the kernel size for any given layer
            - future context > kernel size defaults to symmetric kernel
            - future context of layer = number of future frames * width of each frame (dependent on stride)

            Global context limit (should be carefully considered)
            - future context should be layed out in an ever reducing pattern. Initial layers should restrict
            future context less than later layers, since shallow depth (and reduced stride) means each frame uses
            less amounts of future context.
            - Beyond a certain point, future context should remain static for a given stride level. This is
            the upper bound of the amount of future context that can be provided to the model on a global scale.
            - future context is calculated (roughly) as - (2 ^ stride) * (K // 2) number of future frames.
            This resultant value should be bound to some global maximum number of future seconds of audio (in ms).

            Note: In the special case where K < future_context, it is assumed that the kernel is too small to limit
            its future context, so symmetric padding is used instead.

            Note: There is no explicit limitation on the amount of future context used, as long as
            K > future_context constraint is maintained. This might lead to cases where future_context is
            more than half the actual kernel size K! In such cases, the conv layer is utilizing more of the future
            context than its current and past context to compute the output. While this is possible to do,
            it is not recommended and the layer will raise a warning to notify the user of such cases.
            It is advised to simply use symmetric padding for such cases.

            Example:
            Say we have a model that performs 8x stride and receives spectrogram frames with stride of 0.01s.
            Say we wish to upper bound future context to 80 ms.

            Layer ID, Kernel Size, Stride, Future Context, Global Context
            0, K=5,  S=1, FC=8, GC= 2 * (2^0) = 2 * 0.01 ms  (special case, K < FC so use symmetric pad)
            1, K=7,  S=1, FC=3, GC= 3 * (2^0) = 3 * 0.01 ms  (note that symmetric pad here uses 3 FC frames!)
            2, K=11, S=2, FC=4, GC= 4 * (2^1) = 8 * 0.01 ms  (note that symmetric pad here uses 5 FC frames!)
            3, K=15, S=1, FC=4, GC= 4 * (2^1) = 8 * 0.01 ms  (note that symmetric pad here uses 7 FC frames!)
            4, K=21, S=2, FC=2, GC= 2 * (2^2) = 8 * 0.01 ms  (note that symmetric pad here uses 10 FC frames!)
            5, K=25, S=2, FC=1, GC= 1 * (2^3) = 8 * 0.01 ms  (note that symmetric pad here uses 14 FC frames!)
            6, K=29, S=1, FC=1, GC= 1 * (2^3) = 8 * 0.01 ms ...
        quantize: Bool flag whether to quantize the Convolutional blocks.
        layer_idx (int, optional): can be specified to allow layer output capture for InterCTC loss. Defaults to -1.
    )	conv_mask	separableresidual_moderesmconv      r/   same皙?NTFr`   batchaddr   r   rX   	layer_idxc           %         sR  t t|   |dkrtdt  t|tr" fdd|D }nt| g}|dk r9t|d |d |d }nt	|d |d |d |}|| _
|| _|| _|| _|| _|| _|| _|| _d | _|}t }t|d D ]*}|rudg} n|} || j|||| ||||||||d || j|	|
d |}qm|| j||||||||||||d |r|t|||||
|d	 || _| }!|| _|rt }"|d
kr|} ndg} t|dkr|g}!d| _|!D ]}#t| j|#|d||| |d}$|"|$ q|"| _t r| jrt!"t!j#j$| _%nt s|rt&dnd | _tj'| j|	|
d | _(d S )Nr   z*currently only 'same' padding is supportedc                    s   g | ]}t | qS r    )rP   ).0kkernel_size_factorr    r!   
<listcomp>  s    z(JasperBlock.__init__.<locals>.<listcomp>r   r/   )
rM   rS   rT   r|   r}   rz   r   normalizationnorm_groupsr   )	drop_probr   )r   r^   r   r   r   
stride_addF)rM   r   r   rS   r   r~   ))r   r   r   r<   r   r0   r   rP   rU   r\   inplanesplanesr   r   r   ser   r   interctc_should_capturer   
ModuleListrangeextend_get_conv_bn_layer_get_act_dropout_layerappendr   r   copydense_residuallenr   r   r   TensorQuantizerQuantConv2ddefault_quant_desc_inputresidual_quantizerr   r   mout)%r   r   r   repeatrM   r   rS   rT   r|   dropoutr   residualr}   r   rz   r   r   r   residual_panesr   r   se_reduction_ratiose_context_windowse_interpolation_modestride_lastrX   r   r   padding_valinplanes_loopr3   _
stride_val	res_panesres_listipr   r   r   r!   r     s   
zJasperBlock.__init__r   c                 C   st   | j }|rt|||||||||	||dS tr%|r%tj||||||||dS ts-|r-tdtj||||||||dS )N)rS   rT   r|   rH   r}   rz   r   r   )rS   rT   r|   rH   r}   r~   )r   r1   r   r   r   r   r   r4   )r   r   r   rM   rS   rT   r|   rH   r}   rz   r   r   r   r    r    r!   	_get_convN  sP   
zJasperBlock._get_convc                 C   s  |dkr|}|
r&| j |||||||||	|d
| j ||dddd|||d	g}n| j |||||||||d	g}|dkrD|tj||d n6|d	krS|tj||d n'|d
krb|tjd|d n|dkrr|tj|ddd ntd| d|dkr|t|| |S )Nr`   )rS   rT   r|   rH   r}   rz   r   r/   r   )rM   rS   rT   r|   rH   r}   r   )rS   rT   r|   rH   r}   r   group)
num_groupsnum_channelsinstancelayerr   gMbP?g?)epsmomentumzNormalization method (z8) does not match one of [batch, layer, group, instance].)r  r   r   	GroupNormr>   r<   r   )r   r   r   rM   rS   rT   r|   rH   r}   rz   r   r   r   r   layersr    r    r!   r     sj   
zJasperBlock._get_conv_bn_layerc                 C   s*   |d u rt jddd}|t j|dg}|S )Nr#   g      4@)min_valmax_val)p)r   HardtanhDropout)r   r   r   r  r    r    r!   r     s   z"JasperBlock._get_act_dropout_layerinput_rQ   c                 C   s  d}|d }t |dkr|\}}|d }|}t| jD ]\}}t|ttfr.|||\}}q||}q| jdurt| jD ]M\}}|| }	t|D ]\}
}t|trZ||	|\}	}qI||	}	qI| jdksi| jdkrtrv| j	rv| 
||	 }q=ts| j	rtd||	 }q=t||	}q=| |}|  r|  }t |dkr|dd}| |}|dd}| t| d	dr| jd
dr| jd|d | jdu r| jdi dg }| j|v | _| jr| jd| j |d | jd| j |d | jdur| jr||g |fS |g|fS )a	  
        Forward pass of the module.

        Args:
            input_: The input is a tuple of two values - the preprocessed audio signal as well as the lengths
                of the audio signal. The audio signal is padded to the shape [B, D, T] and the lengths are
                a torch vector of length B.

        Returns:
            The output of the block after processing the input through `repeat` number of sub-blocks,
            as well as the lengths of the encoded audio after padding/striding.
        Nr   rJ   r`   r   r   r~   r/   
model_guidsave_encoder_tensorsFencoder)namer   interctccapture_layerszinterctc/layer_output_zinterctc/layer_length_)r   	enumerater   r0   r1   r   r   r   r   r   r   r   r   rK   r   is_adapter_availableget_enabled_adaptersr   forward_enabled_adaptersis_access_enabledgetattr
access_cfggetregister_accessible_tensorr   r   r   )r   r  	lens_origxsr   rp   ilr  res_outj	res_layerr  adapter_namesr"  r    r    r!   r     sZ   










zJasperBlock.forward)	r   r/   r/   r   Fr/   r`   FF)r   r/   r/   r   Fr/   r`   Fr   r/   F)r   N)r   r   r   __doc__r   rL   r   r  r   r   r   r   r   r   r   r   r    r    r   r!   r   9  st    q &
>

M:r   c                       sh   e Zd ZdZ					ddedededed	ef
 fd
dZdd Zdee	e
 ee
 f fddZ  ZS )ParallelBlocka  
    Computational module that computes several `blocks` independently from each other and aggregates the outputs.
    It expects audio inputs to be passed together with lengths, just like Jasper blocks, and all outputs to have
    the same dimensions but it does not impose any additional requirements on the structure of the blocks themselves.

    Args:
        blocks: List of Jasper blocks that will be computed concurently. It is expected that they accept the same
            input and return outputs with the same number of channels.
        aggregation_mode: an optional string, indicating how the outputs will be aggregated. Supported values are
            ['sum', 'dropout']. "sum" value forces outputs to be summed together. "dropout" value enables tower
            dropout training with different blocks being dropped out during training.
        block_dropout_prob: a probability of dropping any individual block during training with "dropout" aggregation
            mode. Acts as a regularization technique.
        residual_mode: an optional string indicating how residuals will be applied. Supported values are
            ['sum', 'conv']. In 'sum' mode input features are summed together with the output. This will fail if the
            number of channels in the input is different from the number of channels in an output tensor. In 'conv' mode
            inputs are passed through pointwise convolution to make input channel dimension match output channel
            dimension. In this mode `in_filters` and `out_filters` params are required.
        in_filters: number of filters (channels) in the input tensor of each block.
        out_filters: number of filters (channels) in the output tensor of each block.
    re   r#   Naggregation_modeblock_dropout_probr   
in_filtersout_filtersc                    s   t    t|| _ddg| _|| jvr!td| d| j d|| _|dkr;tjt	
t|dd| _t|| _ddg| _|| jvrQtd	| d| j d|| _|dkrp|d u s`|d u rdtd
t||dddd| _d S d S )Nre   r   z$Got non-supported aggregation mode: z. Supported values are .F)requires_gradr3   z!Got non-supported residual mode: zPin_filters and out_filters have to be specified when using 'conv' residual mode.r/   T)rM   rH   r   )r   r   r   r   blockssupported_aggregationsr<   r6  	Parameterr   onesr   weightsr  r   supported_residualsr   r1   res_conv)r   r<  r6  r7  r   r8  r9  r   r    r!   r   M  s,   
	



zParallelBlock.__init__c                 C   sP   |  | j}t|dkr&| j jdk r&|  | j}t|dkr&| j jdk s|S )Nr   r'   )r   r@  r   re   r  )r   r@  r    r    r!   get_dropout_maskp  s
   zParallelBlock.get_dropout_maskr]   c                 C   s  t | jdkr| jd |S d}d}d}| jdkr|  }t| jD ];\}}||\}}|d }	| jdkr=|| |d  }	|du rD|	}n||	 }|du rO|}q"tjt||gddd }q"|d d }
|d }| jdkrr||
 }n| jdkr|| 	|
|d  }|g|fS )	a  
        Forward pass computing aggregated output.

        Args:
            x: tuple of padded signal and lengths the signal. The shape of the signal is [B, D, T]. The lengths are
                1D torch tensor of length B.

        Returns:
           torch tensor after passing input throught each block and aggregating these outputs according to the
           aggregation mode.
        r/   r   Nr   r`   )rb   re   r3   )
r   r<  r6  rC  r#  r   rK   stackr   rB  )r   r]   resultmax_maskscaling_weightsr.  blockoutputr_   weighted_output
input_featrp   r    r    r!   r   v  s2   





zParallelBlock.forward)re   r#   re   NN)r   r   r   r4  r   rL   r   rC  r   r   r   r   r   r   r    r    r   r!   r5  6  s(    #&r5  )r   )r&   )6r   typingr   r   r   r   r   r   torch.nnr   torch.nn.functional
functionalFr   torch.nn.initr   torch.nn.modules.utilsr	   #nemo.collections.common.parts.utilsr
   nemo.core.classes.mixinsr   'nemo.core.classes.mixins.adapter_mixinsr   
nemo.utilsr   pytorch_quantizationr   r   r   !pytorch_quantization.tensor_quantr   r   r   jasper_activationsr"   r%   r   r2   rP   rL   rU   r\   jitscriptrj   ro   rw   Moduler1   r   r   r   r5  r    r    r    r!   <module>   sZ   

$
  $    