o
    }oi5j                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dlm	Z	 G dd dejj
ZG dd dejj
Zdd
dZG dd dej
ZG dd dej
ZdS )    N)	LayerNorm)CausalConv1DCausalConv2D)loggingc                       s:   e Zd ZdZd fdd	Zdd Zdd Zd	d
 Z  ZS )StackingSubsamplingau  Stacking subsampling which simply stacks consecutive frames to reduce the sampling rate
    Args:
        subsampling_factor (int): The subsampling factor
        feat_in (int): size of the input features
        feat_out (int): size of the output features
        norm (bool): whether to use an MLP layer after the stacking along with normalization. default is False.
    Fc                    sD   t t|   || _tj|| || _|rt|| _	d S d | _	d S N)
superr   __init__subsampling_factortorchnnLinearproj_outr   pre_norm)selfr
   feat_infeat_outnorm	__class__ e/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/submodules/subsampling.pyr	   "   s   
zStackingSubsampling.__init__c                 C   s   | j S r   r
   r   r   r   r   get_sampling_frames+   s   z'StackingSubsampling.get_sampling_framesc                 C   s   dS )Nr   r   r   r   r   r   get_streaming_cache_size.   s   z,StackingSubsampling.get_streaming_cache_sizec                 C   s   |  \}}}| j|| j  | j }tjj|ddd|f}| jd ur(| |}|  \}}}t|||| j || j f}| |}tj	|| | jdd}||fS )Nr   floor)rounding_mode)
sizer
   r   r   
functionalpadr   reshaper   div)r   xlengthsbthpad_size_r   r   r   forward1   s   


zStackingSubsampling.forward)F)	__name__
__module____qualname____doc__r	   r   r   r*   __classcell__r   r   r   r   r      s    	r   c                       sr   e Zd ZdZde df fdd	Zdd Zdd	 Zd
d Z	dd Z
dd Zdd Zdd ZdefddZ  ZS )ConvSubsamplinga  Convolutional subsampling which supports VGGNet and striding approach introduced in:
    VGGNet Subsampling: Transformer-transducer: end-to-end speech recognition with self-attention (https://arxiv.org/pdf/1910.12977.pdf)
    Striding Subsampling: "Speech-Transformer: A No-Recurrence Sequence-to-Sequence Model for Speech Recognition" by Linhao Dong et al. (https://ieeexplore.ieee.org/document/8462506)
    Args:
        subsampling (str): The subsampling technique from {"vggnet", "striding", "dw-striding"}
        subsampling_factor (int): The subsampling factor which should be a power of 2
        subsampling_conv_chunking_factor (int): Input chunking factor which can be -1 (no chunking) 
        1 (auto) or a power of 2. Default is 1
        feat_in (int): size of the input features
        feat_out (int): size of the output features
        conv_channels (int): Number of channels for the convolution layers.
        activation (Module): activation function, default is nn.ReLU()
       Fc	                    s  t t|   || _|| _|| _|| _|d dkrtdtt	
|d| _|| _|| _|dkr>|dkr>|d dkr>td|| _d}	g }
|dkrd| _d| _d| _d| _d| _t| jD ];}|
tjj|	|d	ddd
 |
| |
tjj||d	ddd
 |
| |
tjj| j| j| j| jd |}	q]n(|dkrDd| _d	| _d| _| jr| jd | _| jd | _|d | _n| jd d | _| jd d | _d| _| jr|
t|	|| j| jd d
 n|
tjj|	|| j| j| jd
 |}	|
| t| jd D ]?}| jr|
t|	|	| j| jd |	d n|
tjj|	|	| j| j| j|	d |
tjj|	|ddddd |
| |}	qn|dkrd| _d	| _d| _| jrh| jd | _| jd | _|d | _n| jd d | _| jd d | _d| _t| jD ].}| jr|
t|	|| j| jd d
 n|
tjj|	|| j| j| jd
 |
| |}	qn|dkr3|}	d| _d| _d| _| jr| jd | _| jd | _|d | _n| jd d | _| jd d | _d| _t| jD ]B}| jr|
t|	| j|d kr|n|| j| jd d
 n|
tjj|	| j|d kr|n|| j| j| jd
 |
| |}	qn|dkr|}	d| _d| _d| _| jd d | _| jd d | _|
tjj|	|	| j| j| j|	dtjj|	| jdkro|n|dddddg |}	|
| t| jd D ]2}|
tjj|	|	| j| j| j|	dtjj|	| j|d kr|n|dddddg |
| |}	qntd| d|dv rtj|tj d}t!|| j| j | j| j| j| jd}tj"|t| || _#d| _$n|dv rd | _#d| _$ntd| dtjj%|
 | _&d S )N   r   z*Sampling factor should be a multiply of 2!r1   Asubsampling_conv_chunking_factor should be -1, 1, or a power of 2vggnetT   )in_channelsout_channelskernel_sizestridepadding)r9   r:   r;   	ceil_modedw_stridingFr7   r8   r9   r:   r;   groupsstridingstriding_conv1d   dw_striding_conv1dzNot valid sub-sampling: !)r5   r=   r@   dtyper$   all_paddingsr9   r:   r<   
repeat_num)rA   rC   )'r   r0   r	   _subsampling_conv_channels_feat_in	_feat_out
ValueErrorintmathlog_sampling_numr
   	is_causal subsampling_conv_chunking_factor_stride_kernel_size
_ceil_mode_left_padding_right_paddingrangeappendr   r   Conv2d	MaxPool2d_max_cache_lenr   r   Conv1dextendtensorfloatcalc_lengthr   outconv2d_subsampling
Sequentialconv)r   subsamplingr
   r   r   conv_channelsrT   
activationrS   r7   layersi	in_length
out_lengthr   r   r   r	   M   s  





	



%

	


	





zConvSubsampling.__init__c                 C   s
   d| j gS )Nr1   r   r   r   r   r   r   {  s   
z#ConvSubsampling.get_sampling_framesc                 C   s   d| j d gS )Nr   r1   r   r   r   r   r   r   ~  s   z(ConvSubsampling.get_streaming_cache_sizec           
      C   s,  t || j| j | j| j| j| jd}| jr|d}n|	dd}| j
dkrj| jrj| j
dkrFd| j | j | j }t||krCd}nd}nd}|rd| |\}}|sc| jdkr^| |}n| |}n| |}n| |}| jr| \}}}}	| |	dd||d}||fS |	dd}||fS )	N)rH   r9   r:   r<   rI   r1   r2   r3           TFr=   )rc   rX   rY   rV   rU   rW   rR   re   	unsqueeze	transposerT   rK   r   numelconv_split_by_batchrJ   conv_split_by_channelrg   r   rd   r!   )
r   r#   r$   x_ceilneed_to_splitsuccessr%   cr&   fr   r   r   r*     sB   





zConvSubsampling.forwardc                 C   sl  | j dkrt  d| j }| jd d }| jd }tjj| jd j	| | tjj| jd j
| | tdt| jdD ]>}tjj| j| j	| | tjj| j| j
| | tjj| j|d  j	| | tjj| j|d  j
| | q@| j| j | j d }tjj| jj	| | tjj| jj
| | W d    d S 1 sw   Y  d S d S )Nr=         ?r2         r   r6   r1   )rJ   r   no_gradrV   rK   r   inituniform_rg   weightbiasrZ   lenrM   rL   rR   rd   )r   scaledw_maxpw_maxidxfc_scaler   r   r   reset_parameters  s"   



 ""z ConvSubsampling.reset_parametersc                    s   |  \}}}}|dkr|dfS  jdkr! j}td|  n%d j  j  j }ttt	
|| d}d| }td|  || }|dkrR|dfS td|  t	 fd	d
t	||dD dfS )z< Tries to split input by batch, run conv and concat results r1   F$using manually set chunking factor: ro   r2    using auto set chunking factor: r   z)conv subsampling: using split batch size c                    s   g | ]}  |qS r   rg   .0chunkr   r   r   
<listcomp>  s    z7ConvSubsampling.conv_split_by_batch.<locals>.<listcomp>T)r   rT   r   debugrK   rU   rP   ceilrQ   r   rr   catsplit)r   r#   r%   r)   cfru   pnew_batch_sizer   r   r   rs     s   
&z#ConvSubsampling.conv_split_by_batchc           	   	      s\  j d |}j d |}tjd D ] | \}}}}jdkr0j}td|  ntt	t
|d d}d| }td|  t|| }|dkr_td| d d}t|| }|dkrttd| d	 d}td
| d|  j  d d  ||}t
 fddt
||dD d}j  d d  |}q|S )zI For dw convs, tries to split input by time, run conv and concat results r   r1   r   ro   r2   r   zchunking factor z, is too high; splitting down to one channel.z- is too high; splitting down to one timestep.z(conv dw subsampling: using split C size z and split T size r6   c                    s"   g | ]}j  d  d   |qS )r6   r   r   rl   r   r   r   r     s   " z9ConvSubsampling.conv_split_by_channel.<locals>.<listcomp>   )rg   rZ   rR   r   rT   r   r   rP   r   rQ   r   rr   rO   warningchannel_chunked_convr   r   )	r   r#   r)   rx   r&   r   r   new_cnew_tr   r   r   rt     s.   
&z%ConvSubsampling.conv_split_by_channelc           	   	   C   s  d}g }t ||dD ]x}| d }| jrTtjj|| jd | jd | jd | jd fd}tjj	||j
||| ddddddf |j|||  | jd|d}n&tjj	||j
||| ddddddf |j|||  | j| j|d}|| ||7 }qt |dS )z% Performs channel chunked convolutionr   r1   )r    N)r   r:   r;   r?   )r   r   r   rS   r   r   r    rV   rU   conv2dr   r   rX   r[   r   )	r   rg   
chunk_sizer#   ind
out_chunksr   stepch_outr   r   r   r   
  s6   $$	$

z$ConvSubsampling.channel_chunked_convrT   c                 C   s.   |dkr|dkr|d dkrt d|| _d S )Nr3   r1   r2   r   r4   )rN   rT   )r   rT   r   r   r   'change_subsampling_conv_chunking_factor,  s
   
z7ConvSubsampling.change_subsampling_conv_chunking_factor)r+   r,   r-   r.   r   ReLUr	   r   r   r*   r   rs   rt   r   rO   r   r/   r   r   r   r   r0   >   s       06$"r0   r1   c           	      C   s`   || }d}t |D ]}t| jtjd| || } |r#t| } q
t| } q
| jtjdS )z[ Calculates the output length of a Tensor passed through a convolution or max pooling layerrz   rE   )rZ   r   r"   torb   r   r   rO   )	r$   rH   r9   r:   r<   rI   add_padonerl   r   r   r   rc   6  s   rc   c                	       sF   e Zd ZdZddedededef fdd	ZdddZdd Z  ZS )TimeReductionModulea  
    Squeezeformer Time Reduction procedure. Downsamples the audio by `stride` in the time dimension.

    Args:
        d_model (int): input dimension of MultiheadAttentionMechanism and PositionwiseFeedForward
        out_dim (int): Output dimension of the module.
        kernel_size (int): Conv kernel size for depthwise convolution in convolution module
        stride (int): Downsampling factor in time dimension.
    rB   r2   d_modelout_dimr9   r:   c                    st   t    || _|| _|| _|| _td| j| j | _tj	||||| j|d| _
tj	||ddddd| _|   d S )Nr   r>   r1   )r   r	   r   r   r9   r:   maxr;   r   r_   dw_convpw_convr   )r   r   r   r9   r:   r   r   r   r	   N  s$   
	zTimeReductionModule.__init__Nc                 C   s   | dd}|d ur| |dd}| |}| |}| dd}| \}}}|d urd|d urd|d d d d | jd d | jf }|d d d d | jf }|d}tj	j
|ddd|| f}|||fS )Nr1   r2   g        r3   r   )rq   rb   masked_fillrp   r   r   r   r:   r   r   r   r    )r   r#   att_maskpad_maskBTDLr   r   r   r*   f  s   

"

zTimeReductionModule.forwardc                 C   s   | j d }| jd }t 9 tjj| jj| | tjj| jj	| | tjj| j
j| | tjj| j
j	| | W d    d S 1 sJw   Y  d S )Nr{   )r9   r   r   r|   r   r}   r~   r   r   r   r   )r   r   r   r   r   r   r   y  s   


"z$TimeReductionModule.reset_parameters)rB   r2   )NN)	r+   r,   r-   r.   rO   r	   r*   r   r/   r   r   r   r   r   C  s
     

r   c                       s8   e Zd ZdZd
dededef fddZdd	 Z  ZS )SubsamplingReductionModulez/Downsamples the audio signal in time dimension.r2   	reductionr   reduction_factorc              	      s   t    |dv sJ || _|| _tt|d| _|dkr4tj	|d| _
d| _| j
j| _| j
j| _d S |dkrHtd||||t dd| _
d S d S )	N)poolingr@   r2   r   )r9   r   r@   F)rh   r
   r   r   ri   rj   rS   )r   r	   r   r   rO   rP   rQ   rR   r   	MaxPool1dreduction_encr;   r9   r:   r0   r   )r   r   r   r   r   r   r   r	     s*   

z#SubsamplingReductionModule.__init__c                 C   sn   | j dkr| j||d\}}||fS t|dd}t|| j| j| jd| jd}| |}t|dd}||fS )zFShapes:
            - x: [B, T, C]
            - lengths: [B]
        r@   )r#   r$   r1   r2   FrG   )	r   r   r   rq   rc   r;   r9   r:   rR   )r   r#   r$   r   r   r   r*     s   

z"SubsamplingReductionModule.forward)r2   )	r+   r,   r-   r.   strrO   r	   r*   r/   r   r   r   r   r     s    r   )r1   )rP   r   torch.nnr   r   2nemo.collections.asr.parts.submodules.causal_convsr   r   
nemo.utilsr   Moduler   r0   rc   r   r   r   r   r   r   <module>   s   %   
{A