o
    wit                  
   @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dlm	Z	 G dd dejj
ZG dd dejj
Zdd
dZG dd dej
ZG dd dej
Zdd Zdejdededeeef fddZG dd dejZdS )    N)	LayerNorm)CausalConv1DCausalConv2D)loggingc                       s:   e Zd ZdZd fdd	Zdd Zdd Zd	d
 Z  ZS )StackingSubsamplingau  Stacking subsampling which simply stacks consecutive frames to reduce the sampling rate
    Args:
        subsampling_factor (int): The subsampling factor
        feat_in (int): size of the input features
        feat_out (int): size of the output features
        norm (bool): whether to use an MLP layer after the stacking along with normalization. default is False.
    Fc                    sD   t t|   || _tj|| || _|rt|| _	d S d | _	d S N)
superr   __init__subsampling_factortorchnnLinearproj_outr   pre_norm)selfr
   feat_infeat_outnorm	__class__ n/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/asr/parts/submodules/subsampling.pyr	   "   s   
zStackingSubsampling.__init__c                 C   s   | j S r   r
   r   r   r   r   get_sampling_frames+   s   z'StackingSubsampling.get_sampling_framesc                 C   s   dS )Nr   r   r   r   r   r   get_streaming_cache_size.   s   z,StackingSubsampling.get_streaming_cache_sizec                 C   s   |  \}}}| j|| j  | j }tjj|ddd|f}| jd ur(| |}|  \}}}t|||| j || j f}| |}tj	|| | jdd}||fS )Nr   floor)rounding_mode)
sizer
   r   r   
functionalpadr   reshaper   div)r   xlengthsbthpad_size_r   r   r   forward1   s   


zStackingSubsampling.forward)F)	__name__
__module____qualname____doc__r	   r   r   r*   __classcell__r   r   r   r   r      s    	r   c                       sr   e Zd ZdZde df fdd	Zdd Zdd	 Zd
d Z	dd Z
dd Zdd Zdd ZdefddZ  ZS )ConvSubsamplinga  Convolutional subsampling which supports VGGNet and striding approach introduced in:
    VGGNet Subsampling: Transformer-transducer: end-to-end speech recognition with self-attention (https://arxiv.org/pdf/1910.12977.pdf)
    Striding Subsampling: "Speech-Transformer: A No-Recurrence Sequence-to-Sequence Model for Speech Recognition" by Linhao Dong et al. (https://ieeexplore.ieee.org/document/8462506)
    Args:
        subsampling (str): The subsampling technique from {"vggnet", "striding", "dw-striding"}
        subsampling_factor (int): The subsampling factor which should be a power of 2
        subsampling_conv_chunking_factor (int): Input chunking factor which can be -1 (no chunking)
        1 (auto) or a power of 2. Default is 1
        feat_in (int): size of the input features
        feat_out (int): size of the output features
        conv_channels (int): Number of channels for the convolution layers.
        activation (Module): activation function, default is nn.ReLU()
       Fc	                    s  t t|   || _|| _|| _|| _|d dkrtdtt	
|d| _|| _|| _|dkr>|dkr>|d dkr>td|| _d}	g }
|dkrd| _d| _d| _d| _d| _t| jD ];}|
tjj|	|d	ddd
 |
| |
tjj||d	ddd
 |
| |
tjj| j| j| j| jd |}	q]n(|dkrDd| _d	| _d| _| jr| jd | _| jd | _|d | _n| jd d | _| jd d | _d| _| jr|
t|	|| j| jd d
 n|
tjj|	|| j| j| jd
 |}	|
| t| jd D ]?}| jr|
t|	|	| j| jd |	d n|
tjj|	|	| j| j| j|	d |
tjj|	|ddddd |
| |}	qn|dkrd| _d	| _d| _| jrh| jd | _| jd | _|d | _n| jd d | _| jd d | _d| _t| jD ].}| jr|
t|	|| j| jd d
 n|
tjj|	|| j| j| jd
 |
| |}	qn|dkr3|}	d| _d| _d| _| jr| jd | _| jd | _|d | _n| jd d | _| jd d | _d| _t| jD ]B}| jr|
t|	| j|d kr|n|| j| jd d
 n|
tjj|	| j|d kr|n|| j| j| jd
 |
| |}	qn|dkr|}	d| _d| _d| _| jd d | _| jd d | _|
tjj|	|	| j| j| j|	dtjj|	| jdkro|n|dddddg |}	|
| t| jd D ]2}|
tjj|	|	| j| j| j|	dtjj|	| j|d kr|n|dddddg |
| |}	qntd| d|dv rtj|tj d}t!|| j| j | j| j| j| jd}tj"|t| || _#d| _$n|dv rd | _#d| _$ntd| dt%|
 | _&d S )N   r   z*Sampling factor should be a multiply of 2!r1   Asubsampling_conv_chunking_factor should be -1, 1, or a power of 2vggnetT   )in_channelsout_channelskernel_sizestridepadding)r9   r:   r;   	ceil_modedw_stridingFr7   r8   r9   r:   r;   groupsstridingstriding_conv1d   dw_striding_conv1dzNot valid sub-sampling: !)r5   r=   r@   dtyper$   all_paddingsr9   r:   r<   
repeat_num)rA   rC   )'r   r0   r	   _subsampling_conv_channels_feat_in	_feat_out
ValueErrorintmathlog_sampling_numr
   	is_causal subsampling_conv_chunking_factor_stride_kernel_size
_ceil_mode_left_padding_right_paddingrangeappendr   r   Conv2d	MaxPool2d_max_cache_lenr   r   Conv1dextendtensorfloatcalc_lengthr   outconv2d_subsamplingMaskedConvSequentialconv)r   subsamplingr
   r   r   conv_channelsrT   
activationrS   r7   layersi	in_length
out_lengthr   r   r   r	   M   s  





	



%

	


	





zConvSubsampling.__init__c                 C   s
   d| j gS )Nr1   r   r   r   r   r   r   {  s   
z#ConvSubsampling.get_sampling_framesc                 C   s   d| j d gS )Nr   r1   r   r   r   r   r   r   ~  s   z(ConvSubsampling.get_streaming_cache_sizec                 C   s8  t || j| j | j| j| j| jd}| js|dd}| j	dkrn| jrn| j	dkr@d| j
 | j | j }t||kr=d}nd}nd}|re| ||\}}}|sd| jdkr\| |}|}n| ||\}}n| ||\}}n| |\}}| jr| \}}}	}
| |dd||	d}||fS |dd}||fS )	N)rH   r9   r:   r<   rI   r1   r2   r3           TFr=   )rc   rX   rY   rV   rU   rW   rR   re   	transposerT   rK   r   numelconv_split_by_batchrJ   conv_split_by_channelrg   r   rd   r!   )r   r#   r$   out_lengthsx_ceilneed_to_splitsuccessr%   cr&   fr   r   r   r*     sB   




zConvSubsampling.forwardc                 C   sl  | j dkrt  d| j }| jd d }| jd }tjj| jd j	| | tjj| jd j
| | tdt| jdD ]>}tjj| j| j	| | tjj| j| j
| | tjj| j|d  j	| | tjj| j|d  j
| | q@| j| j | j d }tjj| jj	| | tjj| jj
| | W d    d S 1 sw   Y  d S d S )Nr=         ?r2         r   r6   r1   )rJ   r   no_gradrV   rK   r   inituniform_rg   weightbiasrZ   lenrM   rL   rR   rd   )r   scaledw_maxpw_maxidxfc_scaler   r   r   reset_parameters  s"   



 ""z ConvSubsampling.reset_parametersc           
         s
  |  ^}}|dkr||dfS  jdkr  j}td|  n%d j  j  j }ttt	
|| d}d| }td|  || }|dkrR||dfS td|   fd	d
tt	||dt	||dD }	t	dd
 |	D t	dd
 |	D dfS )z:Tries to split input by batch, run conv and concat resultsr1   F$using manually set chunking factor: ro   r2    using auto set chunking factor: r   z)conv subsampling: using split batch size c                    s   g | ]
\}}  ||qS r   rg   ).0chunklnr   r   r   
<listcomp>  s    
z7ConvSubsampling.conv_split_by_batch.<locals>.<listcomp>c                 S      g | ]}|d  qS )r   r   r   ar   r   r   r         c                 S   r   r1   r   r   r   r   r   r     r   T)r   rT   r   debugrK   rU   rP   ceilrQ   r   rq   zipsplitcat)
r   r#   r$   r%   r)   cfru   pnew_batch_sizeansr   r   r   rr     s*   



*z#ConvSubsampling.conv_split_by_batchc           	   	      sf  | d}jd |}jd |}tjd D ] | \}}}}jdkr5j}td|  nt	t
t|d d}d| }td|  t|| }|dkrdtd| d d}t|| }|dkrytd| d	 d}td
| d|  j d d  ||}t fddt||dD d}j d d  |}q|S )zGFor dw convs, tries to split input by time, run conv and concat resultsr   r1   r   ro   r2   r   zchunking factor z, is too high; splitting down to one channel.z- is too high; splitting down to one timestep.z(conv dw subsampling: using split C size z and split T size r6   c                    s"   g | ]}j  d  d   |qS )r6   r   )r   r   rl   r   r   r   r     s   " z9ConvSubsampling.conv_split_by_channel.<locals>.<listcomp>   )	unsqueezerg   rZ   rR   r   rT   r   r   rP   r   rQ   r   rq   rO   warningchannel_chunked_convr   r   )	r   r#   r)   rx   r&   r   r   new_cnew_tr   r   r   rs     s0   

&z%ConvSubsampling.conv_split_by_channelc           	   	   C   s  d}g }t ||dD ]x}| d }| jrTtjj|| jd | jd | jd | jd fd}tjj	||j
||| ddddddf |j|||  | jd|d}n&tjj	||j
||| ddddddf |j|||  | j| j|d}|| ||7 }qt |dS )z$Performs channel chunked convolutionr   r1   )r    N)r   r:   r;   r?   )r   r   r   rS   r   r   r    rV   rU   conv2dr   r   rX   r[   r   )	r   rg   
chunk_sizer#   ind
out_chunksr   stepch_outr   r   r   r     s6   $$	$

z$ConvSubsampling.channel_chunked_convrT   c                 C   s.   |dkr|dkr|d dkrt d|| _d S )Nr3   r1   r2   r   r4   )rN   rT   )r   rT   r   r   r   'change_subsampling_conv_chunking_factor6  s
   
z7ConvSubsampling.change_subsampling_conv_chunking_factor)r+   r,   r-   r.   r   ReLUr	   r   r   r*   r   rr   rs   r   rO   r   r/   r   r   r   r   r0   >   s       05 '"r0   r1   c           	      C   s`   || }d}t |D ]}t| jtjd| || } |r#t| } q
t| } q
| jtjdS )zZCalculates the output length of a Tensor passed through a convolution or max pooling layerrz   rE   )rZ   r   r"   torb   r   r   rO   )	r$   rH   r9   r:   r<   rI   add_padonerl   r   r   r   rc   @  s   rc   c                	       sF   e Zd ZdZddedededef fdd	ZdddZdd Z  ZS )TimeReductionModulea  
    Squeezeformer Time Reduction procedure. Downsamples the audio by `stride` in the time dimension.

    Args:
        d_model (int): input dimension of MultiheadAttentionMechanism and PositionwiseFeedForward
        out_dim (int): Output dimension of the module.
        kernel_size (int): Conv kernel size for depthwise convolution in convolution module
        stride (int): Downsampling factor in time dimension.
    rB   r2   d_modelout_dimr9   r:   c                    st   t    || _|| _|| _|| _td| j| j | _tj	||||| j|d| _
tj	||ddddd| _|   d S )Nr   r>   r1   )r   r	   r   r   r9   r:   maxr;   r   r_   dw_convpw_convr   )r   r   r   r9   r:   r   r   r   r	   X  s.   
		zTimeReductionModule.__init__Nc                 C   s   | dd}|d ur| |dd}| |}| |}| dd}| \}}}|d urd|d urd|d d d d | jd d | jf }|d d d d | jf }|d}tj	j
|ddd|| f}|||fS )Nr1   r2   g        r3   r   )rp   rb   masked_fillr   r   r   r   r:   r   r   r   r    )r   r#   att_maskpad_maskBTDLr   r   r   r*   u  s   

"

zTimeReductionModule.forwardc                 C   s   | j d }| jd }t 9 tjj| jj| | tjj| jj	| | tjj| j
j| | tjj| j
j	| | W d    d S 1 sJw   Y  d S )Nr{   )r9   r   r   r|   r   r}   r~   r   r   r   r   )r   r   r   r   r   r   r     s   


"z$TimeReductionModule.reset_parameters)rB   r2   )NN)	r+   r,   r-   r.   rO   r	   r*   r   r/   r   r   r   r   r   M  s
     

r   c                       s8   e Zd ZdZd
dededef fddZdd	 Z  ZS )SubsamplingReductionModulez/Downsamples the audio signal in time dimension.r2   	reductionr   reduction_factorc              	      s   t    |dv sJ || _|| _tt|d| _|dkr4tj	|d| _
d| _| j
j| _| j
j| _d S |dkrHtd||||t dd| _
d S d S )	N)poolingr@   r2   r   )r9   r   r@   F)rh   r
   r   r   ri   rj   rS   )r   r	   r   r   rO   rP   rQ   rR   r   	MaxPool1dreduction_encr;   r9   r:   r0   r   )r   r   r   r   r   r   r   r	     s*   

z#SubsamplingReductionModule.__init__c                 C   sn   | j dkr| j||d\}}||fS t|dd}t|| j| j| jd| jd}| |}t|dd}||fS )z>Shapes:
        - x: [B, T, C]
        - lengths: [B]
        r@   )r#   r$   r1   r2   FrG   )	r   r   r   rp   rc   r;   r9   r:   rR   )r   r#   r$   r   r   r   r*     s   

z"SubsamplingReductionModule.forward)r2   )	r+   r,   r-   r.   strrO   r	   r*   r/   r   r   r   r   r     s    r   c                 C   s,   | j \}}}}|d||||}| | S )z,Apply mask to tensor with channel dimension.r1   )shaper   expand)ra   mask
batch_sizechannelstimefeaturesexpanded_maskr   r   r   apply_channel_mask  s   r   
input_sizer9   r:   r;   c                 C   s    | |d  |d  | | d S )z.Calculate exact output size after convolution.r   r1   r   )r   r9   r:   r;   r   r   r   calculate_conv_output_size  s    r   c                   @   s   e Zd Zdd Zdd ZdS )rf   c                 C   s   | d}|  }| || }t| D ];\}}t||}||}t|drR|jdkrRt|dr:|j	|j
f}n|j}t||jd |jd |}| || }qt||}|| fS )Nr1   r:   )r1   r1   rX   r   )r   clonerb   _create_masklong	enumerater   hasattrr:   rX   rY   r;   r   r9   )r   r#   r$   current_lengthsr   rl   layerr;   r   r   r   r*     s"   



zMaskedConvSequential.forwardc                 C   sL   |j \}}}}tj||jd|||dk }|d||||jS )z'Create mask matching tensor dimensions.)devicer1   r3   )r   r   aranger   r   r   r   rF   )r   ra   r$   r   r   r   r   	time_maskr   r   r   r     s   "z!MaskedConvSequential._create_maskN)r+   r,   r-   r*   r   r   r   r   r   rf     s    rf   r   )rP   r   torch.nnr   r   2nemo.collections.asr.parts.submodules.causal_convsr   r   
nemo.utilsr   Moduler   r0   rc   r   r   r   TensorrO   tupler   
Sequentialrf   r   r   r   r   <module>   s$   %    
F4$	