o
    }oi6N                     @   s,  d dl Z d dlmZ d dlmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z% dgZ&G dd de eeZ'G dd de'ej(Z)	 e*e'du rej+e'e)d dS dS )    N)OrderedDict)ListOptionalSet)
DictConfig)PositionalEncodingRelPositionalEncoding)SqueezeformerLayer)ConvSubsamplingStackingSubsamplingTimeReductionModule)adapter_utils)	typecheck)
Exportable)AccessMixinadapter_mixins)NeuralModule)AcousticEncodedRepresentationLengthsType
NeuralTypeSpectrogramTypeSqueezeformerEncoderc                .       s  e Zd ZdZd:ddZedd Zedd	 Z	
			
															d;dedededede	dededede	dede
ee  d ed!ed"ed#ed$e	d%ed&ed'ed(ed)e
e d*e
e f, fd+d,Zd-d. Ze d<d/d0Ze d1d2 Zd3efd4d5Zd6d7 Zd=d8d9Z  ZS )>r   a  
    The encoder for ASR model of Squeezeformer.
    Based on this paper:
    'Squeezeformer: An Efficient Transformer for Automatic Speech Recognition' by Sehoon Kim et al.
    https://arxiv.org/abs/2206.00888

    Args:
        feat_in (int): the size of feature channels
        n_layers (int): number of layers of ConformerBlock
        d_model (int): the hidden size of the model
        feat_out (int): the size of the output features
            Defaults to -1 (means feat_out is d_model)
        subsampling (str): the method of subsampling, choices=['vggnet', 'striding', 'dw_striding']
            Defaults to dw_striding.
        subsampling_factor (int): the subsampling factor which should be power of 2
            Defaults to 4.
        subsampling_conv_channels (int): the size of the convolutions in the subsampling module
            Defaults to -1 which would set it to d_model.
        ff_expansion_factor (int): the expansion factor in feed forward layers
            Defaults to 4.
        self_attention_model (str): type of the attention layer and positional encoding
            'rel_pos': relative positional embedding and Transformer-XL
            'abs_pos': absolute positional embedding and Transformer
            default is rel_pos.
        pos_emb_max_len (int): the maximum length of positional embeddings
            Defaulst to 5000
        n_heads (int): number of heads in multi-headed attention layers
            Defaults to 4.
        xscaling (bool): enables scaling the inputs to the multi-headed attention layers by sqrt(d_model)
            Defaults to True.
        untie_biases (bool): whether to not share (untie) the bias weights between layers of Transformer-XL
            Defaults to True.
        conv_kernel_size (int): the size of the convolutions in the convolutional modules
            Defaults to 31.
        conv_norm_type (str): the type of the normalization in the convolutional modules
            Defaults to 'batch_norm'.
        dropout (float): the dropout rate used in all layers except the attention layers
            Defaults to 0.1.
        dropout_emb (float): the dropout rate used for the positional embeddings
            Defaults to 0.1.
        dropout_att (float): the dropout rate used for the attention layer
            Defaults to 0.0.
        adaptive_scale (bool): Whether to scale the inputs to each component by affine `scale` and `bias` layer.
            Or use a fixed scale=1 and bias=0.
        time_reduce_idx (int): Optional integer index of a layer where a time reduction operation will occur.
            All operations beyond this point will only occur at the reduced resolution.
        time_recovery_idx (int): Optional integer index of a layer where the time recovery operation will occur.
            All operations beyond this point will occur at the original resolution (resolution after
            primary downsampling). If no value is provided, assumed to be the last layer.
          c                 C   sF   t |  j}t|| j||}td||f|}t||gS )zs
        Generates input examples for tracing etc.
        Returns:
            A tuple of input examples.
        r   )	next
parametersdevicetorchrandn_feat_intorandinttuple)self	max_batchmax_dimdevinput_exampleinput_example_length r)   f/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/modules/squeezeformer_encoder.pyr'   Y   s   z"SqueezeformerEncoder.input_examplec                 C   "   t tdt ttdt dS )z*Returns definitions of module input ports.BDTr-   audio_signallength)r   r   r   r"   r   r#   r)   r)   r*   input_typesd   
   
z SqueezeformerEncoder.input_typesc                 C   r+   )z+Returns definitions of module output ports.r,   r-   )outputsencoded_lengths)r   r   r   r"   r   r3   r)   r)   r*   output_typesn   r5   z!SqueezeformerEncoder.output_typesdw_striding   rel_posNT     
batch_norm皙?        feat_inn_layersd_modelfeat_outsubsamplingsubsampling_factorsubsampling_conv_channelsff_expansion_factorself_attention_modeln_headsatt_context_sizexscalinguntie_biasespos_emb_max_lenconv_kernel_sizeconv_norm_typedropoutdropout_embdropout_attadaptive_scaletime_reduce_idxtime_recovery_idxc                    s  t    || }|| _|| _|r|| _nddg| _|r#t|| _nd | _|| _|| _	|d ur=|d u r:|d | _
n|| _
| j	d urf| j	dk sL| j
|krTtd| d| j
dk s^| j
|krftd| d|dkrl|}|r|dkr|dkrt|||d| _nt|||||t d	| _| j  nt||| _|| _|s|	d
kr||
 }tt|
|}tt|
|}tj| tj| nd }d }|| _|	d
krt|||| j|d| _n|	dkrd }d }t|||| jd| _ntd|	 dt | _t|D ]}t |||	|
|||||||d}| j!| q d | _"d | _#d | _$| j	d urRt%||ddd| _"t||| _#|	d
krHt|d|d dd| _$n
t|d|d dd| _$t&|| _'|dkro|| jkrot| j|| _(|| _nd | _(|| _| )| j d| _*d | _+d S )Nr9   r   r   z'Time reduce index must lie between [0, )z)Time recovery index must lie between [0, stacking)rG   rB   rE   )rF   rG   rB   rE   conv_channels
activationr<   )rD   dropout_ratemax_lenxscaledropout_rate_embabs_pos)rD   r\   r]   r^   z!Not valid self_attention_model: 'z'!)rD   d_ffrJ   rK   rP   rQ   rR   rT   
pos_bias_u
pos_bias_vrU         )kernel_sizestriderA   T),super__init__rD   r   rL   mathsqrtr^   rU   rV   rW   
ValueErrorr   
pre_encoder
   nnReLUreset_parametersLinear	_feat_out	Parameterr   Tensorinitzeros_rO   r   pos_encr   
ModuleListlayersranger	   appendtime_reduce_layertime_recovery_layertime_reduce_pos_encr   	LayerNormpre_lnout_projset_max_audio_lengthuse_pad_maskinterctc_capture_at_layers)r#   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   ra   d_headrb   rc   ilayer	__class__r)   r*   ri   x   s   



	







zSqueezeformerEncoder.__init__c                 C   s   || _ t|  j}t|  j}tjd| j |d}t| dr#|| _n| j	d|dd | j
||| | jdurB| j||| dS dS )zSSets maximum input length.
        Pre-calculates internal seq_range mask.
        r   )r   	seq_rangeF)
persistentN)max_audio_lengthr   r   r   dtyper   arangehasattrr   register_bufferrw   	extend_per~   )r#   r   r   r   r   r)   r)   r*   r     s   

z)SqueezeformerEncoder.set_max_audio_lengthc                 C   s$   | j |d|jd | j||dS )Nre   )
seq_lengthr   r0   )update_max_seq_lengthsizer   forward_for_export)r#   r1   r2   r)   r)   r*   forward(  s   zSqueezeformerEncoder.forwardc                 C   s  | d}|| jkr| | |d u r"|j| d|tj| jjd}t|dd}t	| j
tjr6| 
|}n| 
||\}}| |\}}| d}| ||}|dd|dg}t||dd}| jd dkrv|j| jd  d}| jd dkr|j| jd d}| }| jr| }nd }g }| |}t| jD ]\}}	| jd ur|| jkr|||||f | j|||d\}}}| |\}
}| jd ur|| jkr|d\}}}}tj |ddd}|  \}}}|d d d |d d f }| |}|| }|	||||d	}| !t"| d
d rO| j#d u r#| j$%di %dg | _#|| j#v rO|}| j&d ur6| &|}| j'd| t|ddd | j'd| |d q| j&d ur[| &|}t|dd}||fS )Nr9   r   r   r   r   re   )diagonal)xatt_maskpad_mask)repeatsdim)r   r   pos_embr   
model_guidinterctccapture_layerszinterctc/layer_output_)nametensorzinterctc/layer_length_)(r   r   r   new_fullr   int32r   r   	transpose
isinstancerm   rn   rq   rw   make_pad_mask	unsqueezerepeatlogical_andrL   triutrilr   r   	enumeratery   r|   rV   r{   r~   r}   rW   poprepeat_interleaveis_access_enabledgetattrr   
access_cfggetr   register_accessible_tensor)r#   r1   r2   r   r   r   r   recovery_activation_cachelthr   _recovery_audio_signalr-   r/   r.   lth_audio_signalr)   r)   r*   r   -  sr   







z'SqueezeformerEncoder.forward_for_exportr   c                 C   s\   t j r t j|gt j|d}t jj|t jjjd | 	 }|| j
kr,| | d S d S )Nr   )op)r   distributedis_initializedr   float32
all_reduceReduceOpMAXintitemr   r   )r#   r   r   global_max_lenr)   r)   r*   r     s   

z*SqueezeformerEncoder.update_max_seq_lengthc                 C   s*   | j d| |dd|dk }|S )zMake masking for padding.Nr   r9   )r   expandr   r   )r#   r   seq_lensmaskr)   r)   r*   r     s   &z"SqueezeformerEncoder.make_pad_maskc                 C   s   | j }|| _ |S N)r   )r#   onr   r)   r)   r*   enable_pad_mask  s   z$SqueezeformerEncoder.enable_pad_mask)r   r   )r9   r:   r;   r9   r;   r<   r;   NTTr=   r>   r?   r@   r@   rA   TNNr   )T)__name__
__module____qualname____doc__r'   propertyr4   r8   r   strr   r   boolfloatri   r   r   r   r   r   r   r   __classcell__r)   r)   r   r*   r   %   s    
3
	
	

  
Vc                       s~   e Zd ZdedefddZdefddZddee d
efddZ	de
e fddZdefddZdee f fddZ  ZS )SqueezeformerEncoderAdapterr   cfgc                 C   s&   |  |}| jD ]}||| qd S r   )_update_adapter_cfg_input_dimry   add_adapter)r#   r   r   conformer_layerr)   r)   r*   r     s   

z'SqueezeformerEncoderAdapter.add_adapterreturnc                 C   s   t dd | jD S )Nc                 S   s   g | ]}|  qS r)   )is_adapter_available).0r   r)   r)   r*   
<listcomp>  s    zDSqueezeformerEncoderAdapter.is_adapter_available.<locals>.<listcomp>)anyry   r3   r)   r)   r*   r     s   z0SqueezeformerEncoderAdapter.is_adapter_availableNTenabledc                 C   s   | j D ]	}|j||d qd S )N)r   r   )ry   set_enabled_adapters)r#   r   r   r   r)   r)   r*   r     s   
z0SqueezeformerEncoderAdapter.set_enabled_adaptersc                 C   s2   t g }| jD ]	}||  qtt|}|S r   )setry   updateget_enabled_adapterssortedlist)r#   namesr   r)   r)   r*   r     s
   
z0SqueezeformerEncoderAdapter.get_enabled_adaptersc                 C   s   t j| || jd}|S )N)
module_dim)r   update_adapter_cfg_input_dimrD   )r#   r   r)   r)   r*   r     s   z9SqueezeformerEncoderAdapter._update_adapter_cfg_input_dimc                    s8   t   }t|dkr| tjtjtjg |  }|S )Nr   )rh   get_accepted_adapter_typeslenset_accepted_adapter_typesr   LINEAR_ADAPTER_CLASSPATHMHA_ADAPTER_CLASSPATHRELMHA_ADAPTER_CLASSPATH)r#   typesr   r)   r*   r     s   
z6SqueezeformerEncoderAdapter.get_accepted_adapter_types)NT)r   r   r   r   dictr   r   r   r   r   r   r   r   r   r   typer   r   r)   r)   r   r*   r     s    r   )
base_classadapter_class),rj   collectionsr   typingr   r   r   r   torch.distributedtorch.nnrn   	omegaconfr   :nemo.collections.asr.parts.submodules.multi_head_attentionr   r   ;nemo.collections.asr.parts.submodules.squeezeformer_modulesr	   1nemo.collections.asr.parts.submodules.subsamplingr
   r   r    nemo.collections.asr.parts.utilsr   nemo.core.classes.commonr   nemo.core.classes.exportabler   nemo.core.classes.mixinsr   r   nemo.core.classes.moduler   nemo.core.neural_typesr   r   r   r   __all__r   AdapterModuleMixinr   get_registered_adapterregister_adapterr)   r)   r)   r*   <module>   s2     z,