o
    i                     @   s2  d dl Z d dlmZmZmZ d dlZdgZdejdejfddZ	d#dejd	ejd
ejdejdejdeej deej fddZ	de
dejjfddZdee
 dedeee  fddZdee dee dedejdejf
ddZG dd dejjZG dd dejjZG d d! d!ejjZG d"d deZdS )$    N)ListOptionalTupleEmformerlengthsreturnc                 C   sF   | j d }tt|  }tj|| j| jd||| 	dk}|S )Nr   )devicedtype   )
shapeinttorchmaxitemaranger   r	   expand	unsqueeze)r   
batch_size
max_lengthpadding_mask r   X/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/torchaudio/models/emformer.py_lengths_to_padding_mask
   s   
r   	utteranceright_contextsummarymemsleft_context_keyc                 C   s   | d|  d | d }| d}|dkrd }|S |t|  | d }	|d ur3| dnd}
|| d |	 |
 }t|d}|S )Nr   r
   )r   )sizer   r   r   r   )r   r   r   r   r   r   TBr   right_context_blocks_lengthleft_context_blocks_lengthklengthsr   r   r   _gen_padding_mask   s   

r$   
activationc                 C   sD   | dkr	t j S | dkrt j S | dkrt j S td|  )NrelugelusiluzUnsupported activation )r   nnReLUGELUSiLU
ValueError)r%   r   r   r   _get_activation_module'   s   


r.   weight_init_scale_strategy
num_layersc                 C   s\   | d u rdd t |D S | dkrdd t |D S | dkr'dd t |D S td|  )Nc                 S   s   g | ]}d qS Nr   ).0_r   r   r   
<listcomp>4   s    z*_get_weight_init_gains.<locals>.<listcomp>	depthwisec                 S   s   g | ]}d t |d  qS )      ?r
   mathsqrtr2   	layer_idxr   r   r   r4   6   s    constantc                 S   s   g | ]	}d t d qS )r6      r7   r:   r   r   r   r4   8   s    z-Unsupported weight_init_scale_strategy value )ranger-   )r/   r0   r   r   r   _get_weight_init_gains2   s   r?   
col_widthscol_masknum_rowsr   c                    s@   t | t |krtd fddt| |D }tj|ddS )Nz0Length of col_widths must match that of col_maskc                    s4   g | ]\}}|rt j| d nt j| d qS )r   )r   oneszeros)r2   	col_widthis_ones_colr   rB   r   r   r4   C   s    z-_gen_attention_mask_block.<locals>.<listcomp>r
   dim)lenr-   zipr   cat)r@   rA   rB   r   
mask_blockr   rH   r   _gen_attention_mask_block=   s   rO   c                       s  e Zd ZdZ				d$dededed	ee d
edef fddZde	j
de	j
dee	j
e	j
f fddZde	j
de	j
dee	j
 de	j
fddZ		d%de	j
de	j
de	j
de	j
de	j
de	j
dee	j
 dee	j
 dee	j
e	j
e	j
e	j
f fddZde	j
de	j
de	j
de	j
de	j
de	j
dee	j
e	j
f fd d!Ze	jjde	j
de	j
de	j
de	j
de	j
de	j
de	j
dee	j
e	j
e	j
e	j
f fd"d#Z  ZS )&_EmformerAttentiona_  Emformer layer attention module.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Emformer layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        weight_init_gain (float or None, optional): scale factor to apply when initializing
            attention module parameters. (Default: ``None``)
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
            NF    ח	input_dim	num_headsdropoutweight_init_gaintanh_on_memnegative_infc                    s   t    || dkrtd| d| d|| _|| _|| _|| _|| _| j| j d | _t	j
j|d| dd| _t	j
j||dd| _t	j
j||dd| _|rht	j
jj| jj|d	 t	j
jj| jj|d	 d S d S )
Nr   zinput_dim (z") is not a multiple of num_heads (z).g      r=   T)bias)gain)super__init__r-   rS   rT   rU   rW   rX   scalingr   r)   Linearemb_to_key_valueemb_to_queryout_projinitxavier_uniform_weight)selfrS   rT   rU   rV   rW   rX   	__class__r   r   r\   Y   s    
	z_EmformerAttention.__init__inputr   r   c           
      C   sX   |j \}}}|dd }|d ||  }t||g}| |jddd\}}	||	fS )Nr   r
   r=   chunksrJ   )r   r   r   rM   r_   chunk)
re   rh   r   r   r3   summary_lengthright_ctx_utterance_blockmems_right_ctx_utterance_blockkeyvaluer   r   r   _gen_key_valuew   s   z!_EmformerAttention._gen_key_valueattention_weightsattention_maskr   c                 C   s   |  }||d| j}|d}|d| j }|d urC||| j|d}||ddtj	| j}||| j |d}tj
jj|dd|}tj
jj|t | j| jdS )Nr   r
   r=   rI   )ptraining)floatmasked_fillr   rX   r   rT   viewtor   boolr)   
functionalsoftmaxtype_asrU   rv   )re   rr   rs   r   attention_weights_floatr   r    attention_probsr   r   r   _gen_attention_probs   s   
z'_EmformerAttention._gen_attention_probsr   r   r   r   r   left_context_valc	                    s   | d | d| d | d }	t|||g}
t|||gjddd\}}|d ur{|d ur{|	t|  | d }t|d | d|  ||| d| d  g}t|d | d|  ||| d| d  g} fdd|
||fD \}}}t|j	 |
dd}t||||||}|||}t||}|j j |	jj fkrtd|
dd |	 j}|}| d}|d |	|  }||	| d  }jrt|}ntj|dd	d
}||||fS )Nr
   r   r=   ri   c                    s4   g | ]}|  d  j jj ddqS )rt   r   r
   )
contiguousry   rT   rS   	transpose)r2   tensorr    re   r   r   r4      s    &z4_EmformerAttention._forward_impl.<locals>.<listcomp>z+Computed attention has incorrect dimensionsi
   )minr   )r   r`   r   rM   r_   rk   r   r   bmmr]   r   r$   r   r   rT   rS   AssertionErrorr   ry   ra   rW   tanhclamp)re   r   r   r   r   r   rs   r   r   r   queryro   rp   r!   reshaped_queryreshaped_keyreshaped_valuerr   r   r   	attentionoutput_right_context_memsrl   output_right_contextoutput_memsr   r   r   _forward_impl   sP   
$	


z _EmformerAttention._forward_implc           
      C   s,   |  ||||||\}}}	}	||dd fS )ac  Forward pass for training.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        S: number of summary elements;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            attention_mask (torch.Tensor): attention mask for underlying attention module.

        Returns:
            (Tensor, Tensor):
                Tensor
                    output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        Nrt   )r   )
re   r   r   r   r   r   rs   outputr   r3   r   r   r   forward   s   "z_EmformerAttention.forwardc              
   C   s   | d| d | d }| d| d | d | d }	t||	jtj|jd}
d|
dd| df< | j||||||
||d\}}}}|||| d| d d || d| d d fS )a  Forward pass for inference.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        S: number of summary elements;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            left_context_key (torch.Tensor): left context attention key computed from preceding invocation.
            left_context_val (torch.Tensor): left context attention value computed from preceding invocation.

        Returns:
            (Tensor, Tensor, Tensor, and Tensor):
                Tensor
                    output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
                Tensor
                    attention key computed for left context and utterance.
                Tensor
                    attention value computed for left context and utterance.
        r   r	   r   Trt   N)r   r   )r   r   rE   rz   r{   r   r   )re   r   r   r   r   r   r   r   	query_dimkey_dimrs   r   r   ro   rp   r   r   r   infer   s&   )(z_EmformerAttention.infer)rQ   NFrR   )NN)__name__
__module____qualname____doc__r   rw   r   r{   r\   r   Tensorr   rq   r   r   r   jitexportr   __classcell__r   r   rf   r   rP   L   s    &
	

I
%	rP   c                       sh  e Zd ZdZ							d6ded	ed
ededededededee dedef fddZ	dedee
j dee
j fddZdee
j dee
je
je
jf fddZde
jde
jded e
jdee
j dee
j fd!d"Zd#e
jd$e
jd%e
jde
jfd&d'Zd$e
jd%e
jdee
je
jf fd(d)Zd#e
jd$e
jd%e
jdee
je
jf fd*d+Zd$e
jd,e
jd%e
jd e
jd-ee
j dee
je
jf fd.d/Zd$e
jd,e
jd%e
jd e
jdeee
j  dee
je
jee
j f fd0d1Zd$e
jd,e
jd%e
jd e
jd-e
jdee
je
je
jf fd2d3Ze
jjd$e
jd,e
jd%e
jdeee
j  d e
jdee
je
jee
j e
jf fd4d5Z  ZS )7_EmformerLayera$  Emformer layer that constitutes Emformer.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads.
        ffn_dim: (int): hidden layer dimension of feedforward network.
        segment_length (int): length of each input segment.
        dropout (float, optional): dropout probability. (Default: 0.0)
        activation (str, optional): activation function to use in feedforward network.
            Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        left_context_length (int, optional): length of left context. (Default: 0)
        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        weight_init_gain (float or None, optional): scale factor to apply when initializing
            attention module parameters. (Default: ``None``)
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
    rQ   r&   r   NFrR   rS   rT   ffn_dimsegment_lengthrU   r%   left_context_lengthmax_memory_sizerV   rW   rX   c              
      s   t    t||||	|
|d| _tj|| _tjj||dd| _	t
|}tjtj|tj|||tj|tj||tj|| _tj|| _tj|| _|| _|| _|| _|| _|dk| _d S )N)rS   rT   rU   rV   rW   rX   Tkernel_sizestride	ceil_moder   )r[   r\   rP   r   r   r)   DropoutrU   	AvgPool1d	memory_opr.   
Sequential	LayerNormr^   pos_fflayer_norm_inputlayer_norm_outputr   r   r   rS   use_mem)re   rS   rT   r   r   rU   r%   r   r   rV   rW   rX   activation_modulerf   r   r   r\   R  s6   



z_EmformerLayer.__init__r   r   r   c                 C   sb   t j| j|| j|d}t j| j|| j|d}t j| j|| j|d}t jd|t j|d}||||gS )NrC   r
   r   )r   rE   r   rS   r   int32)re   r   r   empty_memoryr   r   past_lengthr   r   r   _init_state  s
   z_EmformerLayer._init_statestatec                 C   s   |d d d   }t| j|}t| jt|| j }|d | j| d  }|d | j| d  }|d | j| d  }|||fS )N   r   r
   r=   )r   r   r   r   r8   ceilr   )re   r   r   past_left_context_lengthpast_mem_lengthpre_memslc_keylc_valr   r   r   _unpack_state  s   
z_EmformerLayer._unpack_statenext_knext_vupdate_lengthr   c                 C   s   t |d |g}t |d |g}t |d |g| j d  |d< ||jd | j d  |d< ||jd | j d  |d< |d | |d< |S )Nr
   r=   r   r   )r   rM   r   r   r   )re   r   r   r   r   r   new_knew_vr   r   r   _pack_state  s   "z_EmformerLayer._pack_state	rc_outputr   r   c                 C   s4   |  |t||g }| || }| |}|S r1   )rU   r   rM   r   r   )re   r   r   r   resultr   r   r   _process_attention_output  s   
z(_EmformerLayer._process_attention_outputc                 C   s8   |  t||g}||dd  |d |d fS Nr   )r   r   rM   r   )re   r   r   r   r   r   r   _apply_pre_attention_layer_norm  s   z._EmformerLayer._apply_pre_attention_layer_normc                 C   s2   |  |||}||dd  |d |d fS r   )r   r   )re   r   r   r   r   r   r   _apply_post_attention_ffn  s   $z(_EmformerLayer._apply_post_attention_ffnr   rs   c           	      C   sp   |d u rt d| jr| |dddddd}ntdj|j|jd}| j	||||||d\}}||fS )Nz;attention_mask must be not None when for_inference is Falser
   r=   r   r   )r   r   r   r   r   rs   )
r-   r   r   permuter   emptyrz   r	   r   r   )	re   r   r   r   r   rs   r   r   next_mr   r   r   _apply_attention_forward  s    
z'_EmformerLayer._apply_attention_forwardc              	   C   s   |d u r| j |d|jd}| |\}}}| jr0| |dddddd}	|	d d }	ntdj	|j
|jd}	| jj||||	|||d\}
}}}| |||d||}|
||fS )Nr
   rC   r=   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   rz   r	   r   r   r   )re   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _apply_attention_infer  s$   	
z%_EmformerLayer._apply_attention_inferc                 C   sB   |  ||\}}| |||||\}}	| |||\}
}|
||	fS )a1  Forward pass for training.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            attention_mask (torch.Tensor): attention mask for underlying attention module.

        Returns:
            (Tensor, Tensor, Tensor):
                Tensor
                    encoded utterance frames, with shape `(T, B, D)`.
                Tensor
                    updated right context frames, with shape `(R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        )r   r   r   )re   r   r   r   r   rs   layer_norm_utterancelayer_norm_right_contextr   r   output_utterancer   r   r   r   r     s   
$
z_EmformerLayer.forwardc                 C   sF   |  ||\}}| |||||\}}	}
| |||\}}|||
|	fS )a2  Forward pass for inference.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            state (List[torch.Tensor] or None): list of tensors representing layer internal state
                generated in preceding invocation of ``infer``.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.

        Returns:
            (Tensor, Tensor, List[torch.Tensor], Tensor):
                Tensor
                    encoded utterance frames, with shape `(T, B, D)`.
                Tensor
                    updated right context frames, with shape `(R, B, D)`.
                List[Tensor]
                    list of tensors representing layer internal state
                    generated in current invocation of ``infer``.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        )r   r   r   )re   r   r   r   r   r   r   r   r   r   output_stater   r   r   r   r   r     s   
)

z_EmformerLayer.infer)rQ   r&   r   r   NFrR   )r   r   r   r   r   rw   strr   r{   r\   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rf   r   r   ?  s    	
".(	


	



/r   c                       s   e Zd Z			ddejjdedededef
 fddZd	ejd
ejfddZ	deded
e
e fddZd	ejd
ejfddZd	ejdejd
eejejf fddZejj	dd	ejdejdee
e
ej   d
eejeje
e
ej  f fddZ  ZS )_EmformerImplr   emformer_layersr   r   right_context_lengthr   c                    sJ   t    |dk| _tjj||dd| _|| _|| _|| _	|| _
|| _d S )Nr   Tr   )r[   r\   r   r   r)   r   r   r   r   r   r   r   )re   r   r   r   r   r   rf   r   r   r\   P  s   


z_EmformerImpl.__init__rh   r   c                 C   s   |j d }t|| j | j }g }t|d D ]}|d | j }|| j }||||  q|||| j d   t|S Nr   r
   )	r   r8   r   r   r   r>   appendr   rM   )re   rh   r   num_segsright_context_blocksseg_idxstartendr   r   r   _gen_right_contextf  s   


z _EmformerImpl._gen_right_contextr   utterance_lengthc              
   C   s   t || j }| j}| j}|| }|| }t|| j | d}t|d | j |}	| j| }
| jrUt|| j d}|d }||| || |||
| ||	| ||	 g	}|S |||
| ||	| ||	 g}|S r   )	r8   r   r   r   r   r   r   r   r   )re   r   r   r   rclcrc_startrc_end	seg_startseg_end	rc_lengthm_start
mem_lengthr@   r   r   r   _gen_attention_mask_col_widthsq  s<   
	z,_EmformerImpl._gen_attention_mask_col_widthsc              	   C   s*  | d}t|| j }g }g }g }| jr0d}dd t|D }dd t|D }	|||g}
nd}dd t|D }d }	||g}
t|D ]=}| ||}t||| j|j	}|
| t||t| j||| j  |j	}|
| |	d urt||	d|j	}|
| qEdtd	d |
D  tj}|S )
Nr   	   c                 S      g | ]}|d v qS ))r
         r   r2   idxr   r   r   r4         z5_EmformerImpl._gen_attention_mask.<locals>.<listcomp>c                 S   r   ))r   r   r   r   r   r   r   r4     r      c                 S   r   ))r
   r   r   r   r   r   r   r4     r   r
   c                 S   s   g | ]}t |qS r   )r   rM   )r2   maskr   r   r   r4     s    )r   r8   r   r   r   r>   r   rO   r   r   r   r   r   rM   rz   r{   )re   rh   r   r   rc_mask
query_masksummary_masknum_colsrc_q_cols_masks_cols_maskmasks_to_concatr   r@   rc_mask_blockquery_mask_blocksummary_mask_blockrs   r   r   r   _gen_attention_mask  sH   


	
 z!_EmformerImpl._gen_attention_maskr   c           	      C   s   | ddd}| |}|d|d| j  }| |}| jr3| | ddd ddddd ntdj	|j
|jd}|}| jD ]}||||||\}}}qD| ddd|fS )aG  Forward pass for training and non-streaming inference.

        B: batch size;
        T: max number of input frames in batch;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): utterance frames right-padded with right context frames, with
                shape `(B, T + right_context_length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid utterance frames for i-th batch element in ``input``.

        Returns:
            (Tensor, Tensor):
                Tensor
                    output frames, with shape `(B, T, D)`.
                Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
        r
   r   r=   Nrt   r   )r   r   r   r   r  r   r   r   r   rz   r	   r   r   )	re   rh   r   r   r   rs   r   r   layerr   r   r   r     s   

(
z_EmformerImpl.forwardNstatesc                 C   s$  | d| j| j krtd| j| j  d| d d|ddd}| d| j }||d }|d| }tj|| j dd}| jrT| |ddddddnt	dj
|j|jd	}|}	g }
t| jD ]\}}||	|||du rxdn|| |\}	}}}|
| qi|	ddd||
fS )
a  Forward pass for streaming inference.

        B: batch size;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): utterance frames right-padded with right context frames, with
                shape `(B, segment_length + right_context_length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.
            states (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing internal state generated in preceding invocation of ``infer``. (Default: ``None``)

        Returns:
            (Tensor, Tensor, List[List[Tensor]]):
                Tensor
                    output frames, with shape `(B, segment_length, D)`.
                Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
                List[List[Tensor]]
                    output states; list of lists of tensors representing internal state
                    generated in current invocation of ``infer``.
        r
   zIPer configured segment_length and right_context_length, expected size of z# for dimension 1 of input, but got .r   r=   N)r   r   )r   r   r   r-   r   r   r   r   r   r   rz   r	   r   	enumerater   r   r   )re   rh   r   r  right_context_start_idxr   r   output_lengthsr   r   output_statesr;   r  r   r   r   r   r     s:   
 z_EmformerImpl.infer)r   r   r   r1   )r   r   r   r   r)   
ModuleListr   r\   r   r   r   r   r  r   r   r   r   r   r   r   r   r   rf   r   r   O  s<    $&0#r   c                       sl   e Zd ZdZ								dded	ed
ededededededededee dedef fddZ	  Z
S )r   a_  Emformer architecture introduced in
    *Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition*
    :cite:`shi2021emformer`.

    See Also:
        * :func:`~torchaudio.models.emformer_rnnt_model`,
          :func:`~torchaudio.models.emformer_rnnt_base`: factory functions.
        * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipelines with pretrained model.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Emformer layer.
        ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        num_layers (int): number of Emformer layers to instantiate.
        segment_length (int): length of each input segment.
        dropout (float, optional): dropout probability. (Default: 0.0)
        activation (str, optional): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        left_context_length (int, optional): length of left context. (Default: 0)
        right_context_length (int, optional): length of right context. (Default: 0)
        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        weight_init_scale_strategy (str or None, optional): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)

    Examples:
        >>> emformer = Emformer(512, 8, 2048, 20, 4, right_context_length=1)
        >>> input = torch.rand(128, 400, 512)  # batch, num_frames, feature_dim
        >>> lengths = torch.randint(1, 200, (128,))  # batch
        >>> output, lengths = emformer(input, lengths)
        >>> input = torch.rand(128, 5, 512)
        >>> lengths = torch.ones(128) * 5
        >>> output, lengths, states = emformer.infer(input, lengths, None)
    rQ   r&   r   r5   FrR   rS   rT   r   r0   r   rU   r%   r   r   r   r/   rW   rX   c                    sV   t ||
tj 	
fddt|D }t j||	d d S )Nc                    s.   g | ]}t  
| 	d qS ))rU   r%   r   r   rV   rW   rX   )r   r:   r%   rU   r   rS   r   r   rX   rT   r   rW   weight_init_gainsr   r   r4   ]  s     z%Emformer.__init__.<locals>.<listcomp>)r   r   r   )r?   r   r)   r  r>   r[   r\   )re   rS   rT   r   r0   r   rU   r%   r   r   r   r/   rW   rX   r   rf   r  r   r\   K  s   

zEmformer.__init__)rQ   r&   r   r   r   r5   FrR   )r   r   r   r   r   rw   r   r   r{   r\   r   r   r   rf   r   r   &  sJ    +	
r1   )r8   typingr   r   r   r   __all__r   r   r$   r   r)   Moduler.   r   rw   r?   r{   r   rO   rP   r   r   r   r   r   r   r   <module>   sT    
"
 t   X