o
    i)                     @   s   d Z ddlmZmZmZmZ ddlZddlmZ ddl	m
Z
mZmZmZ ddlmZ ddlmZmZmZmZ G dd	 d	ejjZdS )
zEncoder for Transducer model.    )AnyDictListTupleN)check_argument_types)build_body_blocksbuild_input_blockbuild_main_parametersbuild_positional_encoding)validate_architecture)TooShortUttErrorcheck_short_uttmake_chunk_maskmake_source_maskc                       s   e Zd ZdZi i fdedeeeef  deeef deeef ddf
 fdd	Z	d
ededefddZ
dedejddfddZdejdejdeejejf fddZ		ddejdejdejdededejfddZ  ZS )EncoderzEncoder module definition.

    Args:
        input_size: Input size.
        body_conf: Encoder body configuration.
        input_conf: Encoder input configuration.
        main_conf: Encoder main configuration.

    
input_size	body_conf
input_conf	main_confreturnNc                    s   t    t s
J t|||\}}tdi |}t||| _t||| _t	|||| _
|| _|d | _|d | _|d | _|d | _dS )zConstruct an Encoder object.dynamic_chunk_trainingshort_chunk_thresholdshort_chunk_sizeleft_chunk_sizeN )super__init__r   r   r	   r   embedr
   pos_encr   encodersoutput_sizer   r   r   r   )selfr   r   r   r   
embed_sizer    main_params	__class__r   Z/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr_transducer/encoder/encoder.pyr   "   s   




zEncoder.__init__size
hop_lengthc                 C   s   | j || S )aP  Return the corresponding number of sample for a given chunk size, in frames.

        Where size is the number of features frames after applying subsampling.

        Args:
            size: Number of frames after subsampling.
            hop_length: Frontend's hop length

        Returns:
            : Number of raw samples

        )r   get_size_before_subsampling)r!   r'   r(   r   r   r&   get_encoder_input_raw_size>   s   z"Encoder.get_encoder_input_raw_sizeleft_contextdevicec                 C   s   | j ||S )zInitialize/Reset encoder streaming cache.

        Args:
            left_context: Number of frames in left context.
            device: Device ID.

        )r   reset_streaming_cache)r!   r+   r,   r   r   r&   r-   M   s   zEncoder.reset_streaming_cachexx_lenc           
      C   s   t | jj|d\}}|r$td|d dd| d |d|t|}| ||\}}| |}| jre|d}t	d|d
 }||| j krP|}n|| j d }t|d|| j|jd}	nd}	| j||||	d	}||d
dfS )a  Encode input sequences.

        Args:
            x: Encoder input features. (B, T_in, F)
            x_len: Encoder input features lengths. (B,)

        Returns:
           x: Encoder outputs. (B, T_out, D_enc)
           x_len: Encoder outputs lenghts. (B,)

           zhas z) frames and is too short for subsampling z(it needs more than z frames), return empty results)r0   )r   r,   N)
chunk_maskr   )r   r   subsampling_factorr'   r   r   r   r   torchrandintitemr   r   r   r   r,   r   eqsum)
r!   r.   r/   short_status
limit_sizemaskr   max_len
chunk_sizer1   r   r   r&   forwardW   sD   


zEncoder.forward    r   processed_framesright_contextc           	      C   s   t |}| ||\}}|dkr,tj||jdd|d}||k}tj||gdd}| j||d}| j	j
|||||d}|dkrP|ddd| ddf }|S )a  Encode input sequences as chunks.

        Args:
            x: Encoder input features. (1, T_in, F)
            x_len: Encoder input features lengths. (1,)
            processed_frames: Number of frames already seen.
            left_context: Number of frames in left context.
            right_context: Number of frames in right context.

        Returns:
           x: Encoder outputs. (B, T_out, D_enc)

        r   )r,   r0   )dim)r+   )r+   r@   N)r   r   r3   aranger,   viewflipcatr   r   chunk_forward)	r!   r.   r/   r?   r+   r@   r:   processed_maskr   r   r   r&   rF      s(   zEncoder.chunk_forward)r>   r   )__name__
__module____qualname____doc__intr   r   strr   r   r*   r3   r,   r-   Tensorr   r=   tensorrF   __classcell__r   r   r$   r&   r      sN    



Ar   )rK   typingr   r   r   r   r3   	typeguardr   'espnet2.asr_transducer.encoder.buildingr   r   r	   r
   )espnet2.asr_transducer.encoder.validationr   espnet2.asr_transducer.utilsr   r   r   r   nnModuler   r   r   r   r&   <module>   s    