o
    }oi                     @   sv   d dl mZ d dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
mZmZmZ d dlmZ dgZG dd deZdS )	    )DictN)ConformerEncoder)NeuralModule	typecheck)ChannelTypeLengthsType
NeuralTypeSpectrogramType)loggingSpectrogramConformerc                       s   e Zd ZdZddedef fddZedd Zejd	d Zed
e	e
ef fddZed
e	e
ef fddZe 	dddZ  ZS )r   a'  A Conformer-based model for processing complex-valued spectrograms.

    This model processes complex-valued inputs by stacking real and imaginary components
    along the channel dimension. The stacked tensor is processed using Conformer layers,
    and the output is projected back to generate real and imaginary components of the
    output channels.

    Args:
        in_channels: number of input complex-valued channels
        out_channels: number of output complex-valued channels
        kwargs: additional arguments for ConformerEncoder
       in_channelsout_channelsc                    s   t    |dk rtd| || _|dk rtd| || _| }d| j |d   |d< |d< td| tdi || _	t
jjd| j d| j dd| _td	| jj td
| j td| j d S )Nr   zKNumber of input channels needs to be larger or equal to one, current value zLNumber of output channels needs to be larger or equal to one, current value    feat_infeat_outzConformer params: %s)r   r   kernel_sizezInitialized %s withz	in_channels:  %sz	out_channels: %s )super__init__
ValueErrorr   r   copyr
   debugr   	conformertorchnnConv2doutput_projection	__class____name__)selfr   r   kwargsconformer_paramsr   r   e/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/audio/parts/submodules/conformer.pyr   *   s,   
zSpectrogramConformer.__init__c                 C   s   | j jS )ax  Returns the attention context size used by the conformer encoder.

        The context size is a list of two integers [left_context, right_context] that defines
        how many frames to the left and right each frame can attend to in the self-attention
        layers.

        Returns:
            List[int]: The attention context size [left_context, right_context]
        )r   att_context_sizer    r   r   r$   context_sizeN   s   z!SpectrogramConformer.context_sizec                 C   s   | j | dS )az  Sets the attention context size used by the conformer encoder.

        The context size is a list of two integers [left_context, right_context] that defines
        how many frames to the left and right each frame can attend to in the self-attention
        layers.

        Args:
            value (List[int]): The attention context size [left_context, right_context]
        N)r   set_default_att_context_size)r    valuer   r   r$   r'   [   s   returnc              	   C   L   t dt t dt ddt dt ddt dt ddt tdt dddS )	+Returns definitions of module output ports.BCDTr.   Toptionalr0   r.   r1   r0   r0   r.   r0   r1   r.   )inputinput_lengthcache_last_channelcache_last_timecache_last_channel_lenr   r	   r   r   tupler&   r   r   r$   input_typesh      
z SpectrogramConformer.input_typesc              	   C   r+   )	r,   r-   r2   Tr3   r5   r6   r.   )outputoutput_lengthcache_last_channel_nextcache_last_time_nextcache_last_channel_next_lenr<   r&   r   r   r$   output_typest   r?   z!SpectrogramConformer.output_typesNc                 C   s   |j \}}}}	|| jkrtd| d| j tj|j|jgdd}
t|
d}|du r6| j	||d\}}n| j	|||||d\}}}}}tj|d	| jd|d
}| 
|}tj|d| jd|d
}t| }|du ro||fS |||||fS )a4  Forward pass for the SpectrogramConformer model.

        This method processes complex-valued inputs by stacking real and imaginary components,
        passing the stacked tensor through Conformer layers, and projecting back to generate
        real and imaginary components of the output channels.
        zUnexpected input channel size z, expected r   )dimzB C RI D T -> B (C RI D) TN)audio_signallength)rG   rH   r9   r:   r;   zB (C RI D) T -> B (C RI) D T)r/   RIr0   zB (C RI) D T -> B C D T RI)shaper   RuntimeErrorr   stackrealimageinops	rearranger   r   r   view_as_complex
contiguous)r    r7   r8   r9   r:   r;   r.   C_inr0   r1   input_real_imagr@   rA   r   r   r$   forward   s*   

	
zSpectrogramConformer.forward)r   r   )NNNN)r   
__module____qualname____doc__intr   propertyr'   setterr   strr   r>   rE   r   rU   __classcell__r   r   r#   r$   r      s    $

)typingr   rO   r   .nemo.collections.asr.modules.conformer_encoderr   nemo.core.classesr   r   nemo.core.neural_typesr   r   r   r	   
nemo.utilsr
   __all__r   r   r   r   r$   <module>   s   