o
    }oi                     @   sx   d dl Z d dlmZ d dl mZ d dlmZmZmZ d dlm	Z	m
Z
mZmZmZ G dd deeZG dd	 d	eeZdS )
    N)
DictConfig)nn)
ExportableNeuralModule	typecheck)AcousticEncodedRepresentationAudioSignalLengthsType
NeuralTypeSpectrogramTypec                       s   e Zd ZdZddededefdd	Zed
d Zedd Zde	f fddZ
				dddZe 				dddZ  ZS )AudioPerceptionModulezOAudio perception module that consists of audio encoder(s) and modality adapter.    }     	max_batchmax_dim
min_lengthc                 C   sj   t jd|dgd }t j||dgd }t j||gdd d }t j|||gd}||d< ||d d fS )N   )lowhighsize)r      r   )torchrandintitemrand)selfr   r   r   
batch_size
max_lengthsignalslengths r!   a/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/speechlm2/modules/perception.pyinput_example   s   z#AudioPerceptionModule.input_examplec                 C   s>   t dt| jjdt tdt t dt t tdt dS )z*Returns definitions of module input ports.)BT)freqr$   )r$   Dr%   )input_signalinput_signal_lengthprocessed_signalprocessed_signal_length)r
   r   preprocessor_sample_ratetupler	   r   r   r!   r!   r"   input_types!   s   

z!AudioPerceptionModule.input_typesc                 C   s   t dt t tdt dS )z+Returns definitions of module output ports.)r$   r%   r'   r$   )encodedencoded_len)r
   r   r.   r	   r/   r!   r!   r"   output_types-   s   
z"AudioPerceptionModule.output_typescfgc                    s   t    || _| |j| _| |j| _d|v r'|jd ur'| |j| _nd | _| |j| _d|jvrGd|jv rGt	
|jj|j| _d S t	 | _d S )Nspec_augment
output_dimd_model)super__init__r4   from_config_dictr,   encoderr5   spec_augmentationmodality_adapterr   Linearr7   r6   projIdentity)r   r4   	__class__r!   r"   r9   5   s   
zAudioPerceptionModule.__init__Nc                 C   sZ   |d uo|d u}|d uo|d u}||A du rt | j d|s)| j||d\}}||fS )NFz Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive  with ``processed_signal`` and ``processed_signal_len`` arguments.)r(   length)
ValueErrorrB   r,   )r   r(   r)   r*   r+   has_input_signalhas_processed_signalr!   r!   r"   maybe_preprocess_audioF   s   

z,AudioPerceptionModule.maybe_preprocess_audioc                 C   sp   |  ||||\}}| jd ur| jr| j||d}| j||d\}}| j||d\}}| |dd}||fS )N)
input_specrC   )audio_signalrC   r   r   )rG   r<   trainingr;   r=   r?   	transpose)r   r(   r)   r*   r+   r1   r2   r!   r!   r"   forward]   s   zAudioPerceptionModule.forward)r   r   r   )NNNN)__name__
__module____qualname____doc__intr#   propertyr0   r3   r   r9   rG   r   disable_checksrL   __classcell__r!   r!   rA   r"   r      s&    


r   c                       s*   e Zd ZdZ fddZdddZ  ZS )IdentityConnectorz8User to pass encoder's representations as-is to the LLM.c                    s   t    d S N)r8   r9   )r   argskwargsrA   r!   r"   r9   y   s   zIdentityConnector.__init__Nc                 O   s   ||fS rV   r!   )r   rI   rC   rW   rX   r!   r!   r"   rL      s   zIdentityConnector.forwardrV   )rM   rN   rO   rP   r9   rL   rT   r!   r!   rA   r"   rU   v   s    rU   )r   	omegaconfr   r   	nemo.corer   r   r   nemo.core.neural_typesr   r   r	   r
   r   r   rU   r!   r!   r!   r"   <module>   s   `