o
    wi*                     @   sx   d dl Z d dlmZ d dl mZ d dlmZmZmZ d dlm	Z	m
Z
mZmZmZ G dd deeZG dd	 d	eeZdS )
    N)
DictConfig)nn)
ExportableNeuralModule	typecheck)AcousticEncodedRepresentationAudioSignalLengthsType
NeuralTypeSpectrogramTypec                       s   e Zd ZdZddededefdd	Zed
d Zedd Zede	fddZ
def fddZ				dddZe 				dddZ  ZS )AudioPerceptionModulezOAudio perception module that consists of audio encoder(s) and modality adapter.    }     	max_batchmax_dim
min_lengthc                 C   sj   t jd|dgd }t j||dgd }t j||gdd d }t j|||gd}||d< ||d d fS )N   )lowhighsize)r      r   )torchrandintitemrand)selfr   r   r   
batch_size
max_lengthsignalslengths r!   j/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/speechlm2/modules/perception.pyinput_example   s   z#AudioPerceptionModule.input_examplec                 C   s>   t dt| jjdt tdt t dt t tdt dS )z*Returns definitions of module input ports.)BT)freqr$   )r$   Dr%   )input_signalinput_signal_lengthprocessed_signalprocessed_signal_length)r
   r   preprocessor_sample_ratetupler	   r   r   r!   r!   r"   input_types!   s   

z!AudioPerceptionModule.input_typesc                 C   s   t dt t tdt dS )z+Returns definitions of module output ports.)r$   r%   r'   r$   )encodedencoded_len)r
   r   r.   r	   r/   r!   r!   r"   output_types-   s   
z"AudioPerceptionModule.output_typesreturnc                 C   s6   | j jj| j jj }| jj}t| jdd}|| | S )zx
        Returns the audio duration corresponding to a single frame/token in the output
        of this module.
        subsampling_factorg      ?)r,   
featurizer
hop_lengthsample_rateencoderr5   getattrmodality_adapter)r   frame_shiftencoder_subsamplingadapter_subsamplingr!   r!   r"   token_equivalent_duration5   s   z/AudioPerceptionModule.token_equivalent_durationcfgc                    s   t    || _| |j| _| |j| _d|v r'|jd ur'| |j| _nd | _| |j| _d|jvrGd|jv rGt	
|jj|j| _d S t	 | _d S )Nspec_augment
output_dimd_model)super__init__r@   from_config_dictr,   r9   rA   spec_augmentationr;   r   LinearrC   rB   projIdentity)r   r@   	__class__r!   r"   rE   @   s   
zAudioPerceptionModule.__init__Nc                 C   sZ   |d uo|d u}|d uo|d u}||A du rt | j d|s)| j||d\}}||fS )NFz Arguments ``input_signal`` and ``input_signal_length`` are mutually exclusive  with ``processed_signal`` and ``processed_signal_len`` arguments.)r(   length)
ValueErrorrL   r,   )r   r(   r)   r*   r+   has_input_signalhas_processed_signalr!   r!   r"   maybe_preprocess_audioQ   s   

z,AudioPerceptionModule.maybe_preprocess_audioc                 C   sp   |  ||||\}}| jd ur| jr| j||d}| j||d\}}| j||d\}}| |dd}||fS )N)
input_specrM   )audio_signalrM   r   r   )rQ   rG   trainingr9   r;   rI   	transpose)r   r(   r)   r*   r+   r1   r2   r!   r!   r"   forwardh   s   zAudioPerceptionModule.forward)r   r   r   )NNNN)__name__
__module____qualname____doc__intr#   propertyr0   r3   floatr?   r   rE   rQ   r   disable_checksrV   __classcell__r!   r!   rK   r"   r      s*    



r   c                       s*   e Zd ZdZ fddZdddZ  ZS )IdentityConnectorz8User to pass encoder's representations as-is to the LLM.c                    s   t    d S N)rD   rE   )r   argskwargsrK   r!   r"   rE      s   zIdentityConnector.__init__Nc                 O   s   ||fS ra   r!   )r   rS   rM   rb   rc   r!   r!   r"   rV      s   zIdentityConnector.forwardra   )rW   rX   rY   rZ   rE   rV   r_   r!   r!   rK   r"   r`      s    r`   )r   	omegaconfr   r   	nemo.corer   r   r   nemo.core.neural_typesr   r   r	   r
   r   r   r`   r!   r!   r!   r"   <module>   s   k