o
    }oi                     @   s   d dl mZmZ d dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZmZmZmZ d dlmZ g d	ZG d
d deZG dd deZG dd deZG dd deZG dd deZdS )    )DictOptionalN)Trainer)
DictConfig)AudioToAudioModel)PretrainedModelInfo	typecheck)AudioSignalLengthsTypeLossType
NeuralType)logging)EncMaskDecAudioToAudioModel%ScoreBasedGenerativeAudioToAudioModelPredictiveAudioToAudioModel#SchroedingerBridgeAudioToAudioModelFlowMatchingAudioToAudioModelc                       s   e Zd ZdZddedef fddZedee	e
f fdd	Zedee	e
f fd
dZe dddZdd Zddede	fddZedee fddZ  ZS )r   a  Class for encoder-mask-decoder audio processing models.

    The model consists of the following blocks:
        - encoder: transforms input multi-channel audio signal into an encoded representation (analysis transform)
        - mask_estimator: estimates a mask used by signal processor
        - mask_processor: mask-based signal processor, combines the encoded input and the estimated mask
        - decoder: transforms processor output into the time domain (synthesis transform)
    Ncfgtrainerc                    s   d| _ |d ur|j | _ t j||d | jj| _t| jj| _t| jj| _t| jj	| _	t| jj
| _
d| jv rKtd t| jj| _ntd d | _t| jdrm| jjd urmtd t| jj| _ntd d | _|   d S )	N   r   r   mixture_consistencyzUsing mixture consistencyzMixture consistency not usedchannel_augmentzUsing channel augmentationzChannel augmentation not used)
world_sizesuper__init___cfgsample_rater   from_config_dictencodermask_estimatormask_processordecoderr   debugr   hasattrr   r   channel_augmentationsetup_optimization_flagsselfr   r   	__class__ ]/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/audio/models/enhancement.pyr   /   s(   





z$EncMaskDecAudioToAudioModel.__init__returnc                 C   (   t dt| jdt tdt dddS NBCTfreqr1   Toptionalinput_signalinput_lengthr   r	   r   tupler
   r(   r+   r+   r,   input_typesQ   
   z'EncMaskDecAudioToAudioModel.input_typesc                 C   r.   Nr0   r4   r1   Tr6   output_signaloutput_lengthr;   r=   r+   r+   r,   output_typesZ   r?   z(EncMaskDecAudioToAudioModel.output_typesc           
      C   s   | d}| j||d\}}| j||d\}}| j|||d\}}	| jdur-| j||d}| j||	d\}}	| j||d}||	fS )a>  
        Forward pass of the model.

        Args:
            input_signal: Tensor that represents a batch of raw audio signals,
                of shape [B, T] or [B, T, C]. T here represents timesteps, with 1 second of audio represented as
                `self.sample_rate` number of floating point values.
            input_signal_length: Vector of length B, that contains the individual lengths of the audio
                sequences.

        Returns:
            Output signal `output` in the time domain and the length of the output signal `output_length`.
        inputr:   )rG   r:   maskN)mixtureestimaterG   batch_length)sizer   r    r!   r   r"   match_batch_length)
r(   r9   r:   rL   encodedencoded_lengthrH   _	processedprocessed_lengthr+   r+   r,   forwardc   s   

z#EncMaskDecAudioToAudioModel.forwardc           	      C   s   t |tr|d }|d }|d }n|\}}}}|jdkr#t|d}|jdkr.t|d}| jr<| jd ur<| j|d}| j||d\}}| j|||d}| 	d	| | 	d
| j
jd d  | 	dtj| jjtjd |S )Nr9   r:   target_signal   B T -> B 1 T)rG   r8   rJ   targetr:   
train_losslearning_rater   lrglobal_stepdtype)
isinstancedictndimeinops	rearrangetrainingr%   rT   losslog
_optimizerparam_groupstorchtensorr   r]   float32)	r(   batch	batch_idxr9   r:   rU   rQ   processed_signalrf   r+   r+   r,   training_step   s"   



z)EncMaskDecAudioToAudioModel.training_stepr   valdataloader_idxtagc                 C      t |tr|d }|d }|d }n|\}}}}|jdkr#t|d}|jdkr.t|d}| j||d\}	}| j|	||d}
t| dr_|| jv r_| j| | 	 D ]\}}|j
|	||d	 qR| d
tj| jjtjd | d|
iS Nr9   r:   rU   rV   rW   r8   rX   metricspredsrY   r:   r]   r^   _lossr`   ra   rb   rc   rd   rT   rf   r$   rv   itemsupdaterg   rj   rk   r   r]   rl   )r(   rm   rn   rr   rs   r9   r:   rU   rQ   ro   rf   namemetricr+   r+   r,   evaluation_step   s    



z+EncMaskDecAudioToAudioModel.evaluation_stepc                 C   s   g }|S )z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        r+   )clsresultsr+   r+   r,   list_available_models   s   z1EncMaskDecAudioToAudioModel.list_available_modelsNr   rq   )__name__
__module____qualname____doc__r   r   r   propertyr   strr   r>   rD   r   rT   rp   intr   classmethodr   r   r   __classcell__r+   r+   r)   r,   r   %   s    	"%""r   c                       s   e Zd ZdZddedef fddZedee	e
f fdd	Zedee	e
f fd
dZe dddZdd Zddede	fddZ  ZS )r   zqThis models aims to directly estimate the coefficients
    in the encoded domain by applying a neural model.
    Nr   r   c                    s   t  j||d | jj| _| | jj| _| | jj| _| | jj| _| jdd| _	| jdd| _
|   td| jj td| j	 td| j
 d S )	Nr   normalize_inputFeps:0yE>Initialized %s	normalize_input: %s	eps:             %s)r   r   r   r   r   r   r"   	estimatorgetr   r   r&   r   r#   r*   r   r'   r)   r+   r,   r      s   
z$PredictiveAudioToAudioModel.__init__r-   c                 C   r.   r/   r;   r=   r+   r+   r,   r>         z'PredictiveAudioToAudioModel.input_typesc                 C   r.   r@   r;   r=   r+   r+   r,   rD      r   z(PredictiveAudioToAudioModel.output_typesc                 C   s   | d}| jrtj| ddd}||| j  }| j||d\}}| j||d\}}| j||d\}	}
| jr;|	| }	| j	|	|d}	|	|
fS )a   Forward pass of the model.

        Args:
            input_signal: time-domain signal
            input_length: valid length of each example in the batch

        Returns:
            Output signal `output` in the time domain and the length of the output signal `output_length`.
        rE   rE   TdimkeepdimrF   rK   )
rM   r   rj   amaxabsr   r   r   r"   rN   )r(   r9   r:   rL   
norm_scalerO   rP   	estimatedestimated_lengthoutputrC   r+   r+   r,   rT     s   
z#PredictiveAudioToAudioModel.forwardc           	      C   s   t |tr|d }|d }|d }n|\}}}}|jdkr#t|d}|jdkr.t|d}| j||d\}}| j|||d}| d| | d	| jj	d
 d  | dt
j| jjt
jd |S )Nr9   r:   rU   rV   rW   r8   rX   rZ   r[   r   r\   r]   r^   )r`   ra   rb   rc   rd   rT   rf   rg   rh   ri   rj   rk   r   r]   rl   )	r(   rm   rn   r9   r:   rU   rQ   rB   rf   r+   r+   r,   rp   *  s   



z)PredictiveAudioToAudioModel.training_stepr   rq   rr   rs   c                 C   rt   ru   rz   )r(   rm   rn   rr   rs   r9   r:   rU   rQ   rB   rf   r}   r~   r+   r+   r,   r   G  s    



z+PredictiveAudioToAudioModel.evaluation_stepr   r   )r   r   r   r   r   r   r   r   r   r   r   r>   rD   r   rT   rp   r   r   r   r+   r+   r)   r,   r      s    $r   c                       s   e Zd ZdZddedef fddZedee	e
f fdd	Zedee	e
f fd
dZe e dddZee
de e
de e
ede dde
de iddddZdd Zddede	fddZ  ZS )r   a  This models is using a score-based diffusion process to generate
    an encoded representation of the enhanced signal.

    The model consists of the following blocks:
        - encoder: transforms input multi-channel audio signal into an encoded representation (analysis transform)
        - estimator: neural model, estimates a score for the diffusion process
        - sde: stochastic differential equation (SDE) defining the forward and reverse diffusion process
        - sampler: sampler for the reverse diffusion process, estimates coefficients of the target signal
        - decoder: transforms sampler output into the time domain (synthesis transform)
    Nr   r   c                    s  t  j||d | jj| _| | jj| _| | jj| _| | jj| _| | jj| _d| jj	v r7t
dd| jj	v rAt
dtjj| jj	| j| jd| _	| jdd| _| jd	| _| jd urjtd
| j | jdd| _|   td| jj td| j td| j d S )Nr   sdezDSDE should be defined in the model config, not in the sampler configscore_estimatorzPScore estimator should be defined in the model config, not in the sampler config)r   r   r   Fmax_utts_evaluation_metricsJMetrics will be evaluated on first %d examples of the evaluation datasets.r   r   r   r   r   )r   r   r   r   r   r   r"   r   r   sampler
ValueErrorhydrautilsinstantiater   r   r   r   warningr   r&   r#   r*   r   r'   r)   r+   r,   r   u  s.   

z.ScoreBasedGenerativeAudioToAudioModel.__init__r-   c                 C   r.   r/   r;   r=   r+   r+   r,   r>     r   z1ScoreBasedGenerativeAudioToAudioModel.input_typesc                 C   r.   r@   r;   r=   r+   r+   r,   rD     r   z2ScoreBasedGenerativeAudioToAudioModel.output_typesc                 C      | d}| jrtj| ddd}||| j  }| j||d\}}| j|||d\}}| j||d\}	}
| jr<|	| }	| j	|	|d}	|	|
fS )aI  Forward pass of the model.

        Forward pass of the model aplies the following steps:
            - encoder to obtain the encoded representation of the input signal
            - sampler to generate the estimated coefficients of the target signal
            - decoder to transform the sampler output into the time domain

        Args:
            input_signal: Tensor that represents a batch of time-domain audio signals,
                of shape [B, C, T]. T here represents timesteps, with 1 second of audio represented as
                `self.sample_rate` number of floating point values.
            input_signal_length: Vector of length B, contains the individual lengths of the audio sequences.

        Returns:
            Output `output_signal` in the time domain and the length of the output signal `output_length`.
        rE   r   Tr   rF   )
prior_meanscore_conditionstate_lengthrK   
rM   r   rj   r   r   r   r   r   r"   rN   r(   r9   r:   rL   r   rO   rP   	generatedgenerated_lengthr   rC   r+   r+   r,   rT     s   

z-ScoreBasedGenerativeAudioToAudioModel.forwardr0   r1   rU   r9   r:   rf   r>   rD   c                 C   s   | d}| jr tj| ddd}||| j  }||| j  }| j||d\}}| j||d\}}	| jj||j	d}
| jj
|||
d\}}t|}|||  }tj||gdd	}| j|||
d
\}}|| }| }| j|||d}|S )zRandomly generate a time step for each example in the batch, estimate
        the score and calculate the loss value.

        Note that this step does not include sampler.
        r   r   Tr   rF   rM   device)stater   timer   rG   r:   	conditionrX   )rM   r   rj   r   r   r   r   r   generate_timer   perturb_kernel_params
randn_likecatr   rf   )r(   rU   r9   r:   
batch_sizer   	input_encinput_enc_len
target_encrQ   sde_timepk_meanpk_stdz_normperturbed_encestimator_input	score_est	score_len	score_refrf   r+   r+   r,   _step  s"   

z+ScoreBasedGenerativeAudioToAudioModel._stepc                 C   s   t |tr|d }|d }|d }n|\}}}}|jdkr#t|d}|jdkr.t|d}| j|||d}| d| | d| jjd	 d
  | dt	j
| jjt	jd |S Nr9   r:   rU   rV   rW   r   rZ   r[   r   r\   r]   r^   r`   ra   rb   rc   rd   r   rg   rh   ri   rj   rk   r   r]   rl   r(   rm   rn   r9   r:   rU   rQ   rf   r+   r+   r,   rp     s   



z3ScoreBasedGenerativeAudioToAudioModel.training_stepr   rq   rr   rs   c                 C   s|  t |tr|d }|d }|d }n|\}}}}|jdkr#t|d}|jdkr.t|d}| j|||d}	d}
| jd u rEd}
|d	}n%tt	| j
| | }| j
| | | j}|| jk }
t| j| |d	}|
r| j|d |d
f |d | d\}}t| dr|| j
v r| j
| |  D ]\}}|j||d |d
f |d | d q| dtj| jjtjd | d|	iS Nr9   r:   rU   rV   rW   r   FTr   .r8   rv   rw   r]   r^   ry   r`   ra   rb   rc   rd   r   r   rM   nextiterrv   num_examplesminrT   r$   r{   r|   rg   rj   rk   r   r]   rl   r(   rm   rn   rr   rs   r9   r:   rU   rQ   rf   update_metricsr   first_metric_namenum_examples_evaluatedrB   r}   r~   r+   r+   r,   r   5  s>   







z5ScoreBasedGenerativeAudioToAudioModel.evaluation_stepr   r   r   r   r   r   r   r   r   r   r   r   r   r>   rD   r   rj   inference_moderT   r	   r<   r
   r   r   rp   r   r   r   r+   r+   r)   r,   r   i  s*    ,,


4r   c                       sF  e Zd ZdZd&dedef fddZedee	e
f fdd	Zedee	e
f fd
dZe e d&ddZee
de e
ede ddde
de e
ede dddde d&ddZe d'ddZee
de e
de e
ede dde
de idd&ddZdd Zd(d"ed#e	fd$d%Z  ZS ))r   a  This models uses a flow matching process to generate
    an encoded representation of the enhanced signal.

    The model consists of the following blocks:
        - encoder: transforms input multi-channel audio signal into an encoded representation (analysis transform)
        - estimator: neural model, estimates a score for the diffusion process
        - flow: ordinary differential equation (ODE) defining a flow and a vector field.
        - sampler: sampler for the inference process, estimates coefficients of the target signal
        - decoder: transforms sampler output into the time domain (synthesis transform)
        - ssl_pretrain_masking: if it is defined, perform the ssl pretrain masking for self reconstruction in the training process
    Nr   r   c                    s`  t  j||d | jj| _| | jj| _| | jj| _| | jj| _| | jj| _t	j
j| jj| jd| _| jdd| _| jdd urWtd | | jj| _nd | _| jdd| _| jd	| _| jd urutd
| j | jdd| _|   td| jj td| jd u td| j td| j td| j td| j d S )Nr   )r   p_cond      ?ssl_pretrain_maskingz5SSL-pretrain_masking is found and will be initializedr   Fr   r   r   r   zInitialized              %sz	doing SSL-pretraining: %sz	p_cond:                %sz	normalize_input:       %sz	loss:                  %sz	eps:                   %s)r   r   r   r   r   r   r"   r   flowr   r   r   r   r   r   r   r#   r   r   r   r   r   r&   r*   r   rf   r'   r)   r+   r,   r   {  s6   


z&FlowMatchingAudioToAudioModel.__init__r-   c                 C   r.   r/   r;   r=   r+   r+   r,   r>     r   z)FlowMatchingAudioToAudioModel.input_typesc                 C   r.   r@   r;   r=   r+   r+   r,   rD     r   z*FlowMatchingAudioToAudioModel.output_typesc                 C      | j ||ddS )a  Forward pass of the model to generate samples from the target distribution.
        This is used for inference mode only, and it explicitly disables SSL masking to the input.

        Args:
            input_signal: Tensor that represents a batch of raw audio signals,
                of shape [B, T] or [B, T, C]. T here represents timesteps, with 1 second of audio represented as
                `self.sample_rate` number of floating point values.
            input_signal_length: Vector of length B, that contains the individual lengths of the audio
                sequences.

        Returns:
            Output signal `output` in the time domain and the length of the output signal `output_length`.
        Fr9   r:   enable_ssl_maskingforward_internalr(   r9   r:   r+   r+   r,   rT     s   z%FlowMatchingAudioToAudioModel.forwardr0   r1   Tr6   r8   rA   r   c                 C   r   )a  Forward pass of the model to generate samples from the target distribution.
        This is used for eval mode only, and it enables SSL masking to the input.

        Args:
            input_signal: Tensor that represents a batch of raw audio signals,
                of shape [B, T] or [B, T, C]. T here represents timesteps, with 1 second of audio represented as
                `self.sample_rate` number of floating point values.
            input_signal_length: Vector of length B, that contains the individual lengths of the audio
                sequences.

        Returns:
            Output signal `output` in the time domain and the length of the output signal `output_length`.
        Tr   r   r   r+   r+   r,   forward_eval  s   z*FlowMatchingAudioToAudioModel.forward_evalFc                 C   s   | d}| jrtj| ddd}||| j  }| j||d\}}| jdkr-t|}n|r;| j	dur;| j	||d}t
|| jj }| j|||d	\}	}
| j|	|
d\}}| jr^|| }| j||d
}||fS )a  Internal forward pass of the model.

        Args:
            input_signal: Tensor that represents a batch of raw audio signals,
                of shape [B, T] or [B, T, C]. T here represents timesteps, with 1 second of audio represented as
                `self.sample_rate` number of floating point values.
            input_signal_length: Vector of length B, that contains the individual lengths of the audio
                sequences.
            enable_ssl_masking: Whether to enable SSL masking of the input. If using SSL pretraining, masking
                is applied to the input signal. If not using SSL pretraining, masking is not applied.

        Returns:
            Output signal `output` in the time domain and the length of the output signal `output_length`.
        rE   r   Tr   rF   r   N
input_speclength)r   estimator_conditionr   rK   )rM   r   rj   r   r   r   r   r   
zeros_liker   r   r   sigma_startr   r"   rN   )r(   r9   r:   r   rL   r   rO   rP   
init_stater   r   r   rC   r+   r+   r,   r     s$   


z.FlowMatchingAudioToAudioModel.forward_internalr   rf   c                 C   s&  | d}| jr tj| ddd}||| j  }||| j  }| j||d\}}| j||d\}}	| jd ur>| j||d}t	t
|| jk  d}
||
|j }t|}| jj|dj|jd	}| jj|||d
}tj||gdd}| j|||d\}}| jj||||d}| j|||dS )Nr   r   Tr   rF   r   zB -> B 1 1 1)r   )r   )r   x_startx_endr   r   r   )r   r   r   pointrX   )rM   r   rj   r   r   r   r   r   rc   rd   randr   floattor   r   r   r   sampler   r   vector_fieldrf   )r(   rU   r9   r:   r   r   r   r   r   rQ   keep_conditionsr   r   r   r   rJ   estimate_lenconditional_vector_fieldr+   r+   r,   r   "  s$   


z#FlowMatchingAudioToAudioModel._stepc                 C   s   t |tr|d }|d }|d| }n|\}}}}|jdkr't|d}|jdkr2t|d}| j|||d}| d| | d| j	j
d	 d
  | dtj| jjtjd |S r   )r`   ra   r   clonerb   rc   rd   r   rg   rh   ri   rj   rk   r   r]   rl   r   r+   r+   r,   rp   V  s   


z+FlowMatchingAudioToAudioModel.training_stepr   rq   rr   rs   c                 C   s  t |tr|d }|d }|d| }n|\}}}}|jdkr't|d}|jdkr2t|d}| j|||d}	d}
| jd u rId}
|	d	}n%t
t| j| | }| j| | | j}|| jk }
t| j| |	d	}|
r| j|d |d
f |d | d\}}t| dr|| jv r| j| |  D ]\}}|j||d |d
f |d | d q| dtj| jjtjd | d|	iS r   )r`   ra   r   r   rb   rc   rd   r   r   rM   r   r   rv   r   r   r   r$   r{   r|   rg   rj   rk   r   r]   rl   r   r+   r+   r,   r   o  sF   






z-FlowMatchingAudioToAudioModel.evaluation_stepr   )NFr   )r   r   r   r   r   r   r   r   r   r   r   r>   rD   r   rj   r   rT   r	   r<   r
   r   r   r   r   rp   r   r   r   r+   r+   r)   r,   r   n  sB    5


6


*r   c                       s   e Zd ZdZddedef fddZedee	e
f fdd	Zedee	e
f fd
dZe e dddZee
de e
de e
ede de
de e
de e
de dddddZdd Zddede	fddZ  ZS )r   u  This models is using a Schrödinger Bridge process to generate
    an encoded representation of the enhanced signal.

    The model consists of the following blocks:
        - encoder: transforms input audio signal into an encoded representation (analysis transform)
        - estimator: neural model, estimates the coefficients for the SB process
        - noise_schedule: defines the path between the clean and noisy signals
        - sampler: sampler for the reverse process, estimates coefficients of the target signal
        - decoder: transforms sampler output into the time domain (synthesis transform)

    References:
        Schrödinger Bridge for Generative Speech Enhancement, https://arxiv.org/abs/2407.16074
    Nr   r   c                    s  t  j||d | jj| _| | jj| _| | jj| _| | jj| _| jj| _| | jj	| _	t
jj| jj| j	| j| jd| _| jdd| _| jd| _| jd ur]td| j d| jv rs| | jj| _| jdd	| _nd | _d
| _d| jv r| | jj| _| jdd	| _nd | _d
| _| jd ur| jd us| jd urtd| jdd| _|   td| jj td| j td| j td| j td| j td| j td| j td| j td| j d S )Nr   )noise_scheduler   estimator_outputr   Fr   r   loss_encodedloss_encoded_weightr           	loss_timeloss_time_weightzREither ``loss`` or ``loss_encoded`` and ``loss_time`` should be defined, not both.r   r   r   z	estimator_output:    %sz	normalize_input:     %sz	loss:                %sz	loss_encoded:        %sz	loss_encoded_weight: %sz	loss_time:           %sz	loss_time_weight:    %sz	eps:                 %s)r   r   r   r   r   r   r"   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  rf   r   r   r&   r#   r*   r   r'   r)   r+   r,   r     sV   




z,SchroedingerBridgeAudioToAudioModel.__init__r-   c                 C   r.   r/   r;   r=   r+   r+   r,   r>        z/SchroedingerBridgeAudioToAudioModel.input_typesc                 C   r.   r@   r;   r=   r+   r+   r,   rD   	  r  z0SchroedingerBridgeAudioToAudioModel.output_typesc                 C   r   )aO  Forward pass of the model.

        Forward pass of the model consists of the following steps
            - encoder to obtain the encoded representation of the input signal
            - sampler to generate the estimated coefficients of the target signal
            - decoder to transform the estimated output into the time domain

        Args:
            input_signal: Tensor that represents a batch of time-domain audio signals,
                of shape [B, C, T]. T here represents timesteps, with 1 second of audio represented as
                `self.sample_rate` number of floating point values.
            input_signal_length: Vector of length B, contains the individual lengths of the audio sequences.

        Returns:
            Output `output_signal` in the time domain and the length of the output signal `output_length`.
        rE   r   Tr   rF   )r   r   r   rK   r   r   r+   r+   r,   rT     s   

z+SchroedingerBridgeAudioToAudioModel.forwardr0   r1   r   )rf   r   r  r   c                 C   sh  | d}| jr tj| ddd}||| j  }||| j  }| j||d\}}| j||d\}}	| jj||j	d}
| jj
|
d\}}}| jj|
d\}}}||d  |d | j  }||d  |d | j  }|d	d
d
d
}|d	d
d
d
}|| ||  }|| | || j  }|d	d
d
d
}t|}|||  }tj||gdd}| j|||
d\}}| jdkr&| jdur| j|||d}d }}ned}| jdur| j|||d}|| j| 7 }nd}| jdur#t  | j||d\}}	W d   n	1 sw   Y  | d	}| j||d}| j|||d}|| j| 7 }nd}n	td| j d|||fS )zRandomly generate time step for each example in the batch, run neural estimator
        to estimate the target and calculate the loss.
        r   r   Tr   rF   r   )r   rV   rE   r   r   r   r   data_predictionNrX   r  rK   zOutput type z is not implemented)rM   r   rj   r   r   r   r   r   r   r   
get_alphas
get_sigmasviewr   r   r   r   rf   r   r   r  r   disable_checksr"   rN   r  NotImplementedError)r(   rU   r9   r:   r   r   r   r   r   rQ   process_timealpha_talpha_bar_talpha_t_maxsigma_tsigma_bar_tsigma_t_maxweight_targetweight_inputmean_xstd_xr   x_tr   rJ   r   rf   r   r  estimate_signalrL   r+   r+   r,   r   @  sZ   








z)SchroedingerBridgeAudioToAudioModel._stepc           
      C   s   t |tr|d }|d }|d }n|\}}}}|jdkr#t|d}|jdkr.t|d}| j|||d\}}}	| d| | d| jjd	 d
  | dt	j
| jjt	jd |d urc| d| |	d urm| d|	 |S )Nr9   r:   rU   rV   rW   r   rZ   r[   r   r\   r]   r^   train_loss_encodedtrain_loss_timer   )
r(   rm   rn   r9   r:   rU   rQ   rf   r   r  r+   r+   r,   rp     s(   



z1SchroedingerBridgeAudioToAudioModel.training_stepr   rq   rr   rs   c                 C   s  t |tr|d }|d }|d }n|\}}}}|jdkr#t|d}|jdkr.t|d}| j|||d^}	}d}
| jd u rGd}
|d	}n%tt	| j
| | }| j
| | | j}|| jk }
t| j| |d	}|
r| j|d |d
f |d | d\}}t| dr|| j
v r| j
| |  D ]\}}|j||d |d
f |d | d q| dtj| jjtjd | d|	iS r   r   r   r+   r+   r,   r     s>   







z3SchroedingerBridgeAudioToAudioModel.evaluation_stepr   r   r   r+   r+   r)   r,   r     s.    F-




c"r   )typingr   r   rc   r   rj   lightning.pytorchr   	omegaconfr   ,nemo.collections.audio.models.audio_to_audior   nemo.core.classes.commonr   r   nemo.core.neural_typesr	   r
   r   r   
nemo.utilsr   __all__r   r   r   r   r   r+   r+   r+   r,   <module>   s,   	 6     @