o
    wiR                     @   s   d dl Z d dlmZmZmZ d dlZd dlZd dlZd dlm	  m
Z d dlmZmZ d dlmZmZ d dlmZmZmZmZmZ d dlmZ G dd deZG d	d
 d
eZG dd deZG dd dej	jZdS )    N)DictOptionalSequence)activation_registrymask_sequence_tensor)NeuralModule	typecheck)	FloatTypeLengthsType
NeuralTypeSpectrogramTypeVoidType)loggingc                       sv   e Zd ZdZddddedef fddZedeee	f fd	d
Z
edeee	f fddZe dddZ  ZS )/SpectrogramNoiseConditionalScoreNetworkPlusPlusan  This model handles complex-valued inputs by stacking real and imaginary components.
    Stacked tensor is processed using NCSN++ and the output is projected to generate real
    and imaginary components of the output channels.

    Args:
        in_channels: number of input complex-valued channels
        out_channels: number of output complex-valued channels
       )in_channelsout_channelsr   r   c                   s   t    |dk rtd| || _|dk rtd| || _| }d| j  |d< |d< tdi || _tj	j
d| j d| j dd| _td| jj td	| j td
| j d S )Nr   zKNumber of input channels needs to be larger or equal to one, current value zLNumber of output channels needs to be larger or equal to one, current value    r   r   r   r   kernel_sizeInitialized %s withz	in_channels:  %sz	out_channels: %s )super__init__
ValueErrorr   r   copy$NoiseConditionalScoreNetworkPlusPlusncsnpptorchnnConv2doutput_projectionr   debug	__class____name__)selfr   r   kwargsncsnpp_paramsr#   r   k/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/audio/parts/submodules/ncsnpp.pyr   '   s(   
z8SpectrogramNoiseConditionalScoreNetworkPlusPlus.__init__returnc                 C   ,   t dt t dt ddt dt dddS +Returns definitions of module output ports.BCDTr/   Toptionalinputinput_length	condition)r   r   r
   r	   r%   r   r   r)   input_typesH      
z;SpectrogramNoiseConditionalScoreNetworkPlusPlus.input_typesc                 C      t dt t dt dddS r-   r.   r3   Tr4   )outputoutput_length)r   r   r
   r:   r   r   r)   output_typesQ      
z<SpectrogramNoiseConditionalScoreNetworkPlusPlus.output_typesNc                 C   s   |j \}}}}|| jkrtd| d| j tj|j|jgdd}t|d}| j	|||d\}	}
| 
|	}	|	|d| j||}	|	dddd	d
}	t|	 }	|	|
fS )NzUnexpected input channel size z, expected r   dimzB C RI F T -> B (C RI) F Tr6   r         r   )shaper   RuntimeErrorr   stackrealimageinops	rearranger   r!   reshaper   permuteview_as_complex
contiguous)r%   r7   r8   r9   r/   C_inr1   r2   input_real_imagr?   r@   r   r   r)   forwardY   s   

z7SpectrogramNoiseConditionalScoreNetworkPlusPlus.forward)NN)r$   
__module____qualname____doc__intr   propertyr   strr   r;   rA   r   rT   __classcell__r   r   r(   r)   r      s    	!r   c                       s  e Zd ZdZ												
	
d+dedededee dedededededede	e de	e f fddZ
dd Zededidediddejdejfd d!Zedeeef fd"d#Zedeeef fd$d%Ze d
d&dejd'e	ej d(e	ej fd)d*Z  ZS ),r   a3  Implementation of Noise Conditional Score Network (NCSN++) architecture.

    References:
        - Song et al., Score-Based Generative Modeling through Stochastic Differential Equations, NeurIPS 2021
        - Brock et al., Large scale GAN training for high fidelity natural image synthesis, ICLR 2018
    swishr      r^      r_   r_   rF   h㈵>F      0@        Nnonlinearityr   r   channelsnum_res_blocksnum_resolutions
init_scaleconditioned_on_timefourier_embedding_scaledropout_ratepad_time_topad_dimension_toc              
      s`  t    t|  | _|| _tjjddd| _tjjddd| _	|| _
|| _|| _|| _|| _|| _|p8d| j | _|p@d| j | _| jrutjt| jd |	dtj| jd d | jd d | jtj| jd d | jd d | _tj | _| jd d D ]}| jtjj| j
|d	d
 q| j|
| j| jr|d d nd d}tj | _t| jd d | jd	d  D ]!\}}t|D ]}td|dkr|n||d|}| j| qqtj | _tt| jd	d  t| jd d D ]$\}}tt|D ]}td||dkr|n|d|}| j| qqtj | _ | jd d D ]}| j tjj||d	d q't!| j| jksCJ t!| j| j| j ksQJ t!| j| j| j ks_J t!| j | jksjJ | "  t#$d| j%j& t#$d| j
 t#$d| j t#$d| j t#$d| j t#$d| j t#$d| j t#$d| j t#$d| j d S )Ng      ?bilinear)scale_factormoder   r   )embedding_sizescalerF   r   r   )
activationrj   rg   diffusion_step_embedding_dim)in_chout_ch)r   r   z	in_channels:         %sz	out_channels:        %sz	channels:            %sz	num_res_blocks:      %sz	num_resolutions:     %sz	conditioned_on_time: %sz	pad_time_to:         %sz	pad_dimension_to:    %sr   )'r   r   r   rs   rg   r   r   Upsample
downsampleupsampler   r   rd   re   rf   rh   rk   rl   
SequentialGaussianFourierProjectionLineartime_embedding
ModuleListinput_pyramidappendr    input_blocksziprangeResnetBlockBigGANPlusPlusoutput_blocksreversedprojection_blocksleninit_weights_r   r"   r#   r$   )r%   rc   r   r   rd   re   rf   rg   rh   ri   rj   rk   rl   _chblock_paramsru   rv   nblockr(   r   r)   r   |   sr   
$  & ."z-NoiseConditionalScoreNetworkPlusPlus.__init__c                 C   s   |   D ]"}t|tjjtjjfr&tjj|j |j	d ur&tjj
|j	 q| jD ]}tjjj|j| jd q*|   D ]}|| u rCq<t|drL|  q<d S )Ngainr   )modules
isinstancer   r   r|   r    initxavier_uniform_weightbiaszeros_r   rg   hasattrr   r%   moduler   r   r)   r      s   


z2NoiseConditionalScoreNetworkPlusPlus.init_weights_r7   r.   r?   )r;   rA   r*   c              	   C   sn   |j ^ }}}|}|| j dkrt|d| j|| j  f}|| j dkr5t|ddd| j|| j  f}|S )zEPad input tensor to match the required dimensions across `T` and `D`.r   )rG   rk   Fpadrl   )r%   r7   r   r1   r2   r?   r   r   r)   	pad_input   s   
 z.NoiseConditionalScoreNetworkPlusPlus.pad_inputc                 C   r+   r,   )r   r   r
   r	   r:   r   r   r)   r;     r<   z0NoiseConditionalScoreNetworkPlusPlus.input_typesc                 C   r=   r>   )r   r   r
   r:   r   r   r)   rA     rB   z1NoiseConditionalScoreNetworkPlusPlus.output_types)r9   r8   r9   c                C   s  |j d | jks
J |j ^ }}}| j|d}|du r-t|j d g|j d  |j}|}|durmt|j dkrJtdt|j  dt	|j  |j d |j d kretdt	|j  d	t	|j  d
| 
t|}|g}t| jd D ]}	|| |d  qwdd t|| jD }g }
t|d }t| j}t|D ]A\}	}|| td }t||}t| jD ]}t|||}t||}|
| q|	| jd k}|s| |}|d   }qg }tt|
| jD ].\}}|j |j kr|| | |}|d  }|| td }|||}t||}q|| g }t|t| j D ]\}}||}|t!j"||j dd d q+t#|}|j dd |j dd ksYJ |ddddd|d|f }||fS )a4  Forward pass of the model.

        Args:
            input: input tensor, shjae (B, C, D, T)
            input_length: length of the valid time steps for each example in the batch, shape (B,)
            condition: scalar condition (time) for the model, will be embedded using `self.time_embedding`
        r   )r7   Nrr   r   z.Expected conditon to be a 1-dim tensor, got a z-dim tensor of shape z
Condition z and input z' should match along the batch dimensionc                 S   s   g | ]\}}||qS r   r   ).0imager   r   r   r)   
<listcomp>N  s    z@NoiseConditionalScoreNetworkPlusPlus.forward.<locals>.<listcomp>       @r   )size)$rG   r   r   r   
LongTensortodevicer   r   tupler}   logr   rf   r   rx   r   r   
zeros_likeiterr   	enumeratemathsqrtr   re   nextceillongr   r   ry   r   r   interpolatesum)r%   r7   r8   r9   r   r1   r2   lengthspyramidresolution_numhistoryhiddenr   r   final_resolution
to_projectresidualr   imagestensor
projectionresultr   r   r)   rT   '  sj   $







"" z,NoiseConditionalScoreNetworkPlusPlus.forward)r\   r   r   r]   r   rF   r`   Fra   rb   NN)r$   rU   rV   rW   rZ   rX   r   floatboolr   r   r   r   r   r   Tensorr   rY   r   r;   rA   rT   r[   r   r   r(   r)   r   t   sx    		
pr   c                       sh   e Zd ZdZddedef fddZedee	e
f fd	d
Zedee	e
f fddZdd Z  ZS )r{   znGaussian Fourier embeddings for input scalars.

    The input scalars are typically time or noise levels.
    r_         ?rp   rq   c                    s*   t    tjjt|| dd| _d S )NF)requires_grad)r   r   r   r   	ParameterrandnW)r%   rp   rq   r(   r   r)   r     s   
 z"GaussianFourierProjection.__init__r*   c                 C      dt dt iS )r-   r7   r3   )r   r	   r:   r   r   r)   r;        z%GaussianFourierProjection.input_typesc                 C   r   )r-   r?   )r/   r1   )r   r   r:   r   r   r)   rA     r   z&GaussianFourierProjection.output_typesc                 C   sJ   |d d d f | j d d d f  d tj }tjt|t|gddS )Nr   rr   rC   )r   r   pir   catsincos)r%   r7   x_projr   r   r)   rT     s   ,z!GaussianFourierProjection.forward)r_   r   )r$   rU   rV   rW   rX   r   r   rY   r   rZ   r   r;   rA   rT   r[   r   r   r(   r)   r{     s    r{   c                       s   e Zd ZdZ						ddejjdeded	ee d
e	de	dee dee de	f fddZ
dd Zddejdeej fddZ  ZS )r   a!  Implementation of a ResNet block for the BigGAN model.

    References:
        - Song et al., Score-Based Generative Modeling through Stochastic Differential Equations, NeurIPS 2021
        - Brock et al., Large scale GAN training for high fidelity natural image synthesis, ICLR 2018
    Nr`   皙?ư>rs   ru   rv   rt   rg   rj   in_num_groupsout_num_groupsepsc
           
         s
  t    |pt|d d}|pt|d d}|| _tjtjj|||	d|| _tjj	||ddd| _
|durKtj|tj||tjjd| _tjtjj|||	d|tj|tjj	||ddd| _||krvtjj	||dd	| _|| _|| _|| _|   dS )
aN  
        Args:
            activation (torch.nn.Module): activation layer (ReLU, SiLU, etc)
            in_ch (int): number of channels in the input image
            out_ch (int, optional): number of channels in the output image
            diffusion_step_embedding_dim (int, optional): dimension of diffusion timestep embedding. Defaults to None (no embedding).
            dropout_rate (float, optional): dropout rate. Defaults to 0.1.
            init_scale (float, optional): scaling for weight initialization. Defaults to 0.0.
            in_num_groups (int, optional): num_groups in the first GroupNorm. Defaults to min(in_ch // 4, 32)
            out_num_groups (int, optional): num_groups in the second GroupNorm. Defaults to min(out_ch // 4, 32)
            eps (float, optional): eps parameter of GroupNorms. Defaults to 1e-6.
        rF       )
num_groupsnum_channelsr   rE   r   )r   r   r   paddingNzbatch dim -> batch dim 1 1r   )r   r   minrg   r   r   rz   	GroupNorminput_blockr    middle_convr|   rL   layers	Rearrangediffusion_step_projectionDropoutoutput_blockresidual_projectionactru   rv   r   )
r%   rs   ru   rv   rt   rg   rj   r   r   r   r(   r   r)   r     s6   

z"ResnetBlockBigGANPlusPlus.__init__c                 C   sn   |   D ]"}t|tjjtjjfr&tjj|j |j	dur&tjj
|j	 qtjjj| jd j| jd dS )zWeight initializationNrr   r   )r   r   r   r   r    r|   r   r   r   r   r   r   rg   r   r   r   r)   r     s   
 z'ResnetBlockBigGANPlusPlus.init_weights_xdiffusion_time_embeddingc                 C   s\   |  |}| |}|dur|| | }| |}|j|jkr%| |}|| td S )zForward pass of the model.

        Args:
            x: input tensor
            diffusion_time_embedding: embedding of the diffusion time step

        Returns:
            Output tensor
        Nr   )r   r   r   r   rG   r   r   r   )r%   r   r   hr   r   r)   rT     s   




z!ResnetBlockBigGANPlusPlus.forward)Nr`   r   NNr   )N)r$   rU   rV   rW   r   r   ModulerX   r   r   r   r   r   rT   r[   r   r   r(   r)   r     s:    	
;$r   )r   typingr   r   r   rL   einops.layers.torchr   torch.nn.functionalr   
functionalr   #nemo.collections.common.parts.utilsr   r   nemo.core.classesr   r   nemo.core.neural_typesr	   r
   r   r   r   
nemo.utilsr   r   r   r{   r   r   r   r   r   r)   <module>   s    W  