o
    }oiE-                     @   s   d Z ddlZddlZddlmZ ddlm  mZ ddlm	Z	m
Z
 ddlmZmZ G dd dejZG dd dejZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZdS )ah  The model definition for Continuous 2D layers

Adapted from: https://github.com/CompVis/stable-diffusion/blob/
21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/ldm/modules/diffusionmodules/model.py

[Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors]
https://github.com/CompVis/stable-diffusion/blob/
21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/LICENSE
    N)Patcher	UnPatcher)	Normalizenonlinearityc                       8   e Zd Zdef fddZdejdejfddZ  ZS )Upsamplein_channelsc                    s$   t    tj||dddd| _d S )N      kernel_sizestridepaddingsuper__init__nnConv2dconvselfr   	__class__ m/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/video_tokenizers/modules/layers2d.pyr   '      
zUpsample.__init__xreturnc                 C   s"   |j dddj ddd}| |S )N   dimr	   )repeat_interleaver   )r   r   r   r   r   forward+   s   
zUpsample.forward	__name__
__module____qualname__intr   torchTensorr"   __classcell__r   r   r   r   r   &       r   c                       r   )
Downsampler   c                    s$   t    tj||dddd| _d S )Nr	   r   r   r   r   r   r   r   r   r   1   r   zDownsample.__init__r   r   c                 C   s    d}t j||ddd}| |S )N)r   r
   r   r
   constantr   )modevalue)Fpadr   )r   r   r1   r   r   r   r"   5   s   
zDownsample.forwardr#   r   r   r   r   r,   0   r+   r,   c                       sF   e Zd Zdddededef fddZdejd	ejfd
dZ  Z	S )ResnetBlockN)out_channelsr   r3   dropoutc                   s   t    || _|d u r|n|}t|| _tj||dddd| _t|| _t	|| _
tj||dddd| _||krGtj||dddd| _d S t | _d S )Nr	   r
   r   r   )r   r   r   r   norm1r   r   conv1norm2Dropoutr4   conv2Identitynin_shortcut)r   r   r3   r4   kwargsr   r   r   r   <   s   


zResnetBlock.__init__r   r   c                 C   sX   |}|  |}t|}| |}| |}t|}| |}| |}| |}|| S )N)r5   r   r6   r7   r4   r9   r;   )r   r   hr   r   r   r"   S   s   





zResnetBlock.forward)
r$   r%   r&   r'   floatr   r(   r)   r"   r*   r   r   r   r   r2   ;   s    r2   c                       r   )	AttnBlockr   c                    sp   t    t|| _tj||dddd| _tj||dddd| _tj||dddd| _tj||dddd| _	d S )Nr
   r   r   )
r   r   r   normr   r   qkvproj_outr   r   r   r   r   d   s   

zAttnBlock.__init__r   r   c                 C   s   |}|  |}| |}| |}| |}|j\}}}}	|||||	 }|ddd}|||||	 }t||}
|
t	|d  }
t
j|
dd}
|||||	 }|
ddd}
t||
}|||||	}| |}|| S )Nr   r   r
   g      r   )r@   rA   rB   rC   shapereshapepermuter(   bmmr'   r0   softmaxrD   )r   r   h_rA   rB   rC   bcr=   ww_r   r   r   r"   m   s$   




zAttnBlock.forwardr#   r   r   r   r   r?   c   s    	r?   c                       s`   e Zd Zdededee dedee dededed	ef fd
dZdejdejfddZ	  Z
S )Encoderr   channelschannels_multnum_res_blocksattn_resolutionsr4   
resolution
z_channelsspatial_compressionc
              	      s  t    t|| _|| _|
dd}t||
dd| _|| | }tt	
|	tt	
| | _| j| jksAJ d| j dtjj||dddd| _|| }d	t| }|| _t | _t| jD ]S}t }t }|||  }|||  }t| jD ]}|t|||d
 |}||v r|t| qt }||_||_|| jk rt||_|d }| j| qdt | _t|||d
| j_t|| j_t|||d
| j_ t!|| _"tjj||dddd| _#d S )N
patch_sizer
   patch_method	rearrangezwe can only downsample  times at mostr	   r   )r
   r   r3   r4   r   )$r   r   lennum_resolutionsrR   getr   patcherr'   mathlog2num_downsamplesr(   r   r   conv_intuple
in_ch_mult
ModuleListdownrangeappendr2   r?   Moduleblockattnr,   
downsamplemidblock_1attn_1block_2r   norm_outconv_out)r   r   rP   rQ   rR   rS   r4   rT   rU   rV   ignore_kwargsrW   curr_resre   i_levelrk   rl   block_in	block_out_rg   r   r   r   r      sZ   






zEncoder.__init__r   r   c                 C   s   |  |}| |g}t| jD ]A}t| jD ]'}| j| j| |d }t| j| jdkr9| j| j| |}|	| q|| j
k rQ|	| j| |d  q|d }| j|}| j|}| j|}| |}t|}| |}|S )Nr   )r_   rc   rh   r]   rR   rg   rk   r\   rl   ri   rb   rm   rn   ro   rp   rq   rr   r   rs   )r   r   hsrv   i_blockr=   r   r   r   r"      s&   



zEncoder.forwardr$   r%   r&   r'   listr>   r   r(   r)   r"   r*   r   r   r   r   rO      s*    	
ErO   c                       s\   e Zd Zdededee dededededed	ef fd
dZdejdejfddZ	  Z
S )Decoderr3   rP   rQ   rR   rS   r4   rT   rU   rV   c
              	      s  t    t|| _|| _|
dd}t||
dd| _|| | }tt	
|	tt	
| | _| j| jksAJ d| j d||| jd   }|| d| jd   }d|||f| _tjj||dddd	| _t | _t|||d
| j_t|| j_t|||d
| j_t | _tt| jD ]S}t }t }|||  }t| jd D ]}|t|||d
 |}||v r|t| qt }||_||_|| j| j krt||_ |d }| j!d| qt"|| _#tjj||dddd	| _$d S )NrW   r
   rX   rY   zwe can only upsample rZ   r   r	   r   r[   r   )%r   r   r\   r]   rR   r^   r   	unpatcherr'   r`   ra   num_upsamplesz_shaper(   r   r   rc   rj   rn   r2   ro   r?   rp   rq   rf   upreversedrh   ri   rk   rl   r   upsampleinsertr   rr   rs   )r   r3   rP   rQ   rR   rS   r4   rT   rU   rV   rt   rW   out_chrw   ru   rv   rk   rl   rx   ry   r   r   r   r   r      sT   





zDecoder.__init__zr   c                 C   s   |  |}| j|}| j|}| j|}tt| jD ]:}t| jd D ] }| j	| j
| |}t| j	| jdkrG| j	| j| |}q'|| j| j krX| j	| |}q| |}t|}| |}| |}|S )Nr
   r   )rc   rn   ro   rp   rq   r   rh   r]   rR   r   rk   r\   rl   r   r   rr   r   rs   r   )r   r   r=   rv   r|   r   r   r   r"   1  s$   



zDecoder.forwardr}   r   r   r   r   r      s*    	
Dr   )__doc__r`   r(   torch.nnr   torch.nn.functional
functionalr0   9nemo.collections.common.video_tokenizers.modules.patchingr   r   6nemo.collections.common.video_tokenizers.modules.utilsr   r   rj   r   r,   r2   r?   rO   r   r   r   r   r   <module>   s   

((a