o
    }oi                     @   sx  d Z ddlZddlmZmZ ddlZddlmZ ddlm  m	Z
 ddlmZmZmZmZ ddlmZmZmZmZmZmZmZmZmZ dZG dd dejZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#G dd dejZ$G dd dejZ%G dd dejZ&G dd dejZ'G dd  d ejZ(dS )!aH  The model definition for 3D layers

Adapted from: https://github.com/lucidrains/magvit2-pytorch/blob/
9f49074179c912736e617d61b32be367eb5f993a/magvit2_pytorch/magvit2_pytorch.py#L889

[MIT License Copyright (c) 2023 Phil Wang]
https://github.com/lucidrains/magvit2-pytorch/blob/
9f49074179c912736e617d61b32be367eb5f993a/LICENSE
    N)TupleUnion)Patcher	Patcher3D	UnPatcherUnPatcher3D)	CausalNormalizebatch2space
batch2time
cast_tupleis_oddnonlinearityreplication_padspace2batch
time2batch    c                       sv   e Zd Z				ddededeeeeeef f def fdd	Zd
ej	dej	fddZ
d
ej	dej	fddZ  ZS )CausalConv3d      constantchan_inchan_outkernel_sizepad_modec                    s   t    t|d}|\}}}t|rt|sJ |dd}	|dd}
|dd}|dd}|dd}|| _||d  d|  }|| _||||f| _||
|
f}
||	|	f}	tj	|||f|
|	d|| _
d S )	Nr   dilationr   stridetime_stridetime_dilationpadding)r   r   )super__init__r   r   popr   time_padspatial_padnnConv3dconv3d)selfr   r   r   r   kwargstime_kernel_sizeheight_kernel_sizewidth_kernel_sizer   r   r   r   r   r"   	__class__ m/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/video_tokenizers/modules/layers3d.pyr    2   s2   




zCausalConv3d.__init__xreturnc                 C   s\   |d d d d d ddf  dd| jdd}tj||gdd}| jd }tj||| jddS )Nr   .   dim)r   r   g        modevalue)repeatr"   torchcatr#   Fpadr   )r'   r0   x_prevr   r.   r.   r/   _replication_padX   s   ,
zCausalConv3d._replication_padc                 C   s   |  |}| |S N)r>   r&   )r'   r0   r.   r.   r/   forward^   s   

zCausalConv3d.forward)r   r   r   r   )__name__
__module____qualname__intr   r   strr    r9   Tensorr>   r@   __classcell__r.   r.   r,   r/   r   1   s     &r   c                       <   e Zd Zdeddf fddZdejdejfddZ  ZS )	CausalUpsample3din_channelsr1   Nc                    s"   t    t||dddd| _d S )Nr   r   r   r   r   r   r    r   convr'   rJ   r,   r.   r/   r    d   s   
zCausalUpsample3d.__init__r0   c                 C   s   |j dddj ddd}dd|jd dk  }t|tjr!| }|j t|dd}| |}|dt|d d d d d d f S )Nr2   r   r3            ?r   .)repeat_interleaveshape
isinstancer9   rF   itemrD   rM   r'   r0   time_factorr.   r.   r/   r@   h   s   
$zCausalUpsample3d.forward	rA   rB   rC   rD   r    r9   rF   r@   rG   r.   r.   r,   r/   rI   c   s    rI   c                       rH   )	CausalDownsample3drJ   r1   Nc                    s$   t    t||ddddd| _d S )Nr   r2   r   r   r   r   r   rL   rN   r,   r.   r/   r    v   s   
zCausalDownsample3d.__init__r0   c                 C   s,   d}t j||ddd}t|}| |}|S )Nr   r   r   r   r   r   r   r   r5   )r;   r<   r   rM   )r'   r0   r<   r.   r.   r/   r@      s
   
zCausalDownsample3d.forwardrW   r.   r.   r,   r/   rX   u   s    rX   c                	       J   e Zd Z		ddedededdf fddZd	ejdejfd
dZ  Z	S )CausalHybridUpsample3dTrJ   
spatial_uptemporal_upr1   Nc                    s\   t    t||ddddd| _t||ddddd| _t||ddddd| _|| _|| _d S )Nr   r   r   r   r   rY   r   r   r   )r   r    r   conv1conv2conv3r]   r^   )r'   rJ   r]   r^   r(   r,   r.   r/   r       6   

zCausalHybridUpsample3d.__init__r0   c                 C   s   | j s| js|S | jrBdd|jd dk  }t|tjr | }|jt|dd}|dt|d d d d d d f }| 	|| }| j rX|jdddjddd}| 
|| }| |}|S )NrP   r2   r   r3   .r   rO   )r]   r^   rR   rS   r9   rF   rT   rQ   rD   ra   rb   rc   rU   r.   r.   r/   r@      s   $
zCausalHybridUpsample3d.forwardTT
rA   rB   rC   rD   boolr    r9   rF   r@   rG   r.   r.   r,   r/   r\          #r\   c                	       r[   )CausalHybridDownsample3dTrJ   spatial_downtemporal_downr1   Nc                    s\   t    t||ddddd| _t||ddddd| _t||ddddd| _|| _|| _d S )Nr`   r2   r   r   rY   r_   )r   r    r   ra   rb   rc   rj   rk   )r'   rJ   rj   rk   r(   r,   r.   r/   r       rd   z!CausalHybridDownsample3d.__init__r0   c                 C   s   | j s| js|S | j r'd}tj||ddd}| |}tj|ddd}|| }| jr?t|}| |}tj|ddd}|| }| |}|S )NrZ   r   r   r5   )r   r2   r2   r   r   r2   r   r   )	rj   rk   r;   r<   ra   
avg_pool3dr   rb   rc   )r'   r0   r<   x1x2r.   r.   r/   r@      s   


z CausalHybridDownsample3d.forwardre   rf   r.   r.   r,   r/   ri      rh   ri   c                       N   e Zd Zdddededededdf
 fdd	Zd
ejdejfddZ  Z	S )CausalResnetBlock3dNout_channelsrJ   rt   dropout
num_groupsr1   c                   s   t    || _|d u r|n|}t||d| _t||dddd| _t||d| _tj	
|| _t||dddd| _||krIt||dddd| _d S t	 | _d S )Nrv   r   r   rK   r   )r   r    rJ   r   norm1r   ra   norm2r9   r$   Dropoutru   rb   Identitynin_shortcutr'   rJ   rt   ru   rv   r,   r.   r/   r      s   
zCausalResnetBlock3d.__init__r0   c                 C   X   |}|  |}t|}| |}| |}t|}| |}| |}| |}|| S r?   rx   r   ra   ry   ru   rb   r|   r'   r0   hr.   r.   r/   r@        





zCausalResnetBlock3d.forward
rA   rB   rC   rD   floatr    r9   rF   r@   rG   r.   r.   r,   r/   rr      s    rr   c                       rq   )CausalResnetBlockFactorized3dNrs   rJ   rt   ru   rv   r1   c             
      s   t    || _|d u r|n|}t|dd| _tt||ddddt||dddd| _t||d| _	t
j|| _tt||ddddt||dddd| _||kr_t||dddd| _d S t | _d S )Nr   rw   r`   rK   r_   r   )r   r    rJ   r   rx   r$   
Sequentialr   ra   ry   r9   rz   ru   rb   r{   r|   r}   r,   r.   r/   r    (  sV   
z&CausalResnetBlockFactorized3d.__init__r0   c                 C   r~   r?   r   r   r.   r.   r/   r@   ]  r   z%CausalResnetBlockFactorized3d.forwardr   r.   r.   r,   r/   r   '  s    5r   c                       @   e Zd Zdededdf fddZdejdejfdd	Z  ZS )
CausalAttnBlockrJ   rv   r1   Nc                    l   t    t||d| _t||dddd| _t||dddd| _t||dddd| _t||dddd| _d S Nrw   r   r   rK   	r   r    r   normr   qkvproj_outr'   rJ   rv   r,   r.   r/   r    m     
zCausalAttnBlock.__init__r0   c                 C   s  |}|  |}| |}| |}| |}t|\}}t|\}}t|\}}|j\}}}	}
||||	|
 }|ddd}||||	|
 }t	||}|t
|d  }tj|dd}||||	|
 }|ddd}t	||}||||	|
}t||}| |}|| S )Nr   r2   r         r3   )r   r   r   r   r   rR   reshapepermuter9   bmmrD   r;   softmaxr
   r   )r'   r0   h_r   r   r   
batch_sizebcr   ww_r.   r.   r/   r@   v  s,   





zCausalAttnBlock.forwardrW   r.   r.   r,   r/   r   l      	r   c                       r   )
CausalTemporalAttnBlockrJ   rv   r1   Nc                    r   r   r   r   r,   r.   r/   r      r   z CausalTemporalAttnBlock.__init__r0   c                 C   s&  |}|  |}| |}| |}| |}t|\}}}t|\}}}t|\}}}|j\}	}
}|ddd}|ddd}|ddd}t||ddd}|t	|
d  }t
t|}||dktd}tj|dd}t||}|ddd|	|
|}t|||}| |}|| S )Nr   r2   r   r   z-infr3   )r   r   r   r   r   rR   r   r9   r   rD   tril	ones_likemasked_fillr   r;   r   r   r	   r   )r'   r0   r   r   r   r   r   height_bhwr   tr   maskr.   r.   r/   r@     s,   




zCausalTemporalAttnBlock.forwardrW   r.   r.   r,   r/   r     r   r   c                       sv   e Zd Zdededee dedee dededed	d
f fddZdejd	ejfddZ	dejd	ejfddZ
  ZS )EncoderBaserJ   channelschannels_multnum_res_blocksattn_resolutionsru   
resolution
z_channelsr1   Nc	              
      s  t    t|| _|| _|	dd}
t|
|	dd| _||
 |
 }t||dddd| _	|	dt
}||
 }dt| }|| _t | _t| jD ]X}t }t }|||  }|||  }t| jD ]}|t||||d	 |}||v r|t||d
 qet }||_||_|| jd krt||_|d }| j| qJt | _t||||d	| j_t||d
| j_t||||d	| j_t||d
| _t||dddd| _d S )N
patch_sizer   patch_method	rearranger   rK   rv   r   rJ   rt   ru   rv   rw   r2   ) r   r    lennum_resolutionsr   getr   patcherr   conv_in_LEGACY_NUM_GROUPStuple
in_ch_multr$   
ModuleListdownrangeappendrr   r   ModuleblockattnrX   
downsamplemidblock_1attn_1block_2r   norm_outconv_out)r'   rJ   r   r   r   r   ru   r   r   ignore_kwargsr   rv   curr_resr   i_levelr   r   block_in	block_outr   r   r,   r.   r/   r      sj   






zEncoderBase.__init__r0   c                 C   $   t |\}}| |}t||}|S r?   )r   r   r
   r'   r0   r   r.   r.   r/   	patcher3d  s   

zEncoderBase.patcher3dc                 C   sX  |  |}| |g}t| jD ]u}t| jD ]'}| j| j| |d }t| j| jdkr9| j| j| |}|	| q|| jd krT|	| j| 
|d  qdd|d jd dk  }t|tjrk| }t|d |d< |	tj|d |ddgg dd q|d }| j|}| j|}| j|}| |}t|}| |}|S )Nr   r   r2   rm   rl   )r   r   r   r   r   r   r   r   r   r   r   rR   rS   r9   rF   rT   r   r;   rn   r   r   r   r   r   r   r   )r'   r0   hsr   i_blockr   rV   r.   r.   r/   r@     s:   
	

zEncoderBase.forward)rA   rB   rC   rD   listr   r    r9   rF   r   r@   rG   r.   r.   r,   r/   r     s,    	Kr   c                       sd   e Zd Zdededee dedee dededef fd	d
ZdejdejfddZ	dd Z
  ZS )DecoderBasert   r   r   r   r   ru   r   r   c	              
      s  t    t|| _|| _|	dd}
t|
|	dd| _||
 |
 }||| jd   }||
 d| jd   }d|||f| _t	||dddd| _
|	dt}t | _t||||d	| j_t||d
| j_t||||d	| j_t | _tt| jD ]R}t }t }|||  }t| jd D ]}|t||||d	 |}||v r|t||d
 qt }||_||_|dkrt||_|d }| jd| q{t||d
| _t	||dddd| _ d S )Nr   r   r   r   r2   r   rK   rv   r   rw   r   )!r   r    r   r   r   r   r   	unpatcherz_shaper   r   r   r$   r   r   rr   r   r   r   r   r   upreversedr   r   r   r   rI   upsampleinsertr   r   r   )r'   rt   r   r   r   r   ru   r   r   r   r   out_chr   r   rv   r   r   r   r   r   r   r,   r.   r/   r    <  sh   






zDecoderBase.__init__r0   r1   c                 C   r   r?   )r   r   r
   r   r.   r.   r/   unpatcher3d  s   

zDecoderBase.unpatcher3dc                 C   s6  |  |}| j|}| j|}| j|}tt| jD ]g}t| jd D ] }| j	| j
| |}t| j	| jdkrG| j	| j| |}q'|dkrU| j	| |}qdd|jd dk  }t|tjrj| }|jt|dd}|dt|d d d d d d f }q| |}t|}| |}| |}|S )Nr   r   rP   r2   r3   .)r   r   r   r   r   r   r   r   r   r   r   r   r   r   rR   rS   r9   rF   rT   rQ   rD   r   r   r   r   )r'   zr   r   r   rV   r.   r.   r/   r@     s,   
&


zDecoderBase.forward)rA   rB   rC   rD   r   r   r    r9   rF   r   r@   rG   r.   r.   r,   r/   r   ;  s(    	Mr   c                       sn   e Zd Z		ddededee dedee ded	ed
edededdf fddZdejdejfddZ	  Z
S )EncoderFactorized      rJ   r   r   r   r   ru   r   r   spatial_compressiontemporal_compressionr1   Nc                    s  t    t|| _|| _|dd}t||dd| _|| | | }tt	
|	tt	
| | _| j| jksCJ d| j dtt	
|
tt	
| | _| j| jksaJ d| j dtt||dddd	t||d
ddd	| _|| }dt| }|| _t | _t| jD ]m}t }t }|||  }|||  }t| jD ]$}|t|||dd |}||v r|tt|ddt|dd qt }||_||_|| jd kr|| jk }|| jk }t|||d|_|d }| j| qt | _t|||dd| j_ tt|ddt|dd| j_!t|||dd| j_"t#|dd| _$tt||dddd	t||d
ddd	| _%d S )Nr   r   r   r   zSpatially downsample  times at mostzTemporally downsample r`   rK   r_   r   r   r   rw   )rj   rk   r2   )&r   r    r   r   r   r   r   r   rD   mathlog2num_spatial_downsnum_temporal_downsr$   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ri   r   r   r   r   r   r   r   r   )r'   rJ   r   r   r   r   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rj   rk   r,   r.   r/   r      s   












zEncoderFactorized.__init__r0   c                 C   s   |  |}| |g}t| jD ]C}t| jD ]'}| j| j| |d }t| j| jdkr9| j| j| |}|	| q|| jd krS|	| j| 
|d  q|d }| j|}| j|}| j|}| |}t|}| |}|S )Nr   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r'   r0   r   r   r   r   r.   r.   r/   r@   &  s&   


zEncoderFactorized.forwardr   r   )rA   rB   rC   rD   r   r   r    r9   rF   r@   rG   r.   r.   r,   r/   r     s6    	
vr   c                       s\   e Zd Z		ddededee dedee ded	ed
ededef fddZdd Z  ZS )DecoderFactorizedr   r   rt   r   r   r   r   ru   r   r   r   r   c                    s  t    t|| _|| _|dd}t||dd| _|| | | }tt	
|	tt	
| | _| j| jksCJ d| j dtt	
|
tt	
| | _| j| jksaJ d| j d||| jd   }|| d| jd   }d|||f| _tt||d	ddd
t||dddd
| _t | _t|||dd| j_tt|ddt|dd| j_t|||dd| j_|dd}t | _tt| jD ]}t }t }|||  }t| jd D ]%}|t|||dd |}||v r|tt|ddt|dd qt }||_||_ |dkrS| j| d }|r(|| jk }nd|  k o5| jd k n  }|pF|| jk oF| j| jk}t!|||d|_"|d }| j#d| qt$|dd| _%tt||d	ddd
t||dddd
| _&d S )Nr   r   r   r   zSpatially upsample r   zTemporally upsample r2   r`   rK   r_   r   r   rw   legacy_modeF)r]   r^   )'r   r    r   r   r   r   r   r   rD   r   r   num_spatial_upsnum_temporal_upsr   r$   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r\   r   r   r   r   r   )r'   rt   r   r   r   r   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   i_level_reverser^   r]   r,   r.   r/   r    B  s   











 
zDecoderFactorized.__init__c                 C   s   |  |}| j|}| j|}| j|}tt| jD ]6}t| jd D ] }| j	| j
| |}t| j	| jdkrG| j	| j| |}q'|dkrT| j	| |}q| |}t|}| |}| |}|S )Nr   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r'   r   r   r   r   r.   r.   r/   r@     s$   



zDecoderFactorized.forwardr   )	rA   rB   rC   rD   r   r   r    r@   rG   r.   r.   r,   r/   r   A  s2    	
or   ))__doc__r   typingr   r   r9   torch.nnr$   torch.nn.functional
functionalr;   9nemo.collections.common.video_tokenizers.modules.patchingr   r   r   r   6nemo.collections.common.video_tokenizers.modules.utilsr   r	   r
   r   r   r   r   r   r   r   r   r   rI   rX   r\   ri   rr   r   r   r   r   r   r   r   r.   r.   r.   r/   <module>   s.   	,2;<'E),zt 