o
    }o™i»2  ã                   @   s˜   d Z ddlZddlm  mZ ddlmZ e ddg¡e ddg¡dœZ	dZ
G dd	„ d	ejjƒZG d
d„ deƒZG dd„ dejjƒZG dd„ deƒZdS )a   The patcher and unpatcher implementation for 2D and 3D data.

The idea of Haar wavelet is to compute LL, LH, HL, HH component as two 1D convolutions.
One on the rows and one on the columns.
For example, in 1D signal, we have [a, b], then the low-freq compoenent is [a + b] / 2 and high-freq is [a - b] / 2.
We can use a 1D convolution with kernel [1, 1] and stride 2 to represent the L component.
For H component, we can use a 1D convolution with kernel [1, -1] and stride 2.
Although in principle, we typically only do additional Haar wavelet over the LL component. But here we do it for all
   as we need to support downsampling for more than 2x.
For example, 4x downsampling can be done by 2x Haar and additional 2x Haar, and the shape would be.
   [3, 256, 256] -> [12, 128, 128] -> [48, 64, 64]
é    N)Ú	rearrangegÍ;fž æ?g      ð?)Úhaarr   Fc                       óD   e Zd ZdZd‡ fdd„	Zdd„ Zdd
d„Zdd„ Zdd„ Z‡  Z	S )ÚPatcherae  A module to convert image tensors into patches using torch operations.

    The main difference from `class Patching` is that this module implements
    all operations using torch, rather than python or numpy, for efficiency purpose.

    It's bit-wise identical to the Patching module outputs, with the added
    benefit of being torch.jit scriptable.
    é   r   c                    ó‚   t ƒ  ¡  || _|| _| jdt| td ttt	 
t	 | j¡¡ ¡ ƒƒ| _| jdt	 t| jd ¡td |  ¡ D ]}d|_q9d S ©NÚwavelets©Ú
persistentÚ_aranger   F©ÚsuperÚ__init__Ú
patch_sizeÚpatch_methodÚregister_bufferÚ	_WAVELETSÚ_PERSISTENTÚrangeÚintÚtorchÚlog2ÚtensorÚitemÚarangeÚshapeÚ
parametersÚrequires_grad©Úselfr   r   Úparam©Ú	__class__© úm/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/video_tokenizers/modules/patching.pyr   3   ó   
 ýÿzPatcher.__init__c                 C   ó6   | j dkr
|  |¡S | j dkr|  |¡S td| j  ƒ‚©Nr   r   zUnknown patch method: )r   Ú_haarÚ_arrangeÚ
ValueError©r    Úxr$   r$   r%   ÚforwardA   ó
   



zPatcher.forwardÚreflectFc                 C   s`  |j }| j}|jd }t|jd ƒ}| d¡ ddd¡ |dd¡}|d| j   ddd¡ |dd¡}	|	j|d}	|j|d}t	j
||d |d |d |d f|d |¡}t	j|| d¡|dd}
t	j||	 d¡|dd}t	j|
| d	¡|d
d}t	j|
|	 d	¡|d
d}t	j|| d	¡|d
d}t	j||	 d	¡|d
d}tj||||gdd}|r®|d }|S )Nr   r   éÿÿÿÿ©Údtypeé   ©ÚpadÚmode©r   r4   ©ÚgroupsÚstrideé   ©r4   r   ©Údim)r3   r	   r   r   ÚflipÚreshapeÚrepeatr   ÚtoÚFr6   Úconv2dÚ	unsqueezer   Úcat)r    r-   r7   Úrescaler3   ÚhÚnÚgÚhlÚhhÚxlÚxhÚxllÚxlhÚxhlÚxhhÚoutr$   r$   r%   Ú_dwtI   s&   
".zPatcher._dwtc                 C   s   | j D ]	}| j|dd}q|S )NT©rH   )r   rU   ©r    r-   Ú_r$   r$   r%   r)   a   s   
zPatcher._haarc                 C   s   t |d| j| jd ¡ }|S )Nz$b c (h p1) (w p2) -> b (c p1 p2) h w©Úp1Úp2)r   r   Ú
contiguousr,   r$   r$   r%   r*   f   s   üûzPatcher._arrange©r   r   ©r0   F)
Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r.   rU   r)   r*   Ú__classcell__r$   r$   r"   r%   r   )   s    	
r   c                       ó<   e Zd ZdZd‡ fdd„	Zddd	„Zd
d„ Zdd„ Z‡  ZS )Ú	Patcher3DzZA 3D discrete wavelet transform for video data, expects 5D tensor, i.e. a batch of videos.r   r   c                    s6   t ƒ j||d | jd|tjdgtjd td d S )N©r   r   Úpatch_size_bufferr   r2   r
   )r   r   r   r   ÚonesÚint32r   ©r    r   r   r"   r$   r%   r   s   s   
ýzPatcher3D.__init__r0   Fc              	   C   s   |j }| j}|jd }t|jd ƒ}| d¡ ddd¡ |dd¡}	|d| j   ddd¡ |dd¡}
|
j|d}
|	j|d}	t	j
|td|d ƒ|d |d |d |d |d f|d |¡}t	j||	 d¡ d¡|d	d
}t	j||
 d¡ d¡|d	d
}t	j||	 d¡ d¡|dd
}t	j||
 d¡ d¡|dd
}t	j||	 d¡ d¡|dd
}t	j||
 d¡ d¡|dd
}t	j||	 d¡ d¡|dd
}t	j||
 d¡ d¡|dd
}t	j||	 d¡ d¡|dd
}t	j||
 d¡ d¡|dd
}t	j||	 d¡ d¡|dd
}t	j||
 d¡ d¡|dd
}t	j||	 d¡ d¡|dd
}t	j||
 d¡ d¡|dd
}tj||||||||gdd}|rN|dt t d¡¡  }|S )Nr   r   r1   r2   r4   r5   r<   é   ©r4   r   r   r9   ©r   r4   r   ©r   r   r4   r>   ç       @)r3   r	   r   r   r@   rA   rB   r   rC   rD   r6   ÚmaxÚconv3drF   r   rG   Úsqrtr   )r    r-   Úwaveletr7   rH   r3   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   ÚxlllÚxllhÚxlhlÚxlhhÚxhllÚxhlhÚxhhlÚxhhhrT   r$   r$   r%   rU   {   s6   
"@zPatcher3D._dwtc                 C   s`   t j|d|jd d gdd\}}t j|j| jdd|gdd}| jD ]
}| j|ddd}q#|S )Nr   r4   r>   r   TrV   )r   Úsplitr   rG   Úrepeat_interleaver   r   rU   )r    r-   ÚxiÚxvrX   r$   r$   r%   r)   Ÿ   s
   "
zPatcher3D._haarc                 C   s`   t j|d|jd d gdd\}}t j|j| jdd|gdd}t|d| j| j| jd ¡ }|S )Nr   r4   r>   z0b c (t p1) (h p2) (w p3) -> b (c p1 p2 p3) t h w©rZ   r[   Úp3)r   r|   r   rG   r}   r   r   r\   )r    r-   r~   r   r$   r$   r%   r*   ¦   s   "ûúzPatcher3D._arranger]   r^   )	r_   r`   ra   rb   r   rU   r)   r*   rc   r$   r$   r"   r%   re   p   s    
$re   c                       r   )Ú	UnPatcherah  A module to convert patches into image tensorsusing torch operations.

    The main difference from `class Unpatching` is that this module implements
    all operations using torch, rather than python or numpy, for efficiency purpose.

    It's bit-wise identical to the Unpatching module outputs, with the added
    benefit of being torch.jit scriptable.
    r   r   c                    r   r   r   r   r"   r$   r%   r   ½   r&   zUnPatcher.__init__c                 C   r'   r(   )r   Ú_ihaarÚ	_iarranger+   r,   r$   r$   r%   r.   Ë   r/   zUnPatcher.forwardr0   Fc              	   C   s¢  |j }| j}|jd }t|jd d ƒ}| dg¡ ddd¡ |ddg¡}	|d| j   ddd¡ |dd¡}
|
j|d}
|	j|d}	t	j
| |¡ddd\}}}}t	jjj||	 d¡|d|d	 dfd
}|t	jjj||
 d¡|d|d	 dfd
7 }t	jjj||	 d¡|d|d	 dfd
}|t	jjj||
 d¡|d|d	 dfd
7 }t	jjj||	 d	¡|dd|d	 fd
}|t	jjj||
 d	¡|dd|d	 fd
7 }|rÏ|d	 }|S )Nr   r   rk   r1   r2   r>   r<   r=   r4   )r:   r;   Úpaddingr8   )r3   r	   r   r   r@   rA   rB   r   rC   r   ÚchunkÚnnÚ
functionalÚconv_transpose2drF   )r    r-   rs   r7   rH   r3   rI   rJ   rK   rL   rM   rP   rQ   rR   rS   ÚylÚyhÚyr$   r$   r%   Ú_idwtÓ   s$   
""&*&*&*zUnPatcher._idwtc                 C   s    | j D ]
}| j|ddd}q|S )Nr   TrV   )r   r   rW   r$   r$   r%   rƒ   ì   s   
zUnPatcher._ihaarc                 C   s   t |d| j| jd}|S )Nz$b (c p1 p2) h w -> b c (h p1) (w p2)rY   ©r   r   r,   r$   r$   r%   r„   ñ   s   üzUnPatcher._iarranger]   ©r   r0   F)
r_   r`   ra   rb   r   r.   r   rƒ   r„   rc   r$   r$   r"   r%   r‚   ³   s    	
r‚   c                       rd   )ÚUnPatcher3DzIA 3D inverse discrete wavelet transform for video wavelet decompositions.r   r   c                    s   t ƒ j||d d S )Nrf   )r   r   rj   r"   r$   r%   r   þ   s   zUnPatcher3D.__init__r0   Fc                 C   s|  |j }| j}t|jd d ƒ}| dg¡ ddd¡ |ddg¡}|d| j   ddd¡ |dd¡}	|j|d}|	j|d}	t	j
|ddd\}
}}}}}}}tj|
| d¡ d¡|d	d
}|tj||	 d¡ d¡|d	d
7 }tj|| d¡ d¡|d	d
}|tj||	 d¡ d¡|d	d
7 }tj|| d¡ d¡|d	d
}|tj||	 d¡ d¡|d	d
7 }tj|| d¡ d¡|d	d
}|tj||	 d¡ d¡|d	d
7 }tj|| d¡ d¡|dd
}|tj||	 d¡ d¡|dd
7 }tj|| d¡ d¡|dd
}|tj||	 d¡ d¡|dd
7 }tj|| d¡ d¡|dd
}|tj||	 d¡ d¡|dd
7 }|r<|dt	 t	 d¡¡  }|S )Nr   é   r   r1   r2   r>   r4   r<   rn   r9   rk   rm   rl   ro   )r3   r	   r   r   r@   rA   rB   r   rC   r   r†   rD   Úconv_transpose3drF   rr   r   )r    r-   rs   r7   rH   r3   rI   rK   rL   rM   rt   ru   rv   rw   rx   ry   rz   r{   rP   rQ   rR   rS   rN   rO   r$   r$   r%   r     s2   "" """""""zUnPatcher3D._idwtc                 C   sB   | j D ]
}| j|ddd}q|d d …d d …| jd d …df }|S )Nr   TrV   r   .)r   r   r   rW   r$   r$   r%   rƒ   (  s   
"zUnPatcher3D._ihaarc                 C   s>   t |d| j| j| jd}|d d …d d …| jd d …df }|S )Nz0b (c p1 p2 p3) t h w -> b c (t p1) (h p2) (w p3)r€   r   .rŽ   r,   r$   r$   r%   r„   .  s   û"zUnPatcher3D._iarranger]   r   )	r_   r`   ra   rb   r   r   rƒ   r„   rc   r$   r$   r"   r%   r   û   s    
'r   )rb   r   Útorch.nn.functionalr‡   rˆ   rD   Úeinopsr   r   r   r   ÚModuler   re   r‚   r   r$   r$   r$   r%   Ú<module>   s   þGCH