o
    پi]                     @   s  d Z ddlZddlZddlmZmZmZmZmZm	Z	 ddl
Z
ddl
mZ ddlm  mZ ddlmZmZ ddlmZ ddlmZ eeZG d	d
 d
ejZG dd deZ			d)dee dededefddZe
j Z!e!fdeeef deeef dedede
j"de
j#de
j$fddZ%e!fde
j$de
j$deeef de
j#d e
j#de
j$fd!d"Z&			d)de
j$dee dededef
d#d$Z'G d%d& d&ejZ(G d'd( d(ejZ)dS )*a?   Image to Patch Embedding using Conv2d

A convolution based approach to patchifying a 2D image w/ embedding projection.

Based on code in:
  * https://github.com/google-research/vision_transformer
  * https://github.com/google-research/big_vision/tree/main/big_vision

Hacked together by / Copyright 2020 Ross Wightman
    N)CallableDictListOptionalTupleUnion)nn   )Formatnchw_to)	to_2tuple)_assertc                       s:  e Zd ZU dZeed< ejje	 ed< 													
d de
eeeef f dedededee de	dee de	de	de	f fddZde
eeeef f fddZ		d!dee
eeeef f  dee
eeeef f  fddZd"de
eeef ef fddZdeeef deeef fddZdd Z  ZS )#
PatchEmbed! 2D Image to Patch Embedding
    
output_fmtdynamic_img_pad            NTFimg_size
patch_sizein_chans	embed_dim
norm_layerflattenbiasstrict_img_sizec                    s   t    t|| _| |\| _| _| _|d ur"d| _t	|| _
n|| _t	j| _
|	| _|
| _tj|||||d| _|rC||| _d S t | _d S )NFkernel_sizestrider   )super__init__r   r   _init_img_sizer   	grid_sizenum_patchesr   r
   r   NCHWr   r   r   Conv2dprojIdentitynorm)selfr   r   r   r   r   r   r   r   r   r   	__class__ K/home/ubuntu/.local/lib/python3.10/site-packages/timm/layers/patch_embed.pyr"       s   

 zPatchEmbed.__init__c                 C   sR   | j sJ |d u rdS t|}tdd t|| j D }|d |d  }|||fS )N)NNNc                 S   s   g | ]\}}|| qS r.   r.   ).0spr.   r.   r/   
<listcomp>C   s    z-PatchEmbed._init_img_size.<locals>.<listcomp>r   r	   )r   r   tuplezip)r+   r   r$   r%   r.   r.   r/   r#   >   s   

zPatchEmbed._init_img_sizec                 C   s   d }|d ur
t |}|d urZ|| jkrZt 8 tj| jj| jj||| jj	d ud}|j
t| jj
|dd | jj	d urE|j	| jj	 || _W d    n1 sRw   Y  || _|p^| j}|| jksh|d uru| |\| _| _| _d S d S )Nr   T)verbose)r   r   torchno_gradr   r'   r(   in_channelsout_channelsr   weightcopy_resample_patch_embedr   r#   r$   r%   )r+   r   r   new_patch_sizenew_projr.   r.   r/   set_input_sizeG   s,   


zPatchEmbed.set_input_sizereturnc                 C   s   |rt | jS | jS N)maxr   )r+   	as_scalarr.   r.   r/   
feat_ratioa   s   
zPatchEmbed.feat_ratioc                 C   sZ   | j rt|d | jd  t|d | jd  fS |d | jd  |d | jd  fS )z Get grid (feature) size for given image size taking account of dynamic padding.
        NOTE: must be torchscript compatible so using fixed tuple indexing
        r   r	   )r   mathceilr   )r+   r   r.   r.   r/   dynamic_feat_sizeg   s   0$zPatchEmbed.dynamic_feat_sizec                 C   sv  |j \}}}}| jd urg| jr8t|| jd kd| d| jd  d t|| jd kd| d| jd  d n/| jsgt|| jd  dkd| d| jd  d t|| jd  dkd| d| jd  d | jr| jd || jd   | jd  }| jd || jd   | jd  }t|d|d|f}| |}| j	r|	d
dd}n| jtjkrt|| j}| |}|S )	Nr   zInput height (z) doesn't match model ().r	   zInput width (z%) should be divisible by patch size (   )shaper   r   r   r   r   Fpadr(   r   	transposer   r
   r&   r   r*   )r+   xBCHWpad_hpad_wr.   r.   r/   forwardp   s2   
(*""

zPatchEmbed.forward)
r   r   r   r   NTNTTF)NN)T)__name__
__module____qualname____doc__r
   __annotations__r7   jitFinalboolr   intr   r   r   strr"   r#   r@   rE   rH   rV   __classcell__r.   r.   r,   r/   r      s\   
 	

 "	r   c                       s   e Zd ZU dZeed< 								dd	ee d
edededee de	dee
 de	f fddZdeejee f fddZ  ZS )PatchEmbedWithSizer   r   r   r   r   r   NTr   r   r   r   r   r   r   c	           	   
      s    t  j||||||||d d S )N)r   r   r   r   r   r   r   r   )r!   r"   )	r+   r   r   r   r   r   r   r   r   r,   r.   r/   r"      s   
zPatchEmbedWithSize.__init__rA   c                 C   s   |j \}}}}| jd ur8t|| jd  dkd| d| jd  d t|| jd  dkd| d| jd  d | |}|j dd  }| jrQ|ddd}n| jtj	kr]t
|| j}| |}||fS )	Nr   zInput image height (z#) must be divisible by patch size (rI   r	   zInput image width (rJ   )rK   r   r   r   r(   r   rN   r   r
   r&   r   r*   )r+   rO   rP   rQ   rR   rS   	feat_sizer.   r.   r/   rV      s   
,,

zPatchEmbedWithSize.forward)r   r   r   r   NTNT)rW   rX   rY   rZ   r
   r[   r   r_   r   r^   r`   r"   r   r7   Tensorr   rV   ra   r.   r.   r,   r/   rb      s:   
 	$rb   bicubicTFnew_sizeinterpolation	antialiasr6   c              	      s.  ddl zddlm} W n ty   ddlm} Y nw t| jdks'J dtdks1J d| jdd }t|tkrB| S |rTt	d	| j d
 d d  fddfdd}||}tj
j|j| jdfdd}	|||	dddd}
| j}|  } |
| } | |} | S )a/  Resample the weights of the patch embedding kernel to target resolution.
    We resample the patch embedding kernel by approximately inverting the effect
    of patch resizing.

    Code based on:
      https://github.com/google-research/big_vision/blob/b00544b81f8694488d5f36295aeb7972f3755ffe/big_vision/models/proj/flexi/vit.py

    With this resizing, we can for example load a B/8 filter into a B/16 model
    and, on 2x larger input image, the result will match.

    Args:
        patch_embed: original parameter to be resized.
        new_size (tuple(int, int): target shape (height, width)-only.
        interpolation (str): interpolation for resize
        antialias (bool): use anti-aliasing filter in resize
        verbose (bool): log operation
    Returns:
        Resized patch embedding kernel.
    r   N)vmap   zFour dimensions expectedrJ   zNew shape should only be hwrc   zResize patch embedding z to z, w/ z interpolation.c                    s,   t | d }tj|| dd  }|S )N)NN.)sizemoderi   )r   r   .)r7   re   rL   interpolatenumpy)x_np	_new_sizex_tfx_upsampled)ri   rh   r.   r/   resize   s   z(resample_patch_embed_old.<locals>.resizec                    sT   g }t  | D ]} | }d| || < |||d q	 |jS )Ng      ?)rangeprodzerosunravel_indexappendreshapestackT)	_old_sizerq   mati	basis_vec)nprt   r.   r/   get_resize_mat   s   
z0resample_patch_embed_old.<locals>.get_resize_mat)devicec                    s   |  d }|  S )Nru   )r{   )kernelresampled_kernel)rg   resize_mat_pinvr.   r/   resample_kernel   s   
z1resample_patch_embed_old.<locals>.resample_kernelr	   )ro   r7   rj   ImportError	functorchlenrK   r4   _loggerinfotensorlinalgpinvr}   r   dtypefloatto)patch_embedrg   rh   ri   r6   rj   old_sizer   
resize_matr   v_resample_kernel
orig_dtyper.   )ri   rh   rg   r   rt   r   r/   resample_patch_embed_old   s0    

r   r   r   r   rA   c                 C   st   | \}}|\}}	|| }
||	 }t j|
||d}||
d||}tj||||dd}|dddd||
}|S )zKComputes the resize matrix basis vectors and interpolates them to new_size.)r   r   r	   F)rl   rm   ri   align_cornersrJ   r   )r7   eyer{   rL   rn   squeezepermute)r   rg   rh   ri   r   r   old_hold_wnew_hnew_w	old_total	new_total
eye_matrixbasis_vectors_batchresized_basis_vectors_batchresize_matrixr.   r.   r/   _compute_resize_matrix  s   	r   r   pinv_matrixnew_size_tupler   intermediate_dtypec           	      C   sV   | j ^}}}| ||dj|d} |j|d}| | }|j||g|R  j|d}|S )zW Simplified resampling w/o vmap use.
    As proposed by https://github.com/stas-sl
    ru   )r   )rK   r{   r   )	r   r   r   r   r   c_outc_in_resampled_patch_embedr.   r.   r/   _apply_resampling  s   
r   c                 C   s   t | jdksJ dt |dksJ dt| jdd }t|}||kr(| S | j}| j}t|||||t}	tj	|	}
t
| |
||t}|S )z5 Standalone function (computes matrix on each call). rk   z/Input tensor should be 4D (out_ch, in_ch, h, w)rJ   z+New shape should only be hw (height, width)rc   N)r   rK   r4   r   r   r   DTYPE_INTERMEDIATEr7   r   r   r   )r   rg   rh   ri   r6   old_size_tupler   r   r   r   r   r   r.   r.   r/   r=   .  s    
r=   c                	       s   e Zd ZdZ		ddeeef dedef fddZe	fd	eeef d
e
jde
jde
jfddZde
jd	ee de
jfddZ  ZS ) PatchEmbedResamplerFixedOrigSizez
    Resample patch embedding weights from a fixed original size,
    caching the pseudoinverse matrix based on the target size.
    rf   T	orig_sizerh   ri   c                    sD   t    t|trt|dksJ d|| _|| _|| _i | _dS )z
        Args:
            orig_size (Tuple[int, int]): The expected original (height, width) of input patch_embed tensors.
            interpolation (str): Interpolation mode.
            antialias (bool): Use anti-aliasing filter in resize.
        rJ   z.`orig_size` must be a tuple of (height, width)N)	r!   r"   
isinstancer4   r   r   rh   ri   _pinv_cache_map)r+   r   rh   ri   r,   r.   r/   r"   Q  s   

z)PatchEmbedResamplerFixedOrigSize.__init__rg   r   r   rA   c                 C   s   |}| j |}|r t| |r t| |}|j|kr |j|kr |S t| j|| j| j	||}t
j|}d|d  d|d  }t| |rHt| | | || || j |< |S )zRRetrieves the cached pinv matrix or computes and caches it for the given new_size.pinv_r   rO   r	   )r   gethasattrgetattrr   r   r   r   rh   ri   r7   r   r   delattrregister_buffer)r+   rg   r   r   	cache_keybuffer_namer   r   r.   r.   r/   _get_or_create_pinv_matrixf  s    



z;PatchEmbedResamplerFixedOrigSize._get_or_create_pinv_matrixr   c           	      C   s   t |jdks	J t |dksJ t|jdd }|| jks*J d| d| j t|}| j|kr5|S |j}|j}| ||}t||||}|S )a   Resamples the patch embedding weights to new_size.

        Args:
            patch_embed (torch.Tensor): Original weights (out_ch, in_ch, H_orig, W_orig).
            new_size (List[int]): Target [height, width].

        Returns:
            torch.Tensor: Resampled weights.
        rk   rJ   rc   NzInput patch_embed spatial size z0 does not match module's expected original size )r   rK   r4   r   r   r   r   r   )	r+   r   rg   
input_sizer   r   r   r   r   r.   r.   r/   rV     s    

z(PatchEmbedResamplerFixedOrigSize.forward)rf   T)rW   rX   rY   rZ   r   r_   r`   r^   r"   r   r7   r   r   re   r   r   rV   ra   r.   r.   r,   r/   r   L  s,    


&r   c                       s   e Zd ZdZ				ddeeef deded	ed
ef
 fddZde	j
deeef de	j
fddZde	j
deeef de	j
fddZ			dde	j
de	j
dee	j
 deeeef  dede	j
fddZ  ZS )PatchEmbedInterpolatora8  Dynamically interpolates patch embedding weights for variable patch sizes.

    This module wraps patch embedding weight resampling functionality to support
    on-the-fly patch size variation during training. It handles both Conv2d and
    Linear patch embeddings.

    Args:
        base_patch_size: The original patch size the model was initialized with
        in_chans: Number of input channels
        embed_dim: Embedding dimension
        interpolation: Interpolation mode for resampling
        antialias: Whether to use antialiasing during interpolation
    r   r   rf   Tbase_patch_sizer   r   rh   ri   c                    s,   t    || _|| _|| _|| _|| _d S rB   )r!   r"   r   r   r   rh   ri   )r+   r   r   r   rh   ri   r,   r.   r/   r"     s   

zPatchEmbedInterpolator.__init__r;   target_patch_sizerA   c                 C   s   || j kr|S |jd }| j \}}|\}}||||| j}|dddd}t|||g| j| jdd}	|	dddd}
|
|d}
|
S )a/  Resample linear patch embedding weights for a new patch size.

        Args:
            weight: Linear weight tensor of shape [embed_dim, patch_h * patch_w * in_chans]
            target_patch_size: Target (patch_h, patch_w) to resample to

        Returns:
            Resampled weight tensor
        r   r   r	   rJ   Frg   rh   ri   r6   ru   )r   rK   r{   r   r   r=   rh   ri   )r+   r;   r   r   base_phbase_pw	target_ph	target_pwweight_convweight_conv_resampledweight_resampledr.   r.   r/   resample_linear_weight  s"   



z-PatchEmbedInterpolator.resample_linear_weightc                 C   s,   || j kr|S t|t|| j| jdd}|S )a-  Resample conv2d patch embedding weights for a new patch size.

        Args:
            weight: Conv2d weight tensor of shape [embed_dim, in_chans, patch_h, patch_w]
            target_patch_size: Target (patch_h, patch_w) to resample to

        Returns:
            Resampled weight tensor
        Fr   )r   r=   listrh   ri   )r+   r;   r   r   r.   r.   r/   resample_conv_weight  s   
z+PatchEmbedInterpolator.resample_conv_weightNpatchesproj_weight	proj_biasr   	is_linearc                 C   s   |du r| j }|rV|| j kr7|jdksJ d|j\}}}}	}
| ||}|||d}tjj|||}|S |jdkrK|j\}}}}	}
|||d}tjj|||}|S || j kro| 	||}tjjj
||||dd}|S tjjj
||||dd}|S )aW  Apply patch embedding with dynamic weight resampling.

        Args:
            patches: Input patches
                - For linear mode with resampling: [B, N, Ph, Pw, C]
                - For linear mode without resampling: [B, N, Ph*Pw*C]
                - For conv mode: [B, C, H, W]
            proj_weight: Original projection weight
            proj_bias: Optional projection bias
            patch_size: Current patch size (if None, uses base_patch_size)
            is_linear: Whether using linear (True) or conv2d (False) projection

        Returns:
            Embedded patches
        N   z0Patches must be [B, N, Ph, Pw, C] for resamplingru   r   )r    padding)r   ndimrK   r   r{   r7   r   
functionallinearr   conv2d)r+   r   r   r   r   r   rP   NPhPwrQ   r   patches_flatoutputr.   r.   r/   rV     s6   



zPatchEmbedInterpolator.forward)r   r   rf   T)NNT)rW   rX   rY   rZ   r   r_   r`   r^   r"   r7   re   r   r   r   rV   ra   r.   r.   r,   r/   r     s^    


*

 r   )rf   TF)*rZ   loggingrF   typingr   r   r   r   r   r   r7   r   torch.nn.functionalr   rL   formatr
   r   helpersr   trace_utilsr   	getLoggerrW   r   Moduler   rb   r_   r`   r^   r   float32r   r   r   re   r   r   r=   r   r   r.   r.   r.   r/   <module>   s    
 
r/
F	


 


]