o
    ei                  	   @   s  d Z ddlZddlZddlmZ ddlZddlmZ ddlm	Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ eeZeeddG dd deZeeddG dd deZ eeddG dd deZ!eeddG dd deZ"dd Z#dd Z$G d d! d!ej%Z&G d"d# d#ej%Z'G d$d% d%ej%Z(dNd(ej)d)e*d*e+d+ej)fd,d-Z,G d.d/ d/ej%Z-G d0d1 d1ej%Z.G d2d3 d3ej%Z/G d4d5 d5ej%Z0G d6d7 d7ej%Z1G d8d9 d9ej%Z2G d:d; d;ej%Z3G d<d= d=eZ4G d>d? d?ej%Z5eG d@dA dAeZ6eG dBdC dCe6Z7edDdG dEdF dFe6Z8edGdG dHdI dIe6Z9edJdG dKdL dLee6Z:g dMZ;dS )OzPyTorch Swin Transformer model.    N)	dataclass)nn   )initialization)ACT2FN)BackboneMixin)GradientCheckpointingLayer)BackboneOutput)PreTrainedModel)ModelOutputauto_docstringlogging	torch_int   )
SwinConfigzN
    Swin encoder's outputs, with potential hidden states and attentions.
    )custom_introc                   @   sr   e Zd ZU dZdZejdB ed< dZe	ejdf dB ed< dZ
e	ejdf dB ed< dZe	ejdf dB ed< dS )SwinEncoderOutputa  
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   tupler   r    r   r   d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.pyr   '   s   
 	r   zV
    Swin model's outputs that also contains a pooling of the last hidden states.
    c                   @      e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ejdf dB ed< dZe
ejdf dB ed< dZe
ejdf dB ed< dS )	SwinModelOutputa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr   pooler_output.r   r   r   )r   r   r   r   r   r   r   r   r#   r   r   r   r   r   r   r   r    r"   =   s   
 r"   z*
    Swin masked image model outputs.
    c                   @   r!   )	SwinMaskedImageModelingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
        Masked image modeling (MLM) loss.
    reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Reconstructed pixel values.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlossreconstruction.r   r   r   )r   r   r   r   r%   r   r   r   r&   r   r   r   r   r   r   r   r    r$   V      
 r$   z0
    Swin outputs for image classification.
    c                   @   r!   )	SwinImageClassifierOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr%   logits.r   r   r   )r   r   r   r   r%   r   r   r   r)   r   r   r   r   r   r   r   r    r(   q   r'   r(   c                 C   sR   | j \}}}}| ||| ||| ||} | dddddd d|||}|S )z2
    Partitions the given input into windows.
    r   r   r            shapeviewpermute
contiguous)input_featurewindow_size
batch_sizeheightwidthnum_channelswindowsr   r   r    window_partition   s   $r:   c                 C   sN   | j d }| d|| || |||} | dddddd d|||} | S )z?
    Merges windows to produce higher resolution features.
    r-   r   r   r   r*   r+   r,   r.   )r9   r4   r6   r7   r8   r   r   r    window_reverse   s   
$r;   c                
       sr   e Zd ZdZd fdd	Zdejdededejfd	d
Z		ddej	dB dej
dB dedeej fddZ  ZS )SwinEmbeddingszW
    Construct the patch and position embeddings. Optionally, also the mask token.
    Fc                    s   t    t|| _| jj}| jj| _|r tt	
dd|jnd | _|jr5tt	
d|d |j| _nd | _t|j| _t|j| _|j| _|| _d S )Nr   )super__init__SwinPatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr   zeros	embed_dim
mask_tokenuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)selfrP   use_mask_tokenrA   	__class__r   r    r>      s   


 
zSwinEmbeddings.__init__
embeddingsr6   r7   returnc                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr-         ?r   r   r*   bicubicF)sizemodealign_cornersdim)r/   rI   r   jit
is_tracingrO   r   reshaper1   r   
functionalinterpolater0   cat)rQ   rU   r6   r7   rA   num_positionsclass_pos_embedpatch_pos_embedr]   
new_height	new_widthsqrt_num_positionsr   r   r    interpolate_pos_encoding   s(   



z'SwinEmbeddings.interpolate_pos_encodingNpixel_valuesbool_masked_posrj   c                 C   s   |j \}}}}| |\}}	| |}| \}
}}|d ur8| j|
|d}|d|}|d|  ||  }| jd urN|rI|| 	||| }n|| j }| 
|}||	fS )Nr-   g      ?)r/   r@   rK   rY   rG   expand	unsqueezetype_asrI   rj   rN   )rQ   rk   rl   rj   _r8   r6   r7   rU   output_dimensionsr5   seq_lenmask_tokensmaskr   r   r    forward   s   



zSwinEmbeddings.forward)FNF)r   r   r   r   r>   r   Tensorintrj   r   
BoolTensorboolr   ru   __classcell__r   r   rS   r    r<      s    +r<   c                       sN   e Zd ZdZ fddZdd ZdejdB deej	ee
 f fd	d
Z  ZS )r?   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
|d |d  |d |d  f| _tj||||d| _d S )Nr   r   )kernel_sizestride)r=   r>   
image_sizerO   r8   rF   
isinstancecollectionsabcIterablerA   rB   r   Conv2d
projection)rQ   rP   r~   rO   r8   hidden_sizerA   rS   r   r    r>     s   
 "zSwinPatchEmbeddings.__init__c                 C   s   || j d  dkrd| j d || j d   f}tj||}|| j d  dkr>ddd| j d || j d   f}tj||}|S )Nr   r   )rO   r   ra   pad)rQ   rk   r6   r7   
pad_valuesr   r   r    	maybe_pad  s    zSwinPatchEmbeddings.maybe_padrk   NrV   c                 C   sV   |j \}}}}| |||}| |}|j \}}}}||f}|ddd}||fS )Nr*   r   )r/   r   r   flatten	transpose)rQ   rk   rp   r8   r6   r7   rU   rq   r   r   r    ru     s   
zSwinPatchEmbeddings.forward)r   r   r   r   r>   r   r   r   r   rw   rx   ru   r{   r   r   rS   r    r?      s
    .	r?   c                	       sh   e Zd ZdZejfdee dedejddf fddZ	d	d
 Z
dejdeeef dejfddZ  ZS )SwinPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    input_resolutionr]   
norm_layerrV   Nc                    sB   t    || _|| _tjd| d| dd| _|d| | _d S )Nr+   r*   Fbias)r=   r>   r   r]   r   Linear	reductionrK   )rQ   r   r]   r   rS   r   r    r>   6  s
   
zSwinPatchMerging.__init__c                 C   sF   |d dkp|d dk}|r!ddd|d d|d f}t j||}|S )Nr*   r   r   )r   ra   r   )rQ   r3   r6   r7   
should_padr   r   r   r    r   =  s
   zSwinPatchMerging.maybe_padr3   input_dimensionsc                 C   s   |\}}|j \}}}|||||}| |||}|d d dd ddd dd d f }|d d dd ddd dd d f }	|d d dd ddd dd d f }
|d d dd ddd dd d f }t||	|
|gd}||dd| }| |}| |}|S )Nr   r*   r   r-   r+   )r/   r0   r   r   rc   rK   r   )rQ   r3   r   r6   r7   r5   r]   r8   input_feature_0input_feature_1input_feature_2input_feature_3r   r   r    ru   E  s   $$$$

zSwinPatchMerging.forward)r   r   r   r   r   rJ   r   rx   Moduler>   r   r   rw   ru   r{   r   r   rS   r    r   )  s
    **r           Finput	drop_probtrainingrV   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r   r   r   )r   dtypedevice)r/   ndimr   randr   r   floor_div)r   r   r   	keep_probr/   random_tensoroutputr   r   r    	drop_path`  s   r   c                       sT   e Zd ZdZddedB ddf fddZdejdejfdd	Zde	fd
dZ
  ZS )SwinDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rV   c                    s   t    || _d S N)r=   r>   r   )rQ   r   rS   r   r    r>   s  s   

zSwinDropPath.__init__r   c                 C   s   t || j| jS r   )r   r   r   rQ   r   r   r   r    ru   w  s   zSwinDropPath.forwardc                 C   s   d| j  S )Nzp=)r   rQ   r   r   r    
extra_reprz  s   zSwinDropPath.extra_reprr   )r   r   r   r   floatr>   r   rw   ru   strr   r{   r   r   rS   r    r   p  s
    r   c                
       sV   e Zd Z fddZ		ddejdejdB dedB deej fd	d
Z	dd Z
  ZS )SwinSelfAttentionc                    s  t    || dkrtd| d| d|| _t|| | _| j| j | _t|tj	j
r0|n||f| _ttd| jd  d d| jd  d  || _| d|   tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _t|j| _d S )	Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r*   r   relative_position_indexr   )r=   r>   
ValueErrornum_attention_headsrx   attention_head_sizeall_head_sizer   r   r   r   r4   r   rD   r   rE   relative_position_bias_tableregister_buffercreate_relative_position_indexr   qkv_biasquerykeyvaluerL   attention_probs_dropout_probrN   rQ   rP   r]   	num_headsr4   rS   r   r    r>     s$   
*zSwinSelfAttention.__init__NFr   attention_maskoutput_attentionsrV   c                 C   s  |j \}}}||d| jf}| ||dd}| ||dd}	| ||dd}
t||	dd}|t	
| j }| j| jd }|| jd | jd  | jd | jd  d}|ddd }||d }|d ur|j d }||| || j||}||dd }|d| j||}tjj|dd}| |}t||
}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr-   r   r*   r   r\   r   )r/   r   r   r0   r   r   r   r   matmulmathsqrtr   r   r4   r1   r2   rn   r   r   ra   softmaxrN   rY   r   )rQ   r   r   r   r5   r]   r8   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputsr   r   r    ru     s<   &


zSwinSelfAttention.forwardc                 C   s  t | jd }t | jd }t t j||gdd}t |d}|d d d d d f |d d d d d f  }|ddd }|d d d d df  | jd d 7  < |d d d d df  | jd d 7  < |d d d d df  d| jd  d 9  < |d}|S )Nr   r   ij)indexingr*   r-   )	r   aranger4   stackmeshgridr   r1   r2   sum)rQ   coords_hcoords_wcoordscoords_flattenrelative_coordsr   r   r   r    r     s   ,((,
z0SwinSelfAttention.create_relative_position_indexrv   )r   r   r   r>   r   rw   r   rz   r   ru   r   r{   r   r   rS   r    r   ~  s    
3r   c                       s8   e Zd Z fddZdejdejdejfddZ  ZS )SwinSelfOutputc                    s*   t    t||| _t|j| _d S r   )r=   r>   r   r   denserL   r   rN   rQ   rP   r]   rS   r   r    r>     s   
zSwinSelfOutput.__init__r   input_tensorrV   c                 C      |  |}| |}|S r   r   rN   )rQ   r   r   r   r   r    ru     s   

zSwinSelfOutput.forwardr   r   r   r>   r   rw   ru   r{   r   r   rS   r    r     s    $r   c                
       sN   e Zd Z fddZ		ddejdejdB dedB deej fd	d
Z	  Z
S )SwinAttentionc                    s*   t    t||||| _t||| _d S r   )r=   r>   r   rQ   r   r   r   rS   r   r    r>     s   
zSwinAttention.__init__NFr   r   r   rV   c                 C   s4   |  |||}| |d |}|f|dd   }|S )Nr   r   )rQ   r   )rQ   r   r   r   self_outputsattention_outputr   r   r   r    ru     s   zSwinAttention.forwardrv   )r   r   r   r>   r   rw   r   rz   r   ru   r{   r   r   rS   r    r     s    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )SwinIntermediatec                    sJ   t    t|t|j| | _t|jt	rt
|j | _d S |j| _d S r   )r=   r>   r   r   rx   	mlp_ratior   r   
hidden_actr   r   intermediate_act_fnr   rS   r   r    r>     s
   
zSwinIntermediate.__init__r   rV   c                 C   r   r   )r   r   r   r   r   r    ru        

zSwinIntermediate.forwardr   r   r   rS   r    r     s    r   c                       r   )
SwinOutputc                    s4   t    tt|j| || _t|j| _	d S r   )
r=   r>   r   r   rx   r   r   rL   rM   rN   r   rS   r   r    r>   
  s   
zSwinOutput.__init__r   rV   c                 C   r   r   r   r   r   r   r    ru     r   zSwinOutput.forwardr   r   r   rS   r    r   	  s    r   c                       sx   e Zd Zd fdd	Zdd Zdd Zd	d
 Z		ddejde	e
e
f dedB dedB de	ejejf f
ddZ  ZS )	SwinLayerr   r   c                    s   t    |j| _|| _|j| _|| _tj||jd| _	t
|||| jd| _|dkr.t|nt | _tj||jd| _t||| _t||| _d S )Neps)r4   r   )r=   r>   chunk_size_feed_forward
shift_sizer4   r   r   rJ   layer_norm_epslayernorm_beforer   	attentionr   Identityr   layernorm_afterr   intermediater   r   )rQ   rP   r]   r   r   drop_path_rater   rS   r   r    r>     s   
zSwinLayer.__init__c                 C   sD   t || jkr td| _tj rt t|nt || _d S d S Nr   )minr4   r   r   r   r^   r_   tensor)rQ   r   r   r   r    set_shift_and_window_size#  s
   
 z#SwinLayer.set_shift_and_window_sizec              	   C   s  | j dkrtjd||df||d}td| j t| j | j  t| j  d f}td| j t| j | j  t| j  d f}d}|D ]}	|D ]}
||d d |	|
d d f< |d7 }qEqAt|| j}|d| j| j }|d|d }||dkd|dkd}|S d }|S )Nr   r   r   r-   r*   g      Yr   )	r   r   rE   slicer4   r:   r0   rn   masked_fill)rQ   r6   r7   r   r   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_maskr   r   r    get_attn_mask+  s.   

zSwinLayer.get_attn_maskc                 C   sR   | j || j   | j  }| j || j   | j  }ddd|d|f}tj||}||fS r   )r4   r   ra   r   )rQ   r   r6   r7   	pad_right
pad_bottomr   r   r   r    r   G  s
   zSwinLayer.maybe_padFr   r   r   Nalways_partitionrV   c                 C   s  |s|  | n	 |\}}| \}}}	|}
| |}|||||	}| |||\}}|j\}}}}| jdkrGtj|| j | j fdd}n|}t	|| j
}|d| j
| j
 |	}| j|||j|jd}| j|||d}|d }|d| j
| j
|	}t|| j
||}| jdkrtj|| j| jfdd}n|}|d dkp|d dk}|r|d d d |d |d d f  }|||| |	}|
| | }| |}| |}|| | }|r||d	 f}|S |f}|S )
Nr   )r   r*   )shiftsdimsr-   r   )r   r   r,   r   )r   rY   r   r0   r   r/   r   r   rollr:   r4   r  r   r   r   r;   r2   r   r   r   r   )rQ   r   r   r   r  r6   r7   r5   rp   channelsshortcutr   
height_pad	width_padshifted_hidden_stateshidden_states_windowsr  attention_outputsr   attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputsr   r   r    ru   N  sJ   


$

zSwinLayer.forward)r   r   FF)r   r   r   r>   r   r  r   r   rw   r   rx   rz   ru   r{   r   r   rS   r    r     s$    
r   c                       sX   e Zd Z fddZ		ddejdeeef dedB dedB d	eej f
d
dZ	  Z
S )	SwinStagec                    sh   t     | _| _t fddt|D | _|d ur,|tjd| _	nd | _	d| _
d S )Nc              
      s:   g | ]}t  | |d  dkrdn jd  dqS )r*   r   )rP   r]   r   r   r   r   )r   r4   .0irP   r]   r   r   r   r   r    
<listcomp>  s    	z&SwinStage.__init__.<locals>.<listcomp>)r]   r   F)r=   r>   rP   r]   r   
ModuleListrangeblocksrJ   
downsamplepointing)rQ   rP   r]   r   depthr   r   r%  rS   r   r    r>     s   
	
zSwinStage.__init__Fr   r   r   Nr  rV   c                 C   s   |\}}t | jD ]\}}|||||}	|	d }q	|}
| jd ur:|d d |d d }}||||f}| |
|}n||||f}||
|f}|rO||	dd  7 }|S )Nr   r   r*   )	enumerater$  r%  )rQ   r   r   r   r  r6   r7   r  layer_moduler  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledrq   stage_outputsr   r   r    ru     s   


zSwinStage.forwardr  )r   r   r   r>   r   rw   r   rx   rz   ru   r{   r   r   rS   r    r    s    
r  c                       st   e Zd Z fddZ					ddejdeeef dedB d	edB d
edB dedB dedB dee	B fddZ
  ZS )SwinEncoderc                    sp   t    t j_ _dd tjd jt	 jddD t
 fddtjD _d_d S )Nc                 S   s   g | ]}|  qS r   )item)r  xr   r   r    r!    s    z(SwinEncoder.__init__.<locals>.<listcomp>r   cpu)r   c                    s   g | ]E}t  t jd |  d d |  d d |  f j|  j| t jd| t jd|d   |jd k rCtnddqS )r*   r   r   N)rP   r]   r   r'  r   r   r%  )r  rx   rF   depthsr   r   
num_layersr   )r  i_layerrP   dprrB   rQ   r   r    r!    s    
*F)r=   r>   lenr2  r3  rP   r   linspacer   r   r   r"  r#  layersgradient_checkpointing)rQ   rP   rB   rS   r5  r    r>     s   
$

zSwinEncoder.__init__FTr   r   r   Noutput_hidden_states(output_hidden_states_before_downsamplingr  return_dictrV   c                 C   s  |rdnd }|r
dnd }	|rdnd }
|r7|j \}}}|j|g||R  }|dddd}||f7 }|	|f7 }	t| jD ]~\}}|||||}|d }|d }|d }|d |d f}|r|r|j \}}}|j|g|d |d f|R  }|dddd}||f7 }|	|f7 }	n'|r|s|j \}}}|j|g||R  }|dddd}||f7 }|	|f7 }	|r|
|dd  7 }
q<|stdd	 |||
fD S t|||
|	d
S )Nr   r   r   r   r*   r   r-   c                 s   s    | ]	}|d ur|V  qd S r   r   )r  vr   r   r    	<genexpr>  s    z&SwinEncoder.forward.<locals>.<genexpr>)r   r   r   r   )r/   r0   r1   r(  r9  r   r   )rQ   r   r   r   r;  r<  r  r=  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsr5   rp   r   reshaped_hidden_stater  r)  r  r*  rq   r   r   r    ru     sX   





zSwinEncoder.forward)FFFFT)r   r   r   r>   r   rw   r   rx   rz   r   ru   r{   r   r   rS   r    r.    s0    
	r.  c                       sD   e Zd ZU eed< dZdZdZdZdgZ	e
  fddZ  ZS )	SwinPreTrainedModelrP   swinrk   )imageTr  c                    sz   t  | t|tr%|jdurt|j |jdur#t|j dS dS t|tr;t|j	 t
|j|  dS dS )zInitialize the weightsN)r=   _init_weightsr   r<   rG   initzeros_rI   r   r   copy_r   r   )rQ   modulerS   r   r    rG  %  s   



z!SwinPreTrainedModel._init_weights)r   r   r   r   r   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr   no_gradrG  r{   r   r   rS   r    rD    s   
 rD  c                       sz   e Zd Zd fdd	Zdd Ze						ddejdB d	ejdB d
e	dB de	dB de	de	dB de
eB fddZ  ZS )	SwinModelTFc                    s   t  | || _t|j| _t|jd| jd   | _t	||d| _
t|| j
j| _tj| j|jd| _|r<tdnd| _|   dS )a  
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
            Whether or not to apply pooling layer.
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether or not to create and apply mask tokens in the embedding layer.
        r*   r   )rR   r   N)r=   r>   rP   r7  r2  r3  rx   rF   num_featuresr<   rU   r.  rC   encoderr   rJ   r   	layernormAdaptiveAvgPool1dpooler	post_init)rQ   rP   add_pooling_layerrR   rS   r   r    r>   5  s   zSwinModel.__init__c                 C      | j jS r   rU   r@   r   r   r   r    get_input_embeddingsJ     zSwinModel.get_input_embeddingsNrk   rl   r   r;  rj   r=  rV   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| j|||d\}}	| j||	|||d}
|
d }| |}d}| jdurY| |	dd}t
|d}|sg||f|
dd  }|S t|||
j|
j|
jdS )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rl   rj   )r   r;  r=  r   r   r*   )r   r#   r   r   r   )rP   r   r;  use_return_dictr   rU   rT  rU  rW  r   r   r   r"   r   r   r   )rQ   rk   rl   r   r;  rj   r=  kwargsembedding_outputr   encoder_outputssequence_outputpooled_outputr   r   r   r    ru   M  s@   


zSwinModel.forward)TFNNNNFN)r   r   r   r>   r\  r   r   r   ry   rz   r   r"   ru   r{   r   r   rS   r    rR  3  s2    	rR  ad  
    Swin Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    c                       p   e Zd Z fddZe						ddejdB dejdB dedB dedB d	ed
edB de	e
B fddZ  ZS )SwinForMaskedImageModelingc                    sn   t  | t|ddd| _t|jd|jd   }ttj	||j
d |j ddt|j
| _|   d S )NFT)rY  rR   r*   r   )in_channelsout_channelsr|   )r=   r>   rR  rE  rx   rF   r3  r   
Sequentialr   encoder_strider8   PixelShuffledecoderrX  )rQ   rP   rS  rS   r   r    r>     s   
z#SwinForMaskedImageModeling.__init__NFrk   rl   r   r;  rj   r=  rV   c                 K   s<  |dur|n| j j}| j||||||d}|d }	|	dd}	|	j\}
}}t|d  }}|	|
|||}	| |	}d}|dur|| j j	| j j
 }|d||}|| j j
d| j j
dd }tjj||dd	}||  | d
  | j j }|s|f|dd  }|dur|f| S |S t|||j|j|jdS )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, SwinForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-base-simmim-window6-192")
        >>> model = SwinForMaskedImageModeling.from_pretrained("microsoft/swin-base-simmim-window6-192")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 192, 192]
        ```N)rl   r   r;  rj   r=  r   r   r*   rW   r-   none)r   gh㈵>)r%   r&   r   r   r   )rP   r^  rE  r   r/   r   floorr`   rl  r~   rO   repeat_interleavern   r2   r   ra   l1_lossr   r8   r$   r   r   r   )rQ   rk   rl   r   r;  rj   r=  r_  r   rb  r5   r8   sequence_lengthr6   r7   reconstructed_pixel_valuesmasked_im_lossrY   rt   reconstruction_lossr   r   r   r    ru     sH   (	
 z"SwinForMaskedImageModeling.forwardrd  )r   r   r   r>   r   r   r   ry   rz   r   r$   ru   r{   r   r   rS   r    rf    s0    	rf  a  
    Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune Swin on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                       re  )SwinForImageClassificationc                    sP   t  | |j| _t|| _|jdkrt| jj|jnt | _	| 
  d S r   )r=   r>   
num_labelsrR  rE  r   r   rS  r   
classifierrX  )rQ   rP   rS   r   r    r>   
  s   
"z#SwinForImageClassification.__init__NFrk   labelsr   r;  rj   r=  rV   c                 K   s   |dur|n| j j}| j|||||d}|d }	| |	}
d}|dur+| ||
| j }|sA|
f|dd  }|dur?|f| S |S t||
|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r;  rj   r=  r   r*   )r%   r)   r   r   r   )	rP   r^  rE  rw  loss_functionr(   r   r   r   )rQ   rk   rx  r   r;  rj   r=  r_  r   rc  r)   r%   r   r   r   r    ru     s.   
z"SwinForImageClassification.forwardrd  )r   r   r   r>   r   r   r   
LongTensorrz   r   r(   ru   r{   r   r   rS   r    ru    s0    	ru  zM
    Swin backbone, to be used with frameworks like DETR and MaskFormer.
    c                       s^   e Zd Zdef fddZdd Z			ddejdedB d	edB d
edB de	f
ddZ
  ZS )SwinBackbonerP   c                    s   t     jg fddtt jD  | _t | _t	 | jj
| _i }t| j| jD ]\}}t|||< q/t|| _|   d S )Nc                    s   g | ]}t  jd |  qS )r*   )rx   rF   r  rP   r   r    r!  Q  s    z)SwinBackbone.__init__.<locals>.<listcomp>)r=   r>   rF   r#  r7  r2  rS  r<   rU   r.  rC   rT  zipout_featuresr  r   rJ   
ModuleDicthidden_states_normsrX  )rQ   rP   r  stager8   rS   r|  r    r>   N  s   &
zSwinBackbone.__init__c                 C   rZ  r   r[  r   r   r   r    r\  ^  r]  z!SwinBackbone.get_input_embeddingsNrk   r;  r   r=  rV   c              	   K   s4  |dur|n| j j}|dur|n| j j}|dur|n| j j}| |\}}| j|||ddddd}|j}	d}
t| j|	D ]A\}}|| j	v r}|j
\}}}}|dddd }|||| |}| j| |}|||||}|dddd }|
|f7 }
q<|s|
f}|r||jf7 }|S t|
|r|jnd|jd	S )
a  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/swin-tiny-patch4-window7-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 7, 7]
        ```NT)r   r;  r<  r  r=  r   r   r*   r   r   )feature_mapsr   r   )rP   r^  r;  r   rU   rT  r   r}  stage_namesr~  r/   r1   r2   r0   r  r   r	   r   )rQ   rk   r;  r   r=  r_  r`  r   r   r   r  r  hidden_stater5   r8   r6   r7   r   r   r   r    ru   a  sH   #


zSwinBackbone.forward)NNN)r   r   r   r   r>   r\  r   rw   rz   r	   ru   r{   r   r   rS   r    r{  H  s"    r{  )ru  rf  rR  rD  r{  )r   F)<r   collections.abcr   r   dataclassesr   r   r    r   rH  activationsr   backbone_utilsr   modeling_layersr   modeling_outputsr	   modeling_utilsr
   utilsr   r   r   r   configuration_swinr   
get_loggerr   loggerr   r"   r$   r(   r:   r;   r   r<   r?   r   rw   r   rz   r   r   r   r   r   r   r   r   r  r.  rD  rR  rf  ru  r{  __all__r   r   r   r    <module>   s   

\+ 7]z7VSh?c