"""PyTorch BEiT model."""

import collections.abc
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    MaskedLMOutput,
    SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
    compile_compatible_method_lru_cache,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from ...utils import auto_docstring, logging, torch_int
from ...utils.backbone_utils import BackboneMixin
from .configuration_beit import BeitConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`BeitModel`].
    """
)
class BeitModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    """


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class BeitDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f"p={self.drop_prob}"


class BeitEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = BeitPatchEmbeddings(config)
        self.patch_size = config.patch_size
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
    ) -> torch.Tensor:
        if self.position_embeddings is not None and interpolate_pos_encoding is not None:
            warnings.warn(
                "`interpolate_pos_encoding` argument has no effect for BEiTEmbeddings, embeddings are always "
                "interpolated to the input image size. The argument will be removed in transformers v4.51.0."
            )

        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        if self.position_embeddings is not None:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)
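
# A small illustration (not part of the original file) of the shape arithmetic behind
# `BeitEmbeddings.interpolate_pos_encoding`: the pre-trained patch position embeddings are
# reshaped to a square 2D grid, resized bicubically to the new patch grid, and flattened
# back. Assuming a checkpoint trained at 224x224 with patch size 16 (a 14x14 grid) applied
# to a 256x256 image (a 16x16 grid):
#
#     >>> patch_pos_embed = torch.randn(1, 14 * 14, 768)
#     >>> grid = patch_pos_embed.reshape(1, 14, 14, 768).permute(0, 3, 1, 2)
#     >>> resized = nn.functional.interpolate(grid, size=(16, 16), mode="bicubic", align_corners=False)
#     >>> resized.permute(0, 2, 3, 1).view(1, -1, 768).shape
#     torch.Size([1, 256, 768])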


class BeitPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)
ej	deej	 de
deej	 de
deee  deeej	 eej	ej	f f fddZ  ZS )BeitSelfAttentionNrG   window_sizer)   c                    s   t    || _|j|j dkr"t|ds"td|j d|j d|j| _t|j|j | _| j| j | _	t
|j| j	| _t
j|j| j	dd| _t
|j| j	| _t
|j| _t|| _| jrkt||d| _d S d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .F)biasr   )r9   r:   rG   rJ   num_attention_headshasattrr   r   attention_head_sizeall_head_sizer   LinearquerykeyvaluerY   attention_probs_dropout_probr[   r   has_relative_position_biasBeitRelativePositionBiasrelative_position_biasr;   rG   r   r<   r#   r$   r:      s&   


zBeitSelfAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr_   r   r`   r   r
   )rc   r   r   rn   rk   )r;   xnew_x_shaper#   r#   r$   transpose_for_scores  s   
z&BeitSelfAttention.transpose_for_scoresFr>   	head_maskoutput_attentionsr   rv   
resolutionc                 C   s.  |  |}| | |}| | |}	| |}
t|
|dd}|t| j	 }| j
rL|\}}|| jj || jj f}|| j|||jd d }|d urT|| }tjj|dd}| |}|d uri|| }t||	}|dddd }| d d | jf }|j| }|r||f}|S |f}|S )	Nr_   r   dim_sizerf   r   r`   r
   )r   r   r   r   r/   matmulr   mathsqrtr   r   rG   rP   r   r-   r   rl   softmaxr[   rk   
contiguousrc   r   rn   )r;   r>   r   r   r   rv   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresr]   r^   r   attention_probscontext_layernew_context_layer_shapeoutputsr#   r#   r$   r@     s4   
	


zBeitSelfAttention.forwardr8   NFNFN)r   r    r!   r   r   tupler:   r   r/   r   r   r   r   r@   rE   r#   r#   r<   r$   r      s.     
r   c                       sv   e Zd Z					ddejdeej dedeej dedeee  d	e	eej eejejf f f fd
dZ
  ZS )BeitSdpaSelfAttentionNFr>   r   r   r   rv   r   r)   c              	      s8  |s|d urt d t j||||||dS | |}| | |}| | |}	| |}
d }| jrQ|\}}|| j	j
 || j	j
 f}| j|||jd d}|d ur`|d u r\|}n||7 }dt| j }tjjj|
||	|| jrw| j	jndd|d}|dd	dd
 }| d d | jf }|j| }|d fS )Na  `BeitSdpaSelfAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)r>   r   r   r   rv   r   r   r   r%   F)	attn_mask	dropout_p	is_causalscaler   r`   r
   r   )loggerwarning_oncer9   r@   r   r   r   r   r   rG   rP   r   r-   r   r   r   r/   r   rl   scaled_dot_product_attentionr(   r   rk   r   rc   r   rn   )r;   r>   r   r   r   rv   r   r   r   r   r   	attn_biasr]   r^   r   scalingr   r   r<   r#   r$   r@   C  sR   	
	
	
zBeitSdpaSelfAttention.forwardr   )r   r    r!   r/   r   r   r   r   r   r   r@   rE   r#   r#   r<   r$   r   B  s*    
r   c                       sH   e Zd ZdZdeddf fddZddejdejdejfd	d
Z  Z	S )BeitSelfOutputz


class BeitSelfOutput(nn.Module):
    """
    The residual connection is defined in BeitLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, gamma=None) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


BEIT_SELF_ATTENTION_CLASSES = {
    "eager": BeitSelfAttention,
    "sdpa": BeitSdpaSelfAttention,
}


class BeitAttention(nn.Module):
    def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.attention = BEIT_SELF_ATTENTION_CLASSES[config._attn_implementation](config, window_size=window_size)
        self.output = BeitSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int]] = None,
    ):
        self_outputs = self.attention(
            hidden_states, head_mask, output_attentions, relative_position_bias, interpolate_pos_encoding, resolution
        )

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class BeitIntermediate(nn.Module):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class BeitOutput(nn.Module):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class BeitLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BeitAttention(config, window_size=window_size)
        self.intermediate = BeitIntermediate(config)
        self.output = BeitOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.drop_path = BeitDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        if init_values > 0:
            self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
            self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        else:
            self.lambda_1, self.lambda_2 = None, None

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int, int]] = None,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in BEiT, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
            relative_position_bias=relative_position_bias,
            interpolate_pos_encoding=interpolate_pos_encoding,
            resolution=resolution,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # apply lambda_1 if present
        if self.lambda_1 is not None:
            attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = self.drop_path(attention_output) + hidden_states

        # in BEiT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.intermediate(layer_output)
        layer_output = self.output(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = self.drop_path(layer_output) + hidden_states

        outputs = (layer_output,) + outputs

        return outputs


class BeitRelativePositionBias(nn.Module):
    def __init__(self, config: BeitConfig, window_size: tuple) -> None:
        super().__init__()
        self.window_size = window_size
        # (2*Wh-1) * (2*Ww-1) + cls_to_token, token_to_cls and cls_to_cls
        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, config.num_attention_heads)
        )
    @compile_compatible_method_lru_cache(maxsize=10)
    def generate_relative_position_index(self, window_size: tuple[int, int]) -> torch.Tensor:
        """
        This method creates the relative position index, modified to support arbitrary window sizes,
        as introduced in [MiDaS v3.1](https://huggingface.co/papers/2307.14460).
        """
        num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        # cls to token & token to cls & cls to cls
        window_area = window_size[0] * window_size[1]
        grid = torch.meshgrid(torch.arange(window_size[0]), torch.arange(window_size[1]), indexing="ij")
        coords = torch.stack(grid)  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        relative_position_index = torch.zeros(size=(window_area + 1,) * 2, dtype=relative_coords.dtype)
        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        relative_position_index[0, 0:] = num_relative_distance - 3
        relative_position_index[0:, 0] = num_relative_distance - 2
        relative_position_index[0, 0] = num_relative_distance - 1
        return relative_position_index
    def forward(
        self, window_size, interpolate_pos_encoding: bool = False, dim_size: Optional[int] = None
    ) -> torch.Tensor:
        """
        Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
        """
        old_height = 2 * self.window_size[0] - 1
        old_width = 2 * self.window_size[1] - 1

        new_height = 2 * window_size[0] - 1
        new_width = 2 * window_size[1] - 1

        old_relative_position_bias_table = self.relative_position_bias_table

        old_num_relative_distance = self.num_relative_distance
        new_num_relative_distance = new_height * new_width + 3

        old_sub_table = old_relative_position_bias_table[: old_num_relative_distance - 3]

        old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2)
        new_sub_table = nn.functional.interpolate(
            old_sub_table, size=(torch_int(new_height), torch_int(new_width)), mode="bilinear"
        )
        new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1)

        new_relative_position_bias_table = torch.cat(
            [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3 :]]
        )

        relative_position_index = self.generate_relative_position_index(window_size)
        relative_position_bias = new_relative_position_bias_table[relative_position_index.view(-1)]
        # Wh*Ww + 1, Wh*Ww + 1, nH
        relative_position_bias = relative_position_bias.view(
            window_size[0] * window_size[1] + 1, window_size[0] * window_size[1] + 1, -1
        )
        # nH, Wh*Ww + 1, Wh*Ww + 1
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()

        if interpolate_pos_encoding:
            relative_position_bias = nn.functional.interpolate(
                relative_position_bias.unsqueeze(1),
                size=(dim_size, dim_size),
                mode="bilinear",
                align_corners=False,
            ).squeeze(1)

        return relative_position_bias.unsqueeze(0)


class BeitEncoder(nn.Module):
    def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.config = config
        self.has_relative_position_bias = config.use_shared_relative_position_bias
        if self.has_relative_position_bias:
            self.relative_position_bias = BeitRelativePositionBias(config, window_size=window_size)

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")]
        self.layer = nn.ModuleList(
            [
                BeitLayer(
                    config,
                    window_size=window_size if config.use_relative_position_bias else None,
                    drop_path_rate=dpr[i],
                )
                for i in range(config.num_hidden_layers)
            ]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int, int]] = None,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.has_relative_position_bias:
                height, width = resolution
                window_size = (height // self.config.patch_size, width // self.config.patch_size)
                relative_position_bias = self.relative_position_bias(
                    window_size, interpolate_pos_encoding=interpolate_pos_encoding, dim_size=hidden_states.shape[1]
                )
            else:
                relative_position_bias = None

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                layer_head_mask,
                output_attentions,
                relative_position_bias=relative_position_bias,
                interpolate_pos_encoding=interpolate_pos_encoding,
                resolution=resolution,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class BeitPreTrainedModel(PreTrainedModel):
    config_class = BeitConfig
    base_model_prefix = "beit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["BeitLayer"]
    _keys_to_ignore_on_load_unexpected = [r".*relative_position_index.*"]
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, BeitEmbeddings):
            module.cls_token.data.zero_()
            if module.mask_token is not None:
                module.mask_token.data.zero_()
            if module.position_embeddings is not None:
                module.position_embeddings.data.zero_()
        elif isinstance(module, BeitRelativePositionBias):
            module.relative_position_bias_table.data.zero_()
        elif isinstance(module, BeitLayer):
            if module.lambda_1 is not None:
                module.lambda_1.data.fill_(self.config.layer_scale_init_value)
                module.lambda_2.data.fill_(self.config.layer_scale_init_value)


@auto_docstring
class BeitModel(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig, add_pooling_layer: bool = True) -> None:
        r"""
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = BeitEmbeddings(config)
        self.encoder = BeitEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape)

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )
        self.pooler = BeitPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BeitModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
        resolution = pixel_values.shape[2:]

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            resolution=resolution,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return BeitModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class BeitPooler(nn.Module):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.layernorm = (
            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if self.layernorm is not None:
            # Mean pool the final hidden states of the patch tokens
            patch_tokens = hidden_states[:, 1:, :]
            pooled_output = self.layernorm(patch_tokens.mean(1))
        else:
            # Pool by simply taking the final hidden state of the [CLS] token
            pooled_output = hidden_states[:, 0]

        return pooled_output
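
# A hedged usage sketch for the bare encoder (not part of the original file); the
# checkpoint name is the BEiT base checkpoint also used in the examples below:
#
#     >>> from transformers import AutoImageProcessor, BeitModel
#     >>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
#     >>> model = BeitModel.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
#     >>> inputs = processor(images=image, return_tensors="pt")  # `image` is any PIL image
#     >>> outputs = model(**inputs)
#     >>> outputs.last_hidden_state.shape  # [CLS] token + 14x14 patch tokens
#     torch.Size([1, 197, 768])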


@auto_docstring(
    custom_intro="""
    Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting
    visual tokens of a Vector-Quantize Variational Autoencoder (VQ-VAE), whereas other vision models like ViT and DeiT
    predict RGB pixel values. As a result, this class is incompatible with [`AutoModelForMaskedImageModeling`], so you
    will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.
    """
)
class BeitForMaskedImageModeling(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.beit = BeitModel(config, add_pooling_layer=False)

        # Classifier head
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, MaskedLMOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
        >>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, logits = outputs.loss, outputs.logits
        >>> list(logits.shape)
        [1, 196, 8192]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.beit(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        sequence_output = self.layernorm(sequence_output)
        prediction_scores = self.lm_head(sequence_output[:, 1:])

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores[bool_masked_pos], labels)

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final
    hidden states of the patch tokens) e.g. for ImageNet.
    """
)
class BeitForImageClassification(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.beit = BeitModel(config, add_pooling_layer=True)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.beit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        pooled_output = outputs.pooler_output if return_dict else outputs[1]
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
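
# A hedged usage sketch for the classification head above (not part of the original
# file); the checkpoint is the ImageNet fine-tuned BEiT base model:
#
#     >>> from transformers import AutoImageProcessor, BeitForImageClassification
#     >>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
#     >>> model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224")
#     >>> inputs = processor(images=image, return_tensors="pt")  # `image` is any PIL image
#     >>> logits = model(**inputs).logits  # (1, num_labels)
#     >>> print(model.config.id2label[logits.argmax(-1).item()])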


class BeitConvModule(nn.Module):
    """
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, tuple[int, int]],
        padding: Union[int, tuple[int, int], str] = 0,
        bias: bool = False,
        dilation: Union[int, tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding=padding,
            bias=bias,
            dilation=dilation,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = self.conv(input)
        output = self.bn(output)
        output = self.activation(output)

        return output


class BeitPyramidPoolingBlock(nn.Module):
    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        self.layers = [
            nn.AdaptiveAvgPool2d(pool_scale),
            BeitConvModule(in_channels, channels, kernel_size=1),
        ]
        for i, layer in enumerate(self.layers):
            self.add_module(str(i), layer)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        hidden_state = input
        for layer in self.layers:
            hidden_state = layer(hidden_state)
        return hidden_state


class BeitPyramidPoolingModule(nn.Module):
    """
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, pool_scales: tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
        super().__init__()
        self.pool_scales = pool_scales
        self.align_corners = align_corners
        self.in_channels = in_channels
        self.channels = channels
        self.blocks = []
        for i, pool_scale in enumerate(pool_scales):
            block = BeitPyramidPoolingBlock(pool_scale=pool_scale, in_channels=in_channels, channels=channels)
            self.blocks.append(block)
            self.add_module(str(i), block)

    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
        ppm_outs = []
        for ppm in self.blocks:
            ppm_out = ppm(x)
            upsampled_ppm_out = nn.functional.interpolate(
                ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
            )
            ppm_outs.append(upsampled_ppm_out)
        return ppm_outs


class BeitUperHead(nn.Module):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://huggingface.co/papers/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, config: BeitConfig) -> None:
        super().__init__()

        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = [config.hidden_size] * 4  # e.g. [768, 768, 768, 768]
        self.channels = config.hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        # PSP Module
        self.psp_modules = BeitPyramidPoolingModule(
            self.pool_scales,
            self.in_channels[-1],
            self.channels,
            align_corners=self.align_corners,
        )
        self.bottleneck = BeitConvModule(
            self.in_channels[-1] + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )
        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels[:-1]:  # skip the top layer
            l_conv = BeitConvModule(in_channels, self.channels, kernel_size=1)
            fpn_conv = BeitConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        self.fpn_bottleneck = BeitConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

    def psp_forward(self, inputs):
        x = inputs[-1]
        psp_outs = [x]
        psp_outs.extend(self.psp_modules(x))
        psp_outs = torch.cat(psp_outs, dim=1)
        output = self.bottleneck(psp_outs)

        return output

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # build laterals
        laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]

        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
                laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
            )

        # build outputs
        fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
        # append psp feature
        fpn_outs.append(laterals[-1])

        for i in range(used_backbone_levels - 1, 0, -1):
            fpn_outs[i] = nn.functional.interpolate(
                fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
            )
        fpn_outs = torch.cat(fpn_outs, dim=1)
        output = self.fpn_bottleneck(fpn_outs)
        output = self.classifier(output)

        return output


class BeitFCNHead(nn.Module):
    """
    Fully Convolution Networks for Semantic Segmentation. This head is the implementation of
    [FCNNet](https://huggingface.co/papers/1411.4038).

    Args:
        config (BeitConfig): Configuration.
        in_index (int): Index of the encoder feature map to use. Default: 2.
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self, config: BeitConfig, in_index: int = 2, kernel_size: int = 3, dilation: Union[int, tuple[int, int]] = 1
    ) -> None:
        super().__init__()
        self.in_channels = config.hidden_size
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        conv_padding = (kernel_size // 2) * dilation
        convs = []
        convs.append(
            BeitConvModule(
                self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
            )
        )
        for i in range(self.num_convs - 1):
            convs.append(
                BeitConvModule(
                    self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
                )
            )
        if self.num_convs == 0:
            self.convs = nn.Identity()
        else:
            self.convs = nn.Sequential(*convs)
        if self.concat_input:
            self.conv_cat = BeitConvModule(
                self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
            )

        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # just take the relevant feature maps
        hidden_states = encoder_hidden_states[self.in_index]
        output = self.convs(hidden_states)
        if self.concat_input:
            output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
        output = self.classifier(output)
        return output


@auto_docstring
class BeitForSemanticSegmentation(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.beit = BeitModel(config, add_pooling_layer=False)

        # FPNs
        if len(self.config.out_indices) != 4:
            raise ValueError(
                "BeitForSemanticSegmentation requires config.out_indices to be a list of 4 integers, specifying "
                "which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized "
                "architecture."
            )
        self.fpn1 = nn.Sequential(
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
            nn.BatchNorm2d(config.hidden_size),
            nn.GELU(),
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
        )
        self.fpn2 = nn.Sequential(
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
        )
        self.fpn3 = nn.Identity()
        self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Semantic segmentation head(s)
        self.decode_head = BeitUperHead(config)
        self.auxiliary_head = BeitFCNHead(config) if config.use_auxiliary_head else None

        # Initialize weights and apply final processing
        self.post_init()

    def compute_loss(self, logits, auxiliary_logits, labels):
        # upsample logits to the images' original size
        upsampled_logits = nn.functional.interpolate(
            logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
        )
        if auxiliary_logits is not None:
            upsampled_auxiliary_logits = nn.functional.interpolate(
                auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
            )
        # compute weighted loss
        loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
        main_loss = loss_fct(upsampled_logits, labels)
        loss = main_loss
        if auxiliary_logits is not None:
            auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
            loss += self.config.auxiliary_loss_weight * auxiliary_loss

        return loss
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SemanticSegmenterOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
        >>> model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if labels is not None and self.config.num_labels == 1:
            raise ValueError("The number of labels should be greater than one")

        outputs = self.beit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=True,  # we need the intermediate hidden states
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

        # only keep certain features, and reshape
        # note that we do +1 as the encoder_hidden_states also includes the initial embeddings
        features = [feature for idx, feature in enumerate(encoder_hidden_states) if idx + 1 in self.config.out_indices]
        batch_size = pixel_values.shape[0]
        patch_resolution = self.config.image_size // self.config.patch_size
        features = [
            x[:, 1:, :].permute(0, 2, 1).reshape(batch_size, -1, patch_resolution, patch_resolution) for x in features
        ]

        # apply FPNs
        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
        for i in range(len(features)):
            features[i] = ops[i](features[i])

        logits = self.decode_head(features)

        auxiliary_logits = None
        if self.auxiliary_head is not None:
            auxiliary_logits = self.auxiliary_head(features)

        loss = None
        if labels is not None:
            loss = self.compute_loss(logits, auxiliary_logits, labels)

        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    BEiT backbone, to be used with frameworks like DETR and MaskFormer.
    """
)
class BeitBackbone(BeitPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
        self.embeddings = BeitEmbeddings(config)
        self.encoder = BeitEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape)

        if config.add_fpn:
            if len(self.config.out_indices) != 4:
                raise ValueError(
                    "BeitBackbone requires config.out_indices to be a list of 4 integers, specifying which features "
                    "to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture."
                )
            hidden_size = config.hidden_size
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2),
                nn.BatchNorm2d(hidden_size, eps=config.batch_norm_eps),
                nn.GELU(),
                nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Sequential(nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2))
            self.fpn3 = nn.Identity()
            self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @auto_docstring
    def forward(
        self,
        pixel_values: Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/beit-base-patch16-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        batch_size = pixel_values.shape[0]
        embedding_output, (patch_height, patch_width) = self.embeddings(pixel_values)
        resolution = pixel_values.shape[2:]

        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            output_attentions=output_attentions,
            resolution=resolution,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                if self.config.reshape_hidden_states:
                    hidden_state = hidden_state[:, 1:, :]
                    hidden_state = hidden_state.permute(0, 2, 1)
                    hidden_state = hidden_state.reshape(batch_size, -1, patch_height, patch_width)

                feature_maps += (hidden_state,)

        if self.config.add_fpn:
            feature_maps = [
                self.fpn1(feature_maps[0]),
                self.fpn2(feature_maps[1]),
                self.fpn3(feature_maps[2]),
                self.fpn4(feature_maps[3]),
            ]
            feature_maps = tuple(feature_maps)

        if not return_dict:
            if output_hidden_states:
                output = (feature_maps,) + outputs[1:]
            else:
                output = (feature_maps,) + outputs[2:]
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )


__all__ = [
    "BeitForImageClassification",
    "BeitForMaskedImageModeling",
    "BeitForSemanticSegmentation",
    "BeitModel",
    "BeitPreTrainedModel",
    "BeitBackbone",
]