# NOTE: recovered from a compiled (.pyc) dump of transformers/models/beit/modeling_beit.py.
# Docstrings, signatures, constants and usage examples are preserved from the dump; method
# bodies that could not be recovered verbatim are stubbed with `...`.
"""PyTorch BEiT model."""

import collections.abc
import math
from dataclasses import dataclass

import torch
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss

from ... import initialization as init
from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    MaskedLMOutput,
    SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import compile_compatible_method_lru_cache
from ...utils import auto_docstring, logging, torch_int
from ...utils.backbone_utils import BackboneMixin
from .configuration_beit import BeitConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`BeitModel`].
    """
)
class BeitModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    """


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # broadcastable over all but the batch dim
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class BeitDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: float | None = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f"p={self.drop_prob}"


class BeitEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    """

    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = BeitPatchEmbeddings(config)
        self.patch_size = config.patch_size
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # Skip interpolation when the patch grid is unchanged (branch kept explicit for torch.jit tracing).
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed, size=(new_height, new_width), mode="bicubic", align_corners=False
        )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor | None = None) -> torch.Tensor:
        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        if self.position_embeddings is not None:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)
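
# A minimal usage sketch (not part of the recovered file; assumes the public
# "microsoft/beit-base-patch16-224" checkpoint) of the position-embedding interpolation
# above: a model pre-trained at 224x224 can consume larger images once
# `interpolate_pos_encoding=True` resizes the learned position grid bicubically.
#
#   from transformers import BeitModel
#
#   model = BeitModel.from_pretrained("microsoft/beit-base-patch16-224")
#   pixel_values = torch.randn(1, 3, 384, 384)  # larger than the pre-training resolution
#   outputs = model(pixel_values, interpolate_pos_encoding=True)
#   # (384 // 16) ** 2 = 576 patch tokens, plus one [CLS] token
#   assert outputs.last_hidden_state.shape == (1, 577, model.config.hidden_size)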


class BeitPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values.to(self.projection.weight.dtype))
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)


class BeitSelfAttention(nn.Module):
    def __init__(self, config: BeitConfig, window_size: tuple | None = None) -> None:
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads "
                f"{config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.has_relative_position_bias = bool(window_size)
        if self.has_relative_position_bias:
            self.relative_position_bias = BeitRelativePositionBias(config, window_size=window_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
        relative_position_bias: torch.Tensor | None = None,
        interpolate_pos_encoding: bool = False,
        resolution: tuple[int] | None = None,
    ) -> tuple[torch.Tensor] | tuple[torch.Tensor, torch.Tensor]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        key_layer = (
            self.key(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Add relative position bias if present.
        if self.has_relative_position_bias:
            height, width = resolution
            window_size = (height // self.config.patch_size, width // self.config.patch_size)
            attention_scores = attention_scores + self.relative_position_bias(
                window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1]
            )

        # Add shared relative position bias if provided.
        if relative_position_bias is not None:
            attention_scores = attention_scores + relative_position_bias

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class BeitSdpaSelfAttention(BeitSelfAttention):
    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
        relative_position_bias: torch.Tensor | None = None,
        interpolate_pos_encoding: bool = False,
        resolution: tuple[int] | None = None,
    ) -> tuple[torch.Tensor] | tuple[torch.Tensor, torch.Tensor]:
        if output_attentions:
            logger.warning_once(
                f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention "
                "weights will be `None`. If you want to get attention weights, please set "
                "`attn_implementation='eager'` when loading the model."
            )
        # Same projections as the eager path, but attention is computed with
        # torch.nn.functional.scaled_dot_product_attention, passing the (shared or
        # per-layer) relative position biases through `attn_mask`; the exact body could
        # not be recovered from the dump.
        ...


class BeitSelfOutput(nn.Module):
    """
    The residual connection is defined in BeitLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, gamma=None) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


BEIT_SELF_ATTENTION_CLASSES = {
    "eager": BeitSelfAttention,
    "sdpa": BeitSdpaSelfAttention,
}


class BeitAttention(nn.Module):
    def __init__(self, config: BeitConfig, window_size: tuple | None = None) -> None:
        super().__init__()
        self.attention = BEIT_SELF_ATTENTION_CLASSES[config._attn_implementation](config, window_size=window_size)
        self.output = BeitSelfOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
        relative_position_bias: torch.Tensor | None = None,
        interpolate_pos_encoding: bool = False,
        resolution: tuple[int] | None = None,
    ) -> tuple[torch.Tensor] | tuple[torch.Tensor, torch.Tensor]:
        self_outputs = self.attention(
            hidden_states, output_attentions, relative_position_bias, interpolate_pos_encoding, resolution
        )

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class BeitIntermediate(nn.Module):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class BeitOutput(nn.Module):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class BeitLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: BeitConfig, window_size: tuple | None = None, drop_path_rate: float = 0.0) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BeitAttention(config, window_size=window_size)
        self.intermediate = BeitIntermediate(config)
        self.output = BeitOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.drop_path = BeitDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        if init_values > 0:
            self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
            self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        else:
            self.lambda_1, self.lambda_2 = None, None

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
        relative_position_bias: torch.Tensor | None = None,
        interpolate_pos_encoding: bool = False,
        resolution: tuple[int, int] | None = None,
    ) -> tuple[torch.Tensor] | tuple[torch.Tensor, torch.Tensor]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in BEiT, layernorm is applied before self-attention
            output_attentions=output_attentions,
            relative_position_bias=relative_position_bias,
            interpolate_pos_encoding=interpolate_pos_encoding,
            resolution=resolution,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # apply lambda_1 if present
        if self.lambda_1 is not None:
            attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = self.drop_path(attention_output) + hidden_states

        # in BEiT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.intermediate(layer_output)
        layer_output = self.output(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = self.drop_path(layer_output) + hidden_states

        outputs = (layer_output,) + outputs

        return outputs


class BeitRelativePositionBias(nn.Module):
    def __init__(self, config: BeitConfig, window_size: tuple) -> None:
        super().__init__()
        self.window_size = window_size
        # 3 extra slots: cls-to-token, token-to-cls and cls-to-cls relations
        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, config.num_attention_heads)
        )

    @compile_compatible_method_lru_cache(maxsize=10)
    def generate_relative_position_index(self, window_size: tuple[int, int]) -> torch.Tensor:
        """
        This method creates the relative position index, modified to support arbitrary window sizes,
        as introduced in [MiDaS v3.1](https://huggingface.co/papers/2307.14460).
        """
        ...

    def forward(self, window_size, interpolate_pos_encoding: bool = False, dim_size=None) -> torch.Tensor:
        """
        Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
        """
        ...
edededee	e	f dB dedee
B fddZ  ZS )BeitEncoderNrB   r   r$   c                    sz   t     | _ j| _| jrt d| _dd tjd j	 j
ddD t fddt j
D | _d| _d S )	Nr   c                 S   s   g | ]}|  qS r   )item.0xr   r   r   
<listcomp>Z  s    z(BeitEncoder.__init__.<locals>.<listcomp>r   cpu)r'   c                    s(   g | ]}t   jrnd | dqS )N)r   r   )r   use_relative_position_biasr  irB   dprr   r   r   r  \  s    F)r4   r5   rB   !use_shared_relative_position_biasr   r   r   r*   linspacer   num_hidden_layersr   
ModuleListrangelayergradient_checkpointingr   r7   r  r   r5   R  s   
 

zBeitEncoder.__init__FTr9   r   output_hidden_statesrq   r   return_dictc                 C   s   |rdnd }|r
dnd }t | jD ]B\}	}
|r||f }| jr;|\}}|| jj || jj f}| j|||jd d}nd }|
|||||d}|d }|rS||d f }q|r[||f }|sitdd |||fD S t|||dS )	Nr   r   )rq   r   r   r   c                 s   s    | ]	}|d ur|V  qd S r3   r   )r  vr   r   r   	<genexpr>  s    z&BeitEncoder.forward.<locals>.<genexpr>)last_hidden_stater9   
attentions)		enumerater  r   rB   rK   r   r(   r   r   )r6   r9   r   r  rq   r   r  all_hidden_statesall_self_attentionsr  layer_modulerX   rY   r   r   layer_outputsr   r   r   r;   g  s@   	

zBeitEncoder.forwardr3   )FFFNT)r   r   r   r   r   r5   r*   r   r   r   r   r;   r@   r   r   r7   r   r  Q  s,     r  c                       sN   e Zd ZU eed< dZdZdZdZdgZ	dgZ
dZe  fdd	Z  ZS )
BeitPreTrainedModelrB   beit)imagerr   Tr   z.*relative_position_index.*c                    s   t  | t|tr+t|j |jdurt|j |jdur)t|j dS dS t|t	r8t|j
 dS t|trV|jdurXt|j| jj t|j| jj dS dS dS )zInitialize the weightsN)r4   _init_weightsrL   rA   initzeros_rF   rH   rS   r   r   r   r   	constant_rB   r   r   )r6   moduler7   r   r   r,    s    





z!BeitPreTrainedModel._init_weights)r   r   r   r   __annotations__base_model_prefixinput_modalitiesmain_input_namesupports_gradient_checkpointing_no_split_modules"_keys_to_ignore_on_load_unexpected_supports_sdpar*   no_gradr,  r@   r   r   r7   r   r)    s   
 r)  c                       s   e Zd Zddededdf fddZdd	 Ze				
	ddej	dej
dB dedB dedB dededB deeB fddZ  ZS )	BeitModelTrB   add_pooling_layerr$   Nc                    sp   t  | || _t|| _t|| jjjd| _|j	rt
 nt
j|j|jd| _|r/t|nd| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   r   N)r4   r5   rB   rA   rW   r  rJ   r   encoderuse_mean_poolingr   r   r   rE   r   	layernorm
BeitPoolerpooler	post_init)r6   rB   r;  r7   r   r   r5     s   
zBeitModel.__init__c                 C      | j jS r3   rW   rJ   r<   r   r   r   get_input_embeddings     zBeitModel.get_input_embeddingsFrr   rs   r   r  rq   r  c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| j||d\}}	|jdd }
| j||||
||d}|d }| |}| jdurL| |nd}|sc|durX||fn|f}||dd  S t	|||j
|jdS )z


class BeitPooler(nn.Module):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__()
        self.layernorm = (
            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if self.layernorm is not None:
            # Mean pool the final hidden states of the patch tokens
            patch_tokens = hidden_states[:, 1:, :]
            pooled_output = self.layernorm(patch_tokens.mean(1))
        else:
            # Pool by simply taking the final hidden state of the [CLS] token
            pooled_output = hidden_states[:, 0]

        return pooled_output


@auto_docstring(
    custom_intro="""
    Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting
    visual tokens of a Vector-Quantize Variational Autoencoder (VQ-VAE), whereas other vision models like ViT and DeiT
    predict RGB pixel values. As a result, this class is incompatible with [`AutoModelForMaskedImageModeling`], so you
    will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.
    """
)
class BeitForMaskedImageModeling(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.beit = BeitModel(config, add_pooling_layer=False)

        # Classifier head
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return None

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor | None = None,
        bool_masked_pos: torch.BoolTensor | None = None,
        labels: torch.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        interpolate_pos_encoding: bool = False,
        return_dict: bool | None = None,
        **kwargs,
    ) -> tuple | MaskedLMOutput:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
        >>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, logits = outputs.loss, outputs.logits
        >>> list(logits.shape)
        [1, 196, 8192]
        ```"""
        ...


@auto_docstring(
    custom_intro="""
    Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final
    hidden states of the patch tokens) e.g. for ImageNet.
    """
)
class BeitForImageClassification(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.beit = BeitModel(config, add_pooling_layer=True)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        interpolate_pos_encoding: bool = False,
        return_dict: bool | None = None,
        **kwargs,
    ) -> tuple | ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        ...


class BeitConvModule(nn.Module):
    """
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int | tuple[int, int],
        padding: int | tuple[int, int] | str = 0,
        bias: bool = False,
        dilation: int | tuple[int, int] = 1,
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding=padding,
            bias=bias,
            dilation=dilation,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = self.conv(input)
        output = self.bn(output)
        output = self.activation(output)

        return output


class BeitPyramidPoolingBlock(nn.Module):
    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        self.layers = [
            nn.AdaptiveAvgPool2d(pool_scale),
            BeitConvModule(in_channels, channels, kernel_size=1),
        ]
        for i, layer in enumerate(self.layers):
            self.add_module(str(i), layer)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        hidden_state = input
        for layer in self.layers:
            hidden_state = layer(hidden_state)
        return hidden_state


class BeitPyramidPoolingModule(nn.Module):
    """
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, pool_scales: tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
        super().__init__()
        self.pool_scales = pool_scales
        self.align_corners = align_corners
        self.in_channels = in_channels
        self.channels = channels
        self.blocks = []
        for i, pool_scale in enumerate(pool_scales):
            block = BeitPyramidPoolingBlock(pool_scale=pool_scale, in_channels=in_channels, channels=channels)
            self.blocks.append(block)
            self.add_module(str(i), block)

    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
        ppm_outs = []
        for ppm in self.blocks:
            ppm_out = ppm(x)
            upsampled_ppm_out = nn.functional.interpolate(
                ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
            )
            ppm_outs.append(upsampled_ppm_out)
        return ppm_outs


class BeitUperHead(nn.Module):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://huggingface.co/papers/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, config: BeitConfig) -> None:
        super().__init__()

        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = [config.hidden_size] * 4  # e.g. [768, 768, 768, 768]
        self.channels = config.hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        # PSP Module
        self.psp_modules = BeitPyramidPoolingModule(
            self.pool_scales,
            self.in_channels[-1],
            self.channels,
            align_corners=self.align_corners,
        )
        self.bottleneck = BeitConvModule(
            self.in_channels[-1] + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )
        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels[:-1]:  # skip the top layer
            l_conv = BeitConvModule(in_channels, self.channels, kernel_size=1)
            fpn_conv = BeitConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        self.fpn_bottleneck = BeitConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

    def psp_forward(self, inputs):
        x = inputs[-1]
        psp_outs = [x]
        psp_outs.extend(self.psp_modules(x))
        psp_outs = torch.cat(psp_outs, dim=1)
        output = self.bottleneck(psp_outs)

        return output

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # build laterals
        laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]
        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
                laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
            )

        # build outputs
        fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
        # append psp feature
        fpn_outs.append(laterals[-1])

        for i in range(used_backbone_levels - 1, 0, -1):
            fpn_outs[i] = nn.functional.interpolate(
                fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
            )
        fpn_outs = torch.cat(fpn_outs, dim=1)
        output = self.fpn_bottleneck(fpn_outs)
        output = self.classifier(output)

        return output


class BeitFCNHead(nn.Module):
    """
    Fully Convolution Networks for Semantic Segmentation. This head is the implementation of
    [FCNNet](https://huggingface.co/papers/1411.4038).

    Args:
        config (BeitConfig): Configuration.
        in_index (int): Index of the feature map to use. Default: 2.
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self, config: BeitConfig, in_index: int = 2, kernel_size: int = 3, dilation: int | tuple[int, int] = 1
    ) -> None:
        super().__init__()
        self.in_channels = config.hidden_size
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        conv_padding = (kernel_size // 2) * dilation
        convs = []
        convs.append(
            BeitConvModule(
                self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
            )
        )
        for i in range(self.num_convs - 1):
            convs.append(
                BeitConvModule(
                    self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
                )
            )
        if self.num_convs == 0:
            self.convs = nn.Identity()
        else:
            self.convs = nn.Sequential(*convs)
        if self.concat_input:
            self.conv_cat = BeitConvModule(
                self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
            )

        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # just take the relevant feature maps
        hidden_states = encoder_hidden_states[self.in_index]
        output = self.convs(hidden_states)
        if self.concat_input:
            output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
        output = self.classifier(output)
        return output


@auto_docstring
class BeitForSemanticSegmentation(BeitPreTrainedModel):
    def __init__(self, config: BeitConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.beit = BeitModel(config, add_pooling_layer=False)

        if len(self.config.out_indices) != 4:
            raise ValueError(
                "BeitForSemanticSegmentation requires config.out_indices to be a list of 4 integers, specifying "
                "which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized "
                "architecture."
            )

        # FPN feature pyramid on top of the backbone: upsample x4, upsample x2, identity, downsample x2
        self.fpn1 = nn.Sequential(
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
            nn.BatchNorm2d(config.hidden_size),
            nn.GELU(),
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
        )
        self.fpn2 = nn.Sequential(
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
        )
        self.fpn3 = nn.Identity()
        self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Semantic segmentation head(s)
        self.decode_head = BeitUperHead(config)
        self.auxiliary_head = BeitFCNHead(config) if config.use_auxiliary_head else None

        # Initialize weights and apply final processing
        self.post_init()

    def compute_loss(self, logits, auxiliary_logits, labels):
        # upsample logits to the images' original size
        upsampled_logits = nn.functional.interpolate(
            logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
        )
        if auxiliary_logits is not None:
            upsampled_auxiliary_logits = nn.functional.interpolate(
                auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
            )
        # compute weighted loss
        loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
        main_loss = loss_fct(upsampled_logits, labels)
        loss = main_loss
        if auxiliary_logits is not None:
            auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
            loss += self.config.auxiliary_loss_weight * auxiliary_loss

        return loss

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        interpolate_pos_encoding: bool = False,
        return_dict: bool | None = None,
        **kwargs,
    ) -> tuple | SemanticSegmenterOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForSemanticSegmentation
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
        >>> model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```"""
        ...


@auto_docstring(
    custom_intro="""
    BEiT backbone, to be used with frameworks like DETR and MaskFormer.
    """
)
class BeitBackbone(BeitPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
        self.embeddings = BeitEmbeddings(config)
        self.encoder = BeitEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape)

        if config.add_fpn:
            if len(self.config.out_indices) != 4:
                raise ValueError(
                    "BeitBackbone requires config.out_indices to be a list of 4 integers, specifying "
                    "which features to use from the backbone. One can use [3, 5, 7, 11] in case of "
                    "a base-sized architecture."
                )
            hidden_size = config.hidden_size
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2),
                nn.BatchNorm2d(hidden_size, eps=config.batch_norm_eps),
                nn.GELU(),
                nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Sequential(nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2))
            self.fpn3 = nn.Identity()
            self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @auto_docstring
    def forward(
        self,
        pixel_values: Tensor,
        output_hidden_states: bool | None = None,
        output_attentions: bool | None = None,
        return_dict: bool | None = None,
        **kwargs,
    ) -> BackboneOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/beit-base-patch16-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```"""
        ...


__all__ = [
    "BeitForImageClassification",
    "BeitForMaskedImageModeling",
    "BeitForSemanticSegmentation",
    "BeitModel",
    "BeitPreTrainedModel",
    "BeitBackbone",
]