o
    eiR                     @   s  d dl Zd dl mZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlm Z  G dd dej!Z"G dd dej!Z#		d>dej!dej$dej$dej$dej$dB de%dB de%dee fddZ&G d d! d!ej!Z'G d"d# d#ej!Z(G d$d% d%ej!Z)d?d'ej$d(e%d)e*d*ej$fd+d,Z+G d-d. d.ej!Z,G d/d0 d0ej!Z-G d1d2 d2eZ.G d3d4 d4ej!Z/eG d5d6 d6eZ0eG d7d8 d8e0Z1ed9d:G d;d< d<ee0Z2g d=Z3dS )@    N)Callable)nn   )initialization)ACT2FN)BackboneMixin)GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring
is_tracing)merge_with_config_defaults)capture_outputs   )PixioConfigc                       sB   e Zd ZdZdef fddZddejdedejfd	d
Z	  Z
S )PixioPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    configc                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)selfr   r   r   r   r   r$   	__class__ f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/pixio/modeling_pixio.pyr   /   s   
 zPixioPatchEmbeddings.__init__Fpixel_valuesinterpolate_pos_encodingreturnc              
   C   s   |j \}}}}|| jkrtd| j d| d|s?|| jd ks(|| jd kr?td| d| d| jd  d| jd  d		| |d
dd
}|S )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).   )shaper   
ValueErrorr   r&   flatten	transpose)r'   r,   r-   
batch_sizer   heightwidth
embeddingsr*   r*   r+   forward>   s(   
zPixioPatchEmbeddings.forwardF)__name__
__module____qualname____doc__r   r   torchTensorboolr:   __classcell__r*   r*   r(   r+   r   (   s    $r   c                       s^   e Zd ZdZdeddf fddZdejded	edejfd
dZ	dejdejfddZ
  ZS )PixioEmbeddingszB
    Construct the CLS tokens, position and patch embeddings.
    r   r.   Nc                    s   t    ttd|j|j| _d | _	t
|| _| jj}ttd||j |j| _t|j| _|j| _|j| _|| _d S )Nr   )r   r   r   	Parameterr@   randnn_cls_tokensr   	cls_token
mask_tokenr   patch_embeddingsr$   position_embeddingsDropouthidden_dropout_probdropoutr   r   )r'   r   r$   r(   r*   r+   r   T   s   


zPixioEmbeddings.__init__r9   r7   r8   c                 C   s  |j d | j }| jj d | j }t s||kr||kr| jS | jddd| jf }| jdd| jdf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}|j}t	j
j|tj|	|
fdd	d
j|d}|dddddd|}tj||fddS )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support tracing and interpolation at torch.float32 precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r   r1   bicubicF)sizemodealign_cornersdtypedim)r2   rG   rK   r   r   intreshapepermuterU   r   
functionalinterpolatetor@   float32viewcat)r'   r9   r7   r8   r$   num_positionsclass_pos_embedpatch_pos_embedrW   
new_height	new_widthsqrt_num_positionstarget_dtyper*   r*   r+   r-   a   s.   	



z(PixioEmbeddings.interpolate_pos_encodingr,   c           	      C   sn   |j \}}}}| jjjj}| |j|d}| j|dd}tj	||fdd}|| 
||| }| |}|S )NrT   rO   r   rV   )r2   rJ   r&   weightrU   r]   rH   expandr@   r`   r-   rN   )	r'   r,   r6   _r7   r8   rg   r9   
cls_tokensr*   r*   r+   r:      s   
zPixioEmbeddings.forward)r<   r=   r>   r?   r   r   r@   rA   rX   r-   r:   rC   r*   r*   r(   r+   rD   O   s
    &rD           modulequerykeyvalueattention_maskscalingrN   kwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )NrO         r1   r   rV   )ptrainingr   )
rQ   r@   matmulr5   r   r[   softmaxrN   rv   
contiguous)
rm   rn   ro   rp   rq   rr   rN   rs   attn_weightsattn_outputr*   r*   r+   eager_attention_forward   s   
r|   c                       sB   e Zd Zdef fddZdejdeejejf fddZ  Z	S )PixioSelfAttentionr   c                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r/   rt   Fbias)r   r   r   num_attention_headshasattrr3   r   rX   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probrr   	is_causalr   Linearqkv_biasrn   ro   rp   r'   r   r(   r*   r+   r      s"   

zPixioSelfAttention.__init__hidden_statesr.   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t	| j
jt}|| |||d | j| j| jsHdn| jd\}}	| d d | jf }
||
}||	fS )Nr   rO   r   r1   rl   )r   rr   rN   )r2   r   r   ro   r_   r5   rp   rn   r   get_interfacer   _attn_implementationr|   r   rr   rv   r   rQ   r   rY   )r'   r   r6   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper*   r*   r+   r:      s*   


zPixioSelfAttention.forward)
r<   r=   r>   r   r   r@   rA   tupler:   rC   r*   r*   r(   r+   r}      s    (r}   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
PixioSelfOutputz
    The residual connection is defined in PixioLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                    s.   t    t|j|j| _t|j| _d S N)	r   r   r   r   r   denserL   rM   rN   r   r(   r*   r+   r      s   
zPixioSelfOutput.__init__r   input_tensorr.   c                 C   s   |  |}| |}|S r   )r   rN   )r'   r   r   r*   r*   r+   r:      s   

zPixioSelfOutput.forward)
r<   r=   r>   r?   r   r   r@   rA   r:   rC   r*   r*   r(   r+   r      s    $r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )PixioAttentionr   c                    s"   t    t|| _t|| _d S r   )r   r   r}   	attentionr   outputr   r(   r*   r+   r      s   

zPixioAttention.__init__r   r.   c                 C   s   |  |\}}| ||}|S r   )r   r   )r'   r   self_attn_outputrj   r   r*   r*   r+   r:      s   zPixioAttention.forward	r<   r=   r>   r   r   r@   rA   r:   rC   r*   r*   r(   r+   r      s    r   Finput	drop_probrv   r.   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    rl   r   r   )r   )rU   device)r2   ndimr@   randrU   r   floor_div)r   r   rv   	keep_probr2   random_tensorr   r*   r*   r+   	drop_path  s   r   c                       sT   e Zd ZdZddedB ddf fddZdejdejfdd	Zde	fd
dZ
  ZS )PixioDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r.   c                    s   t    || _d S r   )r   r   r   )r'   r   r(   r*   r+   r     s   

zPixioDropPath.__init__r   c                 C   s   t || j| jS r   )r   r   rv   )r'   r   r*   r*   r+   r:     s   zPixioDropPath.forwardc                 C   s   d| j  S )Nzp=)r   r'   r*   r*   r+   
extra_repr  s   zPixioDropPath.extra_reprr   )r<   r=   r>   r?   floatr   r@   rA   r:   strr   rC   r*   r*   r(   r+   r     s
    r   c                       s4   e Zd Zd fddZdejdejfddZ  ZS )	PixioMLPr.   Nc                    sn   t    |j }}t|j|j }tj||dd| _t|j	t
r(t|j	 | _n|j	| _tj||dd| _d S )NTr   )r   r   r   rX   	mlp_ratior   r   fc1r    
hidden_actr   r   
activationfc2)r'   r   in_featuresout_featureshidden_featuresr(   r*   r+   r     s   

zPixioMLP.__init__hidden_statec                 C   s"   |  |}| |}| |}|S r   )r   r   r   )r'   r   r*   r*   r+   r:   *  s   


zPixioMLP.forward)r.   N)r<   r=   r>   r   r@   rA   r:   rC   r*   r*   r(   r+   r     s    r   c                       s<   e Zd Zdeddf fddZdejdejfddZ  ZS )	
PixioLayerr   r.   Nc                    sh   t    tj|j|jd| _t|| _|j	dkrt
|j	nt | _tj|j|jd| _t|| _d S )Nepsrl   )r   r   r   	LayerNormr   layer_norm_epsnorm1r   r   drop_path_rater   Identityr   norm2r   mlpr   r(   r*   r+   r   2  s   

zPixioLayer.__init__r   c                 C   sH   |  |}| |}| || }| |}| |}| || }|S r   )r   r   r   r   r   )r'   r   hidden_states_normself_attention_outputlayer_outputr*   r*   r+   r:   <  s   



zPixioLayer.forwardr   r*   r*   r(   r+   r   1  s    
r   c                       s<   e Zd Zdef fddZd
dejdedefdd	Z	  Z
S )PixioEncoderr   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r*   )r   .0rj   r   r*   r+   
<listcomp>N  s    z)PixioEncoder.__init__.<locals>.<listcomp>F)	r   r   r   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r(   r   r+   r   K  s   
 
zPixioEncoder.__init__Fr   output_hidden_statesr.   c                 C   sT   |r|gnd }t | jD ]\}}||}|r|| qt||r&t|dS d dS )N)last_hidden_stater   )	enumerater   appendr
   r   )r'   r   r   all_hidden_statesilayer_moduler*   r*   r+   r:   Q  s   

zPixioEncoder.forwardr;   )r<   r=   r>   r   r   r@   rA   rB   r
   r:   rC   r*   r*   r(   r+   r   J  s    "r   c                   @   sl   e Zd ZU eed< dZdZdZdZddgZ	dZ
dZdZdZeedZe d	ejejB ejB fd
dZdS )PixioPreTrainedModelr   pixior,   )imageTrD   r   )r   
attentionsrm   c                 C   s   t |tjtjfr#tj|jd| jjd |j	dur!t
|j	 dS dS t |tjr7t
|j	 t|j dS t |tr_tj|jd| jjd tj|jd| jjd |jdurat
|j dS dS dS )zInitialize the weightsrl   )meanstdN)r    r   r   r%   inittrunc_normal_rh   r   initializer_ranger   zeros_r   ones_rD   rK   rH   rI   )r'   rm   r*   r*   r+   _init_weightso  s   


z"PixioPreTrainedModel._init_weightsN)r<   r=   r>   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r}   _can_record_outputsr@   no_gradr   r   r%   r   r   r*   r*   r*   r+   r   ^  s    
 "r   c                
       sh   e Zd Zdef fddZdefddZeedde					dd
e
jd	B ded	B defddZ  ZS )
PixioModelr   c                    sF   t  | || _t|| _t|| _tj|j	|j
d| _|   d S )Nr   )r   r   r   rD   r9   r   encoderr   r   r   r   	layernorm	post_initr   r(   r*   r+   r     s   

zPixioModel.__init__r.   c                 C      | j jS r   r9   rJ   r   r*   r*   r+   get_input_embeddings     zPixioModel.get_input_embeddingsF)tie_last_hidden_statesNr,   r   c                 K   s~   |d u r| j j}|d u rtd| |}| j||d}|j}| |}|d d d | jjd d f jdd}t	|||j
dS )Nz You have to specify pixel_valuesr   r   rV   )r   pooler_outputr   )r   r   r3   r9   r   r   r   rG   r   r   r   )r'   r,   r   rs   embedding_outputencoder_outputssequence_outputpooled_outputr*   r*   r+   r:     s   	

&zPixioModel.forward)NN)r<   r=   r>   r   r   r   r   r   r   r   r@   rA   rB   r   r:   rC   r*   r*   r(   r+   r     s    r   zN
    Pixio backbone, to be used with frameworks like DETR and MaskFormer.
    )custom_introc                
       sV   e Zd Z fddZdefddZeee	dde	j
dedB defd	d
Z  ZS )PixioBackbonec                    s^   t     fddt jd D | _t | _t | _t	j
 j jd| _|   d S )Nc                    s   g | ]} j qS r*   )r   r   r   r*   r+   r     s    z*PixioBackbone.__init__.<locals>.<listcomp>r   r   )r   r   r   r   num_featuresrD   r9   r   r   r   r   r   r   r   r   r   r(   r   r+   r     s   

zPixioBackbone.__init__r.   c                 C   r   r   r   r   r*   r*   r+   r     r   z"PixioBackbone.get_input_embeddingsNr,   r   c                 K   s   |du r| j j}| |}| j|dd}|j}g }t| j|D ]H\}}	|| jv rg| j jr1| 	|	}	| j j
rb|	dd| jjdf }	|j\}
}}}| j j}|	|
|| || d}	|	dddd }	||	 qtt||rr|d	S dd	S )
aw  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("facebook/pixio-huge")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/pixio-huge", out_features=["stage7", "stage15", "stage23", "stage31"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 1280, 16, 16]
        ```NTr   rO   r   r   r   r1   )feature_mapsr   )r   r   r9   r   r   zipstage_namesr   apply_layernormr   reshape_hidden_statesrG   r2   r   rY   rZ   ry   r   r	   r   )r'   r,   r   rs   r   r   r   r  stager   r6   rj   r7   r8   r   r*   r*   r+   r:     s0    



zPixioBackbone.forwardr   )r<   r=   r>   r   r   r   r   r   r   r@   rA   rB   r	   r:   rC   r*   r*   r(   r+   r     s    r   )r   r   r   )Nrl   )rl   F)4collections.abcr!   r   r@   r    r   r   activationsr   backbone_utilsr   modeling_layersr   modeling_outputsr	   r
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   configuration_pixior   Moduler   rD   rA   r   r|   r}   r   r   rB   r   r   r   r   r   r   r   r   __all__r*   r*   r*   r+   <module>   sj   'M
2 !,J