o
    eidA                     @   sp  d Z ddlZddlmZ ddlmZ ddlmZmZmZ ddl	m
Z
mZmZ ddlmZ dd	lmZ d
dlmZ d
dlmZmZmZ d
dlmZmZmZ eeZG dd deZG dd deZG dd dej Z!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd dej Z&G dd deZ'e
G d d! d!e'Z(e
d"d#G d$d% d%eZ)g d&Z*dS )'zPyTorch Pixio model.    N)nn   )GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPooling)auto_docstring
is_tracinglogging)merge_with_config_defaults)capture_outputs   )Dinov2Config)Dinov2BackboneDinov2DropPath	Dinov2MLP)ViTAttentionViTPatchEmbeddingsViTPreTrainedModelc                       sL   e Zd ZdZdZ											
											d fdd	Z  ZS )PixioConfiga  
    This is the configuration class to store the configuration of a [`PixioModel`]. It is used to instantiate a
    Pixio model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the ViT
    [facebook/pixio-huge](https://huggingface.co/facebook/pixio-huge) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        mlp_ratio (`int`, *optional*, defaults to 4):
            Ratio of the hidden size of the MLPs relative to the `hidden_size`.
        n_cls_tokens (`int`, *optional*, defaults to 8):
            Number of class tokens in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        image_size (`int`, *optional*, defaults to 256):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the queries, keys and values.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate per sample (when applied in the main path of residual layers).
        out_features (`list[str]`, *optional*):
            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
            same order as defined in the `stage_names` attribute.
        out_indices (`list[int]`, *optional*):
            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
            If unset and `out_features` is unset, will default to the last stage. Must be in the
            same order as defined in the `stage_names` attribute.
        apply_layernorm (`bool`, *optional*, defaults to `True`):
            Whether to apply layer normalization to the feature maps in case the model is used as backbone.
        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
            seq_len, hidden_size)`.

    Example:

    ```python
    >>> from transformers import PixioConfig, PixioModel

    >>> # Initializing a Pixio pixio-huge style configuration
    >>> configuration = PixioConfig()

    >>> # Initializing a model (with random weights) from the pixio-huge style configuration
    >>> model = PixioModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```pixio                gelu        {Gz?ư>   r   TNc                    s   t  jdi d|d|d|d|d|d|d|d|	d	|
d
|d|d|d|d|d|d| || _| `| `| `d S )Nhidden_sizenum_hidden_layersnum_attention_heads	mlp_ratio
hidden_acthidden_dropout_probattention_probs_dropout_probinitializer_rangelayer_norm_eps
image_size
patch_sizenum_channelsqkv_biasdrop_path_rateapply_layernormreshape_hidden_states )super__init__n_cls_tokenslayerscale_valueuse_swiglu_ffnuse_mask_token)selfr!   r"   r#   r$   r4   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   out_featuresout_indicesr/   r0   kwargs	__class__r1   e/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/pixio/modular_pixio.pyr3   p   sJ   	
zPixioConfig.__init__)r   r   r   r   r   r   r   r   r   r   r    r   r   Tr   NNTT)__name__
__module____qualname____doc__
model_typer3   __classcell__r1   r1   r<   r>   r   $   s.    Ir   c                   @      e Zd ZdS )PixioPatchEmbeddingsNr?   r@   rA   r1   r1   r1   r>   rF          rF   c                       s^   e Zd ZdZdeddf fddZdejded	edejfd
dZ	dejdejfddZ
  ZS )PixioEmbeddingszB
    Construct the CLS tokens, position and patch embeddings.
    configreturnNc                    s   t    ttd|j|j| _d | _	t
|| _| jj}ttd||j |j| _t|j| _|j| _|j| _|| _d S )N   )r2   r3   r   	Parametertorchrandnr4   r!   	cls_token
mask_tokenrF   patch_embeddingsnum_patchesposition_embeddingsDropoutr&   dropoutr+   rJ   )r8   rJ   rS   r<   r1   r>   r3      s   


zPixioEmbeddings.__init__
embeddingsheightwidthc                 C   s  |j d | j }| jj d | j }t s||kr||kr| jS | jddd| jf }| jdd| jdf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}|j}t	j
j|tj|	|
fdd	d
j|d}|dddddd|}tj||fddS )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support tracing and interpolation at torch.float32 precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        rL   Ng      ?r   r   r   bicubicF)sizemodealign_cornersdtypedim)shaper4   rT   r	   r+   intreshapepermuter`   r   
functionalinterpolatetorN   float32viewcat)r8   rW   rX   rY   rS   num_positionsclass_pos_embedpatch_pos_embedrb   
new_height	new_widthsqrt_num_positionstarget_dtyper1   r1   r>   interpolate_pos_encoding   s.   	



z(PixioEmbeddings.interpolate_pos_encodingpixel_valuesc           	      C   sn   |j \}}}}| jjjj}| |j|d}| j|dd}tj	||fdd}|| 
||| }| |}|S )Nr_   rZ   rL   ra   )rc   rR   
projectionweightr`   ri   rP   expandrN   rl   rt   rV   )	r8   ru   
batch_size_rX   rY   rs   rW   
cls_tokensr1   r1   r>   forward   s   
zPixioEmbeddings.forward)r?   r@   rA   rB   r   r3   rN   Tensorrd   rt   r|   rD   r1   r1   r<   r>   rI      s
    &rI   c                   @   rE   )PixioAttentionNrG   r1   r1   r1   r>   r~      rH   r~   c                   @   rE   )PixioDropPathNrG   r1   r1   r1   r>   r      rH   r   c                   @   rE   )PixioMLPNrG   r1   r1   r1   r>   r      rH   r   c                       s<   e Zd Zdeddf fddZdejdejfddZ  ZS )	
PixioLayerrJ   rK   Nc                    sh   t    tj|j|jd| _t|| _|j	dkrt
|j	nt | _tj|j|jd| _t|| _d S )Nepsr   )r2   r3   r   	LayerNormr!   r)   norm1r~   	attentionr.   r   Identity	drop_pathnorm2r   mlpr8   rJ   r<   r1   r>   r3      s   

zPixioLayer.__init__hidden_statesc                 C   sH   |  |}| |}| || }| |}| |}| || }|S N)r   r   r   r   r   )r8   r   hidden_states_normself_attention_outputlayer_outputr1   r1   r>   r|     s   



zPixioLayer.forward)	r?   r@   rA   r   r3   rN   r}   r|   rD   r1   r1   r<   r>   r      s    
r   c                       s<   e Zd Zdef fddZd
dejdedefdd	Z	  Z
S )PixioEncoderrJ   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r1   )r   ).0rz   rJ   r1   r>   
<listcomp>  s    z)PixioEncoder.__init__.<locals>.<listcomp>F)	r2   r3   rJ   r   
ModuleListranger"   layergradient_checkpointingr   r<   r   r>   r3     s   
 
zPixioEncoder.__init__Fr   output_hidden_statesrK   c                 C   sT   |r|gnd }t | jD ]\}}||}|r|| qt||r&t|dS d dS )N)last_hidden_stater   )	enumerater   appendr   tuple)r8   r   r   all_hidden_statesilayer_moduler1   r1   r>   r|     s   

zPixioEncoder.forward)F)r?   r@   rA   r   r3   rN   r}   boolr   r|   rD   r1   r1   r<   r>   r     s    "r   c                   @   rE   )PixioPreTrainedModelNrG   r1   r1   r1   r>   r   %  rH   r   c                
       sh   e Zd Zdef fddZdefddZeedde					dd
e
jd	B ded	B defddZ  ZS )
PixioModelrJ   c                    sF   t  | || _t|| _t|| _tj|j	|j
d| _|   d S )Nr   )r2   r3   rJ   rI   rW   r   encoderr   r   r!   r)   	layernorm	post_initr   r<   r1   r>   r3   +  s   

zPixioModel.__init__rK   c                 C   s   | j jS r   )rW   rR   )r8   r1   r1   r>   get_input_embeddings6  s   zPixioModel.get_input_embeddingsF)tie_last_hidden_statesNru   r   c                 K   s~   |d u r| j j}|d u rtd| |}| j||d}|j}| |}|d d d | jjd d f jdd}t	|||j
dS )Nz You have to specify pixel_valuesr   rL   ra   )r   pooler_outputr   )rJ   r   
ValueErrorrW   r   r   r   r4   meanr   r   )r8   ru   r   r;   embedding_outputencoder_outputssequence_outputpooled_outputr1   r1   r>   r|   9  s   	

&zPixioModel.forward)NN)r?   r@   rA   r   r3   rF   r   r   r   r   rN   r}   r   r   r|   rD   r1   r1   r<   r>   r   )  s    r   zN
    Pixio backbone, to be used with frameworks like DETR and MaskFormer.
    )custom_introc                
   @   s8   e Zd Zeee	ddejdedB de	fddZ
dS )PixioBackboneNru   r   rK   c                 K   s   |du r| j j}| |}| j|dd}|j}g }t| j|D ]H\}}	|| jv rg| j jr1| 	|	}	| j j
rb|	dd| jjdf }	|j\}
}}}| j j}|	|
|| || d}	|	dddd }	||	 qtt||rr|d	S dd	S )
aw  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("facebook/pixio-huge")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/pixio-huge", out_features=["stage7", "stage15", "stage23", "stage31"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 1280, 16, 16]
        ```NTr   rZ   r   r   rL   r   )feature_mapsr   )rJ   r   rW   r   r   zipstage_namesr9   r/   r   r0   r4   rc   r+   re   rf   
contiguousr   r   r   )r8   ru   r   r;   r   outputr   r   stagehidden_statery   rz   rX   rY   r+   r1   r1   r>   r|   \  s0    



zPixioBackbone.forwardr   )r?   r@   rA   r   r   r   rN   r}   r   r   r|   r1   r1   r1   r>   r   V  s    r   )r   r   r   r   )+rB   rN   r   modeling_layersr   modeling_outputsr   r   r   utilsr   r	   r
   utils.genericr   utils.output_capturingr   dinov2.configuration_dinov2r   dinov2.modeling_dinov2r   r   r   vit.modeling_vitr   r   r   
get_loggerr?   loggerr   rF   ModulerI   r~   r   r   r   r   r   r   r   __all__r1   r1   r1   r>   <module>   s8   
}G,;