o
    wi                     @   s  d Z ddlZddlmZ ddlmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZmZ ddlmZmZmZm Z  ddl!m"Z" ddl#m$Z$ e%e&Z'eeddG dd deZ(eeddG dd deZ)G dd dej*Z+G dd dej*Z,G dd dej*Z-	dVdej*d e	j.d!e	j.d"e	j.d#ee	j. d$e/d%e/fd&d'Z0G d(d) d)ej*Z1G d*d+ d+ej*Z2G d,d- d-ej*Z3G d.d/ d/ej*Z4G d0d1 d1ej*Z5G d2d3 d3eZ6G d4d5 d5ej*Z7G d6d7 d7ej*Z8d8d9 Z9G d:d; d;ej*Z:G d<d= d=ej*Z;G d>d? d?ej*Z<G d@dA dAej*Z=eG dBdC dCeZ>eG dDdE dEe>Z?G dFdG dGej*Z@G dHdI dIej*ZAG dJdK dKej*ZBedLdG dMdN dNe>ZCG dOdP dPej*ZDG dQdR dRej*ZEeG dSdT dTe>ZFg dUZGdS )WzPyTorch DPT (Dense Prediction Transformers) model.

This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.

    N)	dataclass)CallableOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputDepthEstimatorOutputSemanticSegmenterOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging	torch_int)load_backbone   )	DPTConfigz
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.:
    )custom_introc                   @   s>   e Zd ZU dZdZeej ed< dZ	ee
ejdf  ed< dS )*BaseModelOutputWithIntermediateActivationsak  
    last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_states.intermediate_activations)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   tuple r%   r%   a/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/dpt/modeling_dpt.pyr   ,   s   
 r   z
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used by the model at later stages.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dS )	4BaseModelOutputWithPoolingAndIntermediateActivationsa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) after further processing
        through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
        the classification token after processing through a linear layer and a tanh activation function. The linear
        layer weights are trained from the next sentence prediction (classification) objective during pretraining.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_statepooler_output.hidden_states
attentionsr   )r   r   r   r    r(   r   r!   r"   r#   r)   r*   r$   r+   r   r%   r%   r%   r&   r'   ?   s   
 
r'   c                	       sN   e Zd ZdZd fdd	ZdddZ	dd	ejd
ededejfddZ	  Z
S )DPTViTHybridEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    Nc           
         sj  t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }t
|| _| jjd }t| jjdkr[tdt| jj ddg| _|d u rr|j}	|	dd  }|	d }nt|tjj	r{|n||f}| jjd }|| _|d | _|| _tj||dd| _ttdd|j| _ttd|d |j| _d S )Nr   r   r   z1Expected backbone to have 3 output features, got kernel_size)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterabler   backbonechannelslen
ValueErrorresidual_feature_map_indexbackbone_featmap_shaper   Conv2d
projection	Parameterr!   zeros	cls_tokenposition_embeddings)
selfconfigfeature_sizer3   r4   r5   r6   num_patchesfeature_dimfeat_map_shape	__class__r%   r&   r2   _   s0   
 



 zDPTViTHybridEmbeddings.__init__r   c                 C   s   |d d d |f }|d|d f }t t|d }|d||ddddd}tjj|||fdd}|ddddd|| d}tj||gdd	}|S 
Nr         ?r   r-   r      bilinear)sizemodedim)	r   r=   reshapepermuter   
functionalinterpolater!   catrG   posembgrid_size_heightgrid_size_widthstart_index
posemb_tokposemb_gridold_grid_sizer%   r%   r&   _resize_pos_embed   s   z(DPTViTHybridEmbeddings._resize_pos_embedFpixel_valuesinterpolate_pos_encodingreturn_dictreturnc              
      s  |j \}}}}|| jkrtd|s7|| jd ks || jd kr7td| d| d| jd  d| jd  d	| | j|| j || j }| |  jd }	 fd	d
| j	D }
| 
|	ddd}| j|dd}tj||fdd}|| }|s||
fS t||
dS )NeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r-   c                    s   g | ]} j | qS r%   )feature_maps).0indexbackbone_outputr%   r&   
<listcomp>   s    z2DPTViTHybridEmbeddings.forward.<locals>.<listcomp>rQ   rU   )r   r   )shaper5   r>   r3   rd   rF   r4   r;   rk   r?   rB   flatten	transposerE   expandr!   r[   r   )rG   re   rf   rg   
batch_sizer5   heightwidthrF   featuresoutput_hidden_states
embeddings
cls_tokensr%   rn   r&   forward   s<   


zDPTViTHybridEmbeddings.forwardNr   )FF)r   r   r   r    r2   rd   r!   Tensorboolr|   __classcell__r%   r%   rM   r&   r,   X   s    
"r,   c                       s4   e Zd ZdZ fddZd
ddZddd	Z  ZS )DPTViTEmbeddingszB
    Construct the CLS token, position and patch embeddings.

    c                    sh   t    ttdd|j| _t|| _	| j	j
}ttd|d |j| _t|j| _|| _d S )Nr   )r1   r2   r   rC   r!   rD   r6   rE   DPTViTPatchEmbeddingspatch_embeddingsrJ   rF   Dropouthidden_dropout_probdropoutrH   )rG   rH   rJ   rM   r%   r&   r2      s   


zDPTViTEmbeddings.__init__r   c                 C   s   |d d d |f }|d|d f }t |dd }|d||ddddd}tjj|||fdd}|ddddd|| d}tj||gdd	}|S rO   )	r   rS   rW   rX   r   rY   rZ   r!   r[   r\   r%   r%   r&   rd      s   z"DPTViTEmbeddings._resize_pos_embedFc                 C   s   |j \}}}}| jj}| | j|| || }| |}	|	 \}}
}| j|dd}t	j
||	fdd}	|	| }	| |	}	|sB|	fS t|	dS )Nr-   r   rU   )r   )rq   rH   r4   rd   rF   r   rS   rE   rt   r!   r[   r   r   )rG   re   rg   ru   r5   rv   rw   r4   rF   rz   seq_len_r{   r%   r%   r&   r|      s   


zDPTViTEmbeddings.forwardr~   )F)r   r   r   r    r2   rd   r|   r   r%   r%   rM   r&   r      s
    

r   c                       s(   e Zd ZdZ fddZdd Z  ZS )r   z$
    Image to Patch Embedding.

    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )r0   stride)r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   rJ   r   rA   rB   )rG   rH   r3   r4   r5   r6   rJ   rM   r%   r&   r2      s   
 zDPTViTPatchEmbeddings.__init__c                 C   s<   |j \}}}}|| jkrtd| |ddd}|S )Nri   rQ   r   )rq   r5   r>   rB   rr   rs   )rG   re   ru   r5   rv   rw   rz   r%   r%   r&   r|   
  s   
zDPTViTPatchEmbeddings.forwardr   r   r   r    r2   r|   r   r%   r%   rM   r&   r      s    r           modulequerykeyvalueattention_maskscalingr   c           
      K   s|   t ||dd| }tjj|dt jd|j}tjj	||| j
d}|d ur,|| }t ||}	|	dd }	|	|fS )Nr-   r.   )rV   dtype)ptrainingr   rQ   )r!   matmulrs   r   rY   softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr%   r%   r&   eager_attention_forward  s   r   c                
       sv   e Zd Zdeddf fddZdejdejfddZ		dd
eej de	de
eejejf eej f fddZ  ZS )DPTSelfAttentionrH   rh   Nc                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      F)bias)r1   r2   r6   num_attention_headshasattrr>   rH   intattention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   rG   rH   rM   r%   r&   r2   5  s"   

zDPTSelfAttention.__init__xc                 C   s6   |  d d | j| jf }||}|ddddS )Nr-   r   rQ   r   r   )rS   r   r   viewrX   )rG   r   new_x_shaper%   r%   r&   transpose_for_scoresI  s   
z%DPTSelfAttention.transpose_for_scoresF	head_maskoutput_attentionsc              
   C   s   |  | |}|  | |}|  | |}t}| jjdkr4| jjdkr.|r.td nt	| jj }|| ||||| j
| j| jsCdn| jd\}}	| d d | jf }
||
}|rc||	f}|S |f}|S )Neagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   )r   r   r   r.   )r   r   r   r   r   rH   _attn_implementationloggerwarning_oncer   r   r   r   r   rS   r   rW   )rG   r*   r   r   	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapeoutputsr%   r%   r&   r|   N  s4   

zDPTSelfAttention.forwardNF)r   r   r   r   r2   r!   r   r   r   r   r   r$   r|   r   r%   r%   rM   r&   r   4  s    r   c                       sF   e Zd ZdZdeddf fddZdejdejdejfd	d
Z  Z	S )DPTViTSelfOutputz
    The residual connection is defined in DPTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rH   rh   Nc                    s.   t    t|j|j| _t|j| _d S r}   )	r1   r2   r   r   r6   denser   r   r   r   rM   r%   r&   r2   y     
zDPTViTSelfOutput.__init__r*   input_tensorc                 C      |  |}| |}|S r}   r   r   rG   r*   r   r%   r%   r&   r|   ~     

zDPTViTSelfOutput.forward)
r   r   r   r    r   r2   r!   r   r|   r   r%   r%   rM   r&   r   s  s    $r   c                       s~   e Zd Zdeddf fddZdee ddfddZ			dd
ej	de
ej	 dedeeej	ej	f eej	 f fddZ  ZS )DPTViTAttentionrH   rh   Nc                    s*   t    t|| _t|| _t | _d S r}   )r1   r2   r   	attentionr   outputsetpruned_headsr   rM   r%   r&   r2     s   


zDPTViTAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rU   )r=   r   r   r   r   r   r   r   r   r   r   r   r   union)rG   r   rm   r%   r%   r&   prune_heads  s   zDPTViTAttention.prune_headsFr*   r   r   c                 C   s4   |  |||}| |d |}|f|dd   }|S )Nr   r   )r   r   )rG   r*   r   r   self_outputsattention_outputr   r%   r%   r&   r|     s   zDPTViTAttention.forwardr   )r   r   r   r   r2   r   r   r   r!   r   r   r   r   r$   r|   r   r%   r%   rM   r&   r     s    r   c                       s<   e Zd Zdeddf fddZdejdejfddZ  ZS )	DPTViTIntermediaterH   rh   Nc                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r}   )r1   r2   r   r   r6   intermediate_sizer   r7   
hidden_actstrr	   intermediate_act_fnr   rM   r%   r&   r2     s
   
zDPTViTIntermediate.__init__r*   c                 C   r   r}   )r   r   )rG   r*   r%   r%   r&   r|     r   zDPTViTIntermediate.forward	r   r   r   r   r2   r!   r   r|   r   r%   r%   rM   r&   r     s    r   c                       sB   e Zd Zdeddf fddZdejdejdejfdd	Z  ZS )
DPTViTOutputrH   rh   Nc                    s.   t    t|j|j| _t|j| _	d S r}   )
r1   r2   r   r   r   r6   r   r   r   r   r   rM   r%   r&   r2     r   zDPTViTOutput.__init__r*   r   c                 C   s    |  |}| |}|| }|S r}   r   r   r%   r%   r&   r|     s   

zDPTViTOutput.forwardr   r%   r%   rM   r&   r     s    $r   c                       sl   e Zd ZdZdeddf fddZ		ddejd	eej d
e	de
eejejf eej f fddZ  ZS )DPTViTLayerz?This corresponds to the Block class in the timm implementation.rH   rh   Nc                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r1   r2   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr6   layer_norm_epslayernorm_beforelayernorm_afterr   rM   r%   r&   r2     s   



zDPTViTLayer.__init__Fr*   r   r   c                 C   s`   | j | |||d}|d }|dd  }|| }| |}| |}| ||}|f| }|S )N)r   r   r   )r   r   r   r   r   )rG   r*   r   r   self_attention_outputsr   r   layer_outputr%   r%   r&   r|     s   


zDPTViTLayer.forwardr   )r   r   r   r    r   r2   r!   r   r   r   r   r$   r|   r   r%   r%   rM   r&   r     s    r   c                       sb   e Zd Zdeddf fddZ				ddejd	eej d
ededede	e
ef fddZ  ZS )DPTViTEncoderrH   rh   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r%   )r   )rl   r   rH   r%   r&   rp     s    z*DPTViTEncoder.__init__.<locals>.<listcomp>F)	r1   r2   rH   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   rM   r   r&   r2     s   
 
zDPTViTEncoder.__init__FTr*   r   r   ry   rg   c                 C   s   |rdnd }|r
dnd }t | jD ](\}}	|r||f }|d ur$|| nd }
|	||
|}|d }|r9||d f }q|rA||f }|sOtdd |||fD S t|||dS )Nr%   r   r   c                 s   s    | ]	}|d ur|V  qd S r}   r%   )rl   vr%   r%   r&   	<genexpr>  s    z(DPTViTEncoder.forward.<locals>.<genexpr>)r(   r*   r+   )	enumerater   r$   r   )rG   r*   r   r   ry   rg   all_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputsr%   r%   r&   r|     s(   

zDPTViTEncoder.forward)NFFT)r   r   r   r   r2   r!   r   r   r   r   r$   r   r|   r   r%   r%   rM   r&   r     s&    	
r   c                       sP   e Zd ZdZ fddZdd Zdd Zdd	eej	 d
eej	 fddZ
  ZS )DPTReassembleStagea@  
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                    sB   t    || _t | _|jr| | n| | |j	| _	d S r}   )
r1   r2   rH   r   r   layers	is_hybrid_init_reassemble_dpt_hybrid_init_reassemble_dptneck_ignore_stagesr   rM   r%   r&   r2   6  s   


zDPTReassembleStage.__init__c              	   C   s   t tt|j|jD ]#\}}|dkr| jt  q|dkr.| jt	||j| |d q|j
dkr=td|j
 dt | _t|}tt|jD ])}|dkr_| jtt  qM|dkrv| jttd| |t|j  qMdS )a   "
        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
        for more details.
        r   r<   factorprojectzReadout type z! is not supported for DPT-Hybrid.rQ   N)zipr   r=   neck_hidden_sizesreassemble_factorsr   appendr   IdentityDPTReassembleLayerreadout_typer>   r   readout_projects_get_backbone_hidden_size
Sequentialr   r	   r   )rG   rH   r   r   r6   r%   r%   r&   r   B  s&   

z.DPTReassembleStage._init_reassemble_dpt_hybridc              	   C   s   t tt|j|jD ]\}}| jt||j| |d q|jdkrIt	
 | _t|}tt|jD ]}| jt	t	d| |t|j  q3d S d S )Nr   r   rQ   )r   r   r=   r  r  r   r  r  r  r   r   r  r  r	  r   r	   r   )rG   rH   r   r   r6   r   r%   r%   r&   r   \  s   

z'DPTReassembleStage._init_reassemble_dptNr*   rh   c                 C   sL  g }t |D ]\}}|| jvr|dddf |ddddf }}|j\}}	}
|dur9|dur9|||||
}nt|	d }|||||
}|dddd }|j}| jjdkr|	dd}|
d|}| j| t||fd	}|ddd|}n| jjd
kr|	d|
d	 }||}| j| |}|| q|S )z
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        Nr   r   rP   r   rQ   r   )r   rQ   r   r-   add)r   r   rq   rW   r   rX   r   rH   r  rr   	unsqueeze	expand_asr  r!   r[   r   r  )rG   r*   patch_heightpatch_widthoutr   hidden_staterE   ru   sequence_lengthr5   rS   feature_shapereadoutr%   r%   r&   r|   h  s,   
&
zDPTReassembleStage.forwardNN)r   r   r   r    r2   r   r   listr!   r   r|   r   r%   r%   rM   r&   r   &  s    (r   c                 C   s"   | j d ur| jdu r| j jS | jS r   )backbone_configr   r6   r   r%   r%   r&   r    s   r  c                       $   e Zd Z fddZdd Z  ZS )r  c                    s   t    t|}tj||dd| _|dkr#tj||||dd| _d S |dkr.t | _d S |dk rCtj||dt	d| dd| _d S d S )Nr   )in_channelsout_channelsr0   r   r0   r   paddingr   )
r1   r2   r  r   rA   rB   ConvTranspose2dresizer  r   )rG   rH   r<   r   r6   rM   r%   r&   r2     s   
"zDPTReassembleLayer.__init__c                 C   r   r}   )rB   r  )rG   r  r%   r%   r&   r|     s   

zDPTReassembleLayer.forwardr   r   r   r2   r|   r   r%   r%   rM   r&   r    s    r  c                       r  )DPTFeatureFusionStagec                    s<   t    t | _tt|jD ]
}| jt	| qd S r}   )
r1   r2   r   r   r   r   r=   r  r  DPTFeatureFusionLayer)rG   rH   r   rM   r%   r&   r2     s
   

zDPTFeatureFusionStage.__init__c                 C   sV   |d d d }g }d }t || jD ]\}}|d u r||}n|||}|| q|S )Nr-   )r   r   r  )rG   r*   fused_hidden_statesfused_hidden_stater  r   r%   r%   r&   r|     s   

zDPTFeatureFusionStage.forwardr  r%   r%   rM   r&   r    s    r  c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )DPTPreActResidualLayerz
    ResidualConvUnit, pre-activate residual unit.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                    s   t    |j| _|jd ur|jn| j }t | _tj|j	|j	ddd|d| _
t | _tj|j	|j	ddd|d| _| jrNt|j	| _t|j	| _d S d S )Nr   r   )r0   r   r  r   )r1   r2   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr   ReLUactivation1rA   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm1batch_norm2)rG   rH   r&  rM   r%   r&   r2     s8   



		zDPTPreActResidualLayer.__init__r  rh   c                 C   sT   |}|  |}| |}| jr| |}| |}| |}| jr&| |}|| S r}   )r(  r*  r%  r.  r+  r,  r/  rG   r  residualr%   r%   r&   r|     s   





zDPTPreActResidualLayer.forward)	r   r   r   r    r2   r!   r   r|   r   r%   r%   rM   r&   r#    s    "r#  c                       s,   e Zd ZdZd fdd	Zd	ddZ  ZS )
r   a3  Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corner setting for bilinear upsample.
    Tc                    s@   t    || _tj|j|jddd| _t|| _t|| _	d S )Nr   T)r0   r   )
r1   r2   align_cornersr   rA   r)  rB   r#  residual_layer1residual_layer2)rG   rH   r2  rM   r%   r&   r2   
  s
   

zDPTFeatureFusionLayer.__init__Nc                 C   st   |d ur#|j |j krtjj||j d |j d fddd}|| | }| |}tjj|dd| jd}| |}|S )NrQ   r   rR   FrS   rT   r2  scale_factorrT   r2  )rq   r   rY   rZ   r3  r4  r2  rB   r0  r%   r%   r&   r|     s   


zDPTFeatureFusionLayer.forwardTr}   r   r%   r%   rM   r&   r      s    	
r   c                   @   s4   e Zd ZeZdZdZdZdZdZ	dZ
dZdd ZdS )DPTPreTrainedModeldptre   Tc                 C   s   t |tjtjtjfr"|jjjd| jj	d |j
dur!|j
j  nt |tjtjfr8|j
j  |jjd t |ttfrM|jj  |jj  dS dS )zInitialize the weightsr   )meanstdNg      ?)r7   r   r   rA   r  weightdatanormal_rH   initializer_ranger   zero_r   r-  fill_r   r,   rE   rF   )rG   r   r%   r%   r&   _init_weights0  s   
z DPTPreTrainedModel._init_weightsN)r   r   r   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_2_supports_flex_attn_supports_attention_backendrC  r%   r%   r%   r&   r9  %  s    r9  c                       sz   e Zd Zd fdd	Zdd Zdd Ze				dd	ejd
e	ej de	e
 de	e
 de	e
 deeef fddZ  ZS )DPTModelTc                    sj   t  | || _|jrt|| _nt|| _t|| _t	j
|j|jd| _|r,t|nd| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r1   r2   rH   r   r,   rz   r   r   encoderr   r   r6   r   	layernormDPTViTPoolerpooler	post_init)rG   rH   add_pooling_layerrM   r%   r&   r2   B  s   

zDPTModel.__init__c                 C   s   | j jr| jS | jjS r}   )rH   r   rz   r   rG   r%   r%   r&   get_input_embeddingsW  s   zDPTModel.get_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrM  r   r   r   )rG   heads_to_pruner   r   r%   r%   r&   _prune_heads]  s   zDPTModel._prune_headsNre   r   r   ry   rg   rh   c                 C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| || j j}| j||d}|s3|d n|j}| j|||||d}|d }	| 	|	}	| j
d urS| 
|	nd }
|sp|
d ur_|	|
fn|	f}||dd   |dd   S t|	|
|j|j|jdS )N)rg   r   r   r   ry   rg   r   )r(   r)   r*   r+   r   )rH   r   ry   use_return_dictget_head_maskr   rz   r   rM  rN  rP  r'   r*   r+   r   )rG   re   r   r   ry   rg   embedding_outputembedding_last_hidden_statesencoder_outputssequence_outputpooled_outputhead_outputsr%   r%   r&   r|   e  s6   	
zDPTModel.forwardr8  )NNNN)r   r   r   r2   rT  rW  r   r!   r"   r   r   r   r$   r'   r|   r   r%   r%   rM   r&   rL  @  s,    
rL  c                       s*   e Zd Zdef fddZdd Z  ZS )rO  rH   c                    s,   t    t|j|j| _t|j | _	d S r}   )
r1   r2   r   r   r6   pooler_output_sizer   r	   
pooler_act
activationr   rM   r%   r&   r2     s   
zDPTViTPooler.__init__c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   rc  )rG   r*   first_token_tensorr_  r%   r%   r&   r|     s   

zDPTViTPooler.forward)r   r   r   r   r2   r|   r   r%   r%   rM   r&   rO    s    rO  c                       s@   e Zd ZdZ fddZd	deej deej fddZ  Z	S )
DPTNecka;  
    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For DPT, it includes 2 stages:

    * DPTReassembleStage
    * DPTFeatureFusionStage.

    Args:
        config (dict): config dict.
    c              
      sz   t    || _|jd ur|jjdv rd | _nt|| _t | _	|j
D ]}| j	tj||jdddd q$t|| _d S )N)swinv2r   r   Fr0   r  r   )r1   r2   rH   r  
model_typereassemble_stager   r   r   convsr  r  rA   r)  r  fusion_stage)rG   rH   channelrM   r%   r&   r2     s   



 zDPTNeck.__init__Nr*   rh   c                    sn   t |ttfstdt|t jjkrtd jdur% |||} fddt	|D } 
|}|S )z
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        z2hidden_states should be a tuple or list of tensorszOThe number of hidden states should be equal to the number of neck hidden sizes.Nc                    s   g | ]\}} j | |qS r%   )rj  )rl   r   featurerS  r%   r&   rp     s    z#DPTNeck.forward.<locals>.<listcomp>)r7   r$   r  	TypeErrorr=   rH   r  r>   ri  r   rk  )rG   r*   r  r  rx   r   r%   rS  r&   r|     s   

zDPTNeck.forwardr  
r   r   r   r    r2   r  r!   r   r|   r   r%   r%   rM   r&   re    s    (re  c                       s:   e Zd ZdZ fddZdeej dejfddZ  Z	S )DPTDepthEstimationHeada	  
    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
    supplementary material).
    c                    s   t    || _d | _|jrtjdddddd| _|j}ttj||d ddddtj	ddd	d
tj|d dddddt
 tjddddddt
 | _d S )N   )r   r   )r   r   r  rQ   r   r   rR   Tr6      r   )r1   r2   rH   rB   add_projectionr   rA   r)  r	  Upsampler'  headrG   rH   rx   rM   r%   r&   r2     s   

zDPTDepthEstimationHead.__init__r*   rh   c                 C   sF   || j j }| jd ur| |}t |}| |}|jdd}|S )Nr   rU   )rH   head_in_indexrB   r   r'  ru  squeeze)rG   r*   predicted_depthr%   r%   r&   r|     s   


zDPTDepthEstimationHead.forwardro  r%   r%   rM   r&   rp    s    "rp  zu
    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    c                       sz   e Zd Z fddZe					ddejdeej deej dee	 dee	 d	ee	 d
e
eej ef fddZ  ZS )DPTForDepthEstimationc                    sj   t  | d | _|jdu r|jd us|jd urt|| _nt|dd| _t|| _	t
|| _|   d S NF)rR  )r1   r2   r;   r   r  r   rL  r:  re  neckrp  ru  rQ  r   rM   r%   r&   r2     s   

zDPTForDepthEstimation.__init__Nre   r   labelsr   ry   rg   rh   c                    s  d}|dur
t d|dur|n jj}|dur|n jj}|dur$|n jj} jdur: jj|||d}|j}	nF j|||d|d}|rI|j	n|d }	 jj
sa fddt|	dd D }	n|rf|jnt|d	 }
|
 fd
dt|	dd D  |
}	d\}} jjdur jj
du r|j\}}}} jjj}|| }|| } |	||}	 |	}|s|r|f|dd  }n	|f|dd  }|dur|f| S |S t|||r|j	nd|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
        >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NzTraining is not implemented yet)ry   r   TrX  r   c                        g | ]\}}| j jv r|qS r%   rH   backbone_out_indicesrl   idxrm  rS  r%   r&   rp   g      z1DPTForDepthEstimation.forward.<locals>.<listcomp>r-   c                 3   ,    | ]\}}| j jd d v r|V  qdS rQ   Nr  r  rS  r%   r&   r   l  s    z0DPTForDepthEstimation.forward.<locals>.<genexpr>r  FrQ   )lossry  r*   r+   )NotImplementedErrorrH   rY  ry   r   r;   forward_with_filtered_kwargsrk   r:  r*   r   r   r   r  extendr  rq   r4   r|  ru  r   r+   )rG   re   r   r}  r   ry   rg   r  r   r*   backbone_hidden_statesr  r  r   rv   rw   r4   ry  r   r%   rS  r&   r|     s`   .



zDPTForDepthEstimation.forward)NNNNN)r   r   r   r2   r   r!   r"   r   
LongTensorr   r   r$   r   r   r|   r   r%   r%   rM   r&   rz    s.    rz  c                       s6   e Zd Z fddZdeej dejfddZ  ZS )DPTSemanticSegmentationHeadc                    sl   t    || _|j}ttj||ddddt|t t	|j
tj||jddtjdddd	| _d S )
Nr   r   Frg  r/   rQ   rR   Tr6  )r1   r2   rH   r)  r   r	  rA   r-  r'  r   semantic_classifier_dropout
num_labelsrt  ru  rv  rM   r%   r&   r2     s   


z$DPTSemanticSegmentationHead.__init__r*   rh   c                 C   s   || j j }| |}|S r}   )rH   rw  ru  rG   r*   logitsr%   r%   r&   r|     s   
z#DPTSemanticSegmentationHead.forward)	r   r   r   r2   r  r!   r   r|   r   r%   r%   rM   r&   r    s    "r  c                       r  )DPTAuxiliaryHeadc                    sX   t    |j}ttj||ddddt|t tddtj||j	dd| _
d S )Nr   r   Frg  g?r/   )r1   r2   r)  r   r	  rA   r-  r'  r   r  ru  rv  rM   r%   r&   r2     s   


zDPTAuxiliaryHead.__init__c                 C   s   |  |}|S r}   )ru  r  r%   r%   r&   r|     s   
zDPTAuxiliaryHead.forwardr  r%   r%   rM   r&   r    s    r  c                       s   e Zd Z fddZe						ddeej deej deej dee	 dee	 d	ee	 d
e
eej ef fddZ  ZS )DPTForSemanticSegmentationc                    sN   t  | t|dd| _t|| _t|| _|jrt	|nd | _
|   d S r{  )r1   r2   rL  r:  re  r|  r  ru  use_auxiliary_headr  auxiliary_headrQ  r   rM   r%   r&   r2     s   

z#DPTForSemanticSegmentation.__init__Nre   r   r}  r   ry   rg   rh   c                    s  |dur|n j j}|dur|n j j}|dur" j jdkr"td j|||d|d}|r1|jn|d } j jsI fddt|dd D }n|rN|j	nt
|d }	|	 fd	d
t|dd D  |	} j|d} |}
d} jdur |d }d}|durtjj|
|jdd ddd}|durtjj||jdd ddd}t j jd}|||}|||}| j j|  }|s|r|
f|dd  }n	|
f|dd  }|dur|f| S |S t||
|r|jnd|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
        >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTrX  c                    r~  r%   r  r  rS  r%   r&   rp      r  z6DPTForSemanticSegmentation.forward.<locals>.<listcomp>r-   c                 3   r  r  r  r  rS  r%   r&   r     s    "z5DPTForSemanticSegmentation.forward.<locals>.<genexpr>)r*   r.   rR   Fr5  )ignore_indexrQ   )r  r  r*   r+   )rH   rY  ry   r  r>   r:  r*   r   r   r   r  r  r|  ru  r  r   rY   rZ   rq   r   semantic_loss_ignore_indexauxiliary_loss_weightr   r+   )rG   re   r   r}  r   ry   rg   r   r*   r  r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsloss_fct	main_lossauxiliary_lossr   r%   rS  r&   r|     sf    




z"DPTForSemanticSegmentation.forward)NNNNNN)r   r   r   r2   r   r   r!   r"   r  r   r   r$   r   r   r|   r   r%   r%   rM   r&   r    s0    r  )rz  r  rL  r9  )r   )Hr    collections.abcr8   dataclassesr   typingr   r   r   r!   torch.utils.checkpointr   torch.nnr   activationsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   pytorch_utilsr   r   utilsr   r   r   r   utils.backbone_utilsr   configuration_dptr   
get_loggerr   r   r   r'   Moduler,   r   r   r   floatr   r   r   r   r   r   r   r   r   r  r  r  r#  r   r9  rL  rO  re  rp  rz  r  r  r  __all__r%   r%   r%   r&   <module>   s   
c:'
?*++h=%X5) w