o
    i                     @   s"  d Z ddlZddlmZ ddlmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZmZ ddlmZmZmZmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% e&e'Z(eeddG dd deZ)eeddG dd deZ*G dd de	j+Z,G dd de	j+Z-G dd de	j+Z.	dWd e	j+d!ej/d"ej/d#ej/d$eej/ d%e0d&e0fd'd(Z1G d)d* d*e	j+Z2G d+d, d,e	j+Z3G d-d. d.e	j+Z4G d/d0 d0e	j+Z5G d1d2 d2e	j+Z6G d3d4 d4eZ7G d5d6 d6e	j+Z8G d7d8 d8e	j+Z9d9d: Z:G d;d< d<e	j+Z;G d=d> d>e	j+Z<G d?d@ d@e	j+Z=G dAdB dBe	j+Z>eG dCdD dDeZ?eG dEdF dFe?Z@G dGdH dHe	j+ZAG dIdJ dJe	j+ZBG dKdL dLe	j+ZCedMdG dNdO dOe?ZDG dPdQ dQe	j+ZEG dRdS dSe	j+ZFeG dTdU dUe?ZGg dVZHdS )XzPyTorch DPT (Dense Prediction Transformers) model.

This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.

    N)	dataclass)CallableOptional)nn)CrossEntropyLoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputDepthEstimatorOutputSemanticSegmenterOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging	torch_int)load_backbone)can_return_tuplecheck_model_inputs   )	DPTConfigz
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.:
    )custom_introc                   @   s>   e Zd ZU dZdZeej ed< dZ	ee
ejdf  ed< dS )*BaseModelOutputWithIntermediateActivationsak  
    last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_states.intermediate_activations)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   tuple r&   r&   a/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/dpt/modeling_dpt.pyr   ,   s   
 r   z
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used by the model at later stages.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dS )	4BaseModelOutputWithPoolingAndIntermediateActivationsa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) after further processing
        through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
        the classification token after processing through a linear layer and a tanh activation function. The linear
        layer weights are trained from the next sentence prediction (classification) objective during pretraining.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_statepooler_output.hidden_states
attentionsr   )r   r   r    r!   r)   r   r"   r#   r$   r*   r+   r%   r,   r   r&   r&   r&   r'   r(   ?   s   
 
r(   c                       s^   e Zd ZdZddedeeeef  f fddZddd	Z		
dde
jdedefddZ  ZS )DPTViTHybridEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    Nconfigfeature_sizec           
         sj  t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }t
|| _| jjd }t| jjdkr[tdt| jj ddg| _|d u rr|j}	|	dd  }|	d }nt|tjj	r{|n||f}| jjd }|| _|d | _|| _tj||dd| _ttdd|j| _ttd|d |j| _d S )Nr   r   r   z1Expected backbone to have 3 output features, got kernel_size)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterabler   backbonechannelslen
ValueErrorresidual_feature_map_indexbackbone_featmap_shaper   Conv2d
projection	Parameterr"   zeros	cls_tokenposition_embeddings)
selfr.   r/   r6   r7   r8   r9   num_patchesfeature_dimfeat_map_shape	__class__r&   r'   r5   _   s0   
 



 zDPTViTHybridEmbeddings.__init__r   c                 C   s   |d d d |f }|d|d f }t t|d }|d||ddddd}tjj|||fdd}|ddddd|| d}tj||gdd	}|S 
Nr         ?r   r0   r      bilinear)sizemodedim)	r   r@   reshapepermuter   
functionalinterpolater"   catrJ   posembgrid_size_heightgrid_size_widthstart_index
posemb_tokposemb_gridold_grid_sizer&   r&   r'   _resize_pos_embed   s   z(DPTViTHybridEmbeddings._resize_pos_embedFpixel_valuesinterpolate_pos_encodingreturnc              
      s   |j \}}}}|| jkrtd|s7|| jd ks || jd kr7td| d| d| jd  d| jd  d	| | j|| j || j }| |  jd } fd	d
| j	D }	| 
|ddd}
| j|dd}tj||
fdd}
|
| }
t|
|	dS )NeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r0   c                    s   g | ]} j | qS r&   )feature_maps).0indexbackbone_outputr&   r'   
<listcomp>   s    z2DPTViTHybridEmbeddings.forward.<locals>.<listcomp>rR   rV   )r   r   )shaper8   rA   r6   re   rI   r7   r>   rk   rB   rE   flatten	transposerH   expandr"   r\   r   )rJ   rf   rg   
batch_sizer8   heightwidthrI   featuresoutput_hidden_states
embeddings
cls_tokensr&   rn   r'   forward   s8   


zDPTViTHybridEmbeddings.forwardNr   )F)r   r   r    r!   r   r   r%   intr5   re   r"   Tensorboolr   r|   __classcell__r&   r&   rN   r'   r-   X   s    $
"r-   c                       s>   e Zd ZdZ fddZdddZdejdefd	d
Z	  Z
S )DPTViTEmbeddingszB
    Construct the CLS token, position and patch embeddings.

    c                    sh   t    ttdd|j| _t|| _	| j	j
}ttd|d |j| _t|j| _|| _d S )Nr   )r4   r5   r   rF   r"   rG   r9   rH   DPTViTPatchEmbeddingspatch_embeddingsrK   rI   Dropouthidden_dropout_probdropoutr.   )rJ   r.   rK   rN   r&   r'   r5      s   


zDPTViTEmbeddings.__init__r   c                 C   s   |d d d |f }|d|d f }t |dd }|d||ddddd}tjj|||fdd}|ddddd|| d}tj||gdd	}|S rP   )	r   rT   rX   rY   r   rZ   r[   r"   r\   r]   r&   r&   r'   re      s   z"DPTViTEmbeddings._resize_pos_embedrf   rh   c                 C   s   |j \}}}}| jj}| | j|| || }| |}| \}}	}
| j|dd}t	j
||fdd}|| }| |}t|dS )Nr0   r   rV   )r   )rq   r.   r7   re   rI   r   rT   rH   rt   r"   r\   r   r   )rJ   rf   ru   r8   rv   rw   r7   rI   rz   seq_len_r{   r&   r&   r'   r|      s   


zDPTViTEmbeddings.forwardr~   )r   r   r    r!   r5   re   r"   r   r   r|   r   r&   r&   rN   r'   r      s
    

r   c                       <   e Zd ZdZdef fddZdejdejfddZ  Z	S )	r   z$
    Image to Patch Embedding.

    r.   c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )r3   stride)r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   rK   r   rD   rE   )rJ   r.   r6   r7   r8   r9   rK   rN   r&   r'   r5      s   
 zDPTViTPatchEmbeddings.__init__rf   rh   c                 C   s<   |j \}}}}|| jkrtd| |ddd}|S )Nri   rR   r   )rq   r8   rA   rE   rr   rs   )rJ   rf   ru   r8   rv   rw   rz   r&   r&   r'   r|     s   
zDPTViTPatchEmbeddings.forward
r   r   r    r!   r   r5   r"   r   r|   r   r&   r&   rN   r'   r      s    r           modulequerykeyvalueattention_maskscalingr   c           
      K   s|   t ||dd| }tjj|dt jd|j}tjj	||| j
d}|d ur,|| }t ||}	|	dd }	|	|fS )Nr0   r1   )rW   dtype)ptrainingr   rR   )r"   matmulrs   r   rZ   softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr&   r&   r'   eager_attention_forward  s   r   c                	       sP   e Zd Zdef fddZ	d
dejdeej deejejf fdd	Z	  Z
S )DPTSelfAttentionr.   c                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      F)bias)r4   r5   r9   num_attention_headshasattrrA   r.   r   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   rJ   r.   rN   r&   r'   r5   /  s"   

zDPTSelfAttention.__init__Nr+   	head_maskrh   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t}| j	j
dkr?t| j	j
 }|| ||||| j| j| jsNdn| jd\}	}
|	 d d | jf }|	|}	|	|
fS )	Nr   r0   r   rR   eagerr   )r   r   r   r1   )rq   r   r   r   viewrs   r   r   r   r.   _attn_implementationr   r   r   r   r   rT   r   rX   )rJ   r+   r   ru   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper&   r&   r'   r|   C  s*   


zDPTSelfAttention.forwardr}   )r   r   r    r   r5   r"   r   r   r%   r|   r   r&   r&   rN   r'   r   .  s    r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
DPTViTSelfOutputz
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r.   c                    s.   t    t|j|j| _t|j| _d S r}   )	r4   r5   r   r   r9   denser   r   r   r   rN   r&   r'   r5   i     
zDPTViTSelfOutput.__init__r+   input_tensorrh   c                 C      |  |}| |}|S r}   r   r   rJ   r+   r   r&   r&   r'   r|   n     

zDPTViTSelfOutput.forwardr   r&   r&   rN   r'   r   c  s    $r   c                       sV   e Zd Zdef fddZdee fddZddej	d	e
ej	 d
ej	fddZ  ZS )DPTViTAttentionr.   c                    s*   t    t|| _t|| _t | _d S r}   )r4   r5   r   	attentionr   outputsetpruned_headsr   rN   r&   r'   r5   v  s   


zDPTViTAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rV   )r@   r   r   r   r   r   r   r   r   r   r   r   r   union)rJ   r   rm   r&   r&   r'   prune_heads|  s   zDPTViTAttention.prune_headsNr+   r   rh   c                 C   s    |  ||\}}| ||}|S r}   )r   r   )rJ   r+   r   self_attn_outputr   r   r&   r&   r'   r|     s   zDPTViTAttention.forwardr}   )r   r   r    r   r5   r   r   r   r"   r   r   r|   r   r&   r&   rN   r'   r   u  s    *r   c                       8   e Zd Zdef fddZdejdejfddZ  ZS )DPTViTIntermediater.   c                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r}   )r4   r5   r   r   r9   intermediate_sizer   r:   
hidden_actstrr   intermediate_act_fnr   rN   r&   r'   r5     s
   
zDPTViTIntermediate.__init__r+   rh   c                 C   r   r}   )r   r   )rJ   r+   r&   r&   r'   r|     r   zDPTViTIntermediate.forward	r   r   r    r   r5   r"   r   r|   r   r&   r&   rN   r'   r     s    r   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	DPTViTOutputr.   c                    s.   t    t|j|j| _t|j| _	d S r}   )
r4   r5   r   r   r   r9   r   r   r   r   r   rN   r&   r'   r5     r   zDPTViTOutput.__init__r+   r   rh   c                 C   s    |  |}| |}|| }|S r}   r   r   r&   r&   r'   r|     s   

zDPTViTOutput.forwardr   r&   r&   rN   r'   r     s    $r   c                       sH   e Zd ZdZdef fddZddejdeej dejfd	d
Z	  Z
S )DPTViTLayerz?This corresponds to the Block class in the timm implementation.r.   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r4   r5   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr9   layer_norm_epslayernorm_beforelayernorm_afterr   rN   r&   r'   r5     s   



zDPTViTLayer.__init__Nr+   r   rh   c                 C   sB   |  |}| ||}|| }| |}| |}| ||}|S r}   )r   r   r   r   r   )rJ   r+   r   hidden_states_normattention_outputlayer_outputr&   r&   r'   r|     s   


zDPTViTLayer.forwardr}   )r   r   r    r!   r   r5   r"   r   r   r|   r   r&   r&   rN   r'   r     s    *
r   c                	       sH   e Zd Zdef fddZ	ddejdeej ded	e	fd
dZ
  ZS )DPTViTEncoderr.   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r&   )r   )rl   r   r.   r&   r'   rp     s    z*DPTViTEncoder.__init__.<locals>.<listcomp>F)	r4   r5   r.   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   rN   r   r'   r5     s   
 
zDPTViTEncoder.__init__NFr+   r   ry   rh   c                 C   sj   |r|gnd }t | jD ]\}}|d ur|| nd }|||}|r&|| qt||r1t|dS d dS )N)r)   r+   )	enumerater   appendr
   r%   )rJ   r+   r   ry   all_hidden_statesilayer_modulelayer_head_maskr&   r&   r'   r|     s   


zDPTViTEncoder.forwardNF)r   r   r    r   r5   r"   r   r   r   r
   r|   r   r&   r&   rN   r'   r     s    r   c                       sP   e Zd ZdZ fddZdd Zdd Zdd	eej	 d
eej	 fddZ
  ZS )DPTReassembleStagea@  
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                    sB   t    || _t | _|jr| | n| | |j	| _	d S r}   )
r4   r5   r.   r   r   layers	is_hybrid_init_reassemble_dpt_hybrid_init_reassemble_dptneck_ignore_stagesr   rN   r&   r'   r5     s   


zDPTReassembleStage.__init__c              	   C   s   t tt|j|jD ]#\}}|dkr| jt  q|dkr.| jt	||j| |d q|j
dkr=td|j
 dt | _t|}tt|jD ])}|dkr_| jtt  qM|dkrv| jttd| |t|j  qMdS )a   "
        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
        for more details.
        r   r?   factorprojectzReadout type z! is not supported for DPT-Hybrid.rR   N)zipr   r@   neck_hidden_sizesreassemble_factorsr   r   r   IdentityDPTReassembleLayerreadout_typerA   r   readout_projects_get_backbone_hidden_size
Sequentialr   r   r   )rJ   r.   r   r   r9   r&   r&   r'   r     s&   

z.DPTReassembleStage._init_reassemble_dpt_hybridc              	   C   s   t tt|j|jD ]\}}| jt||j| |d q|jdkrIt	
 | _t|}tt|jD ]}| jt	t	d| |t|j  q3d S d S )Nr   r   rR   )r   r   r@   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )rJ   r.   r   r   r9   r   r&   r&   r'   r     s   

z'DPTReassembleStage._init_reassemble_dptNr+   rh   c                 C   sL  g }t |D ]\}}|| jvr|dddf |ddddf }}|j\}}	}
|dur9|dur9|||||
}nt|	d }|||||
}|dddd }|j}| jjdkr|	dd}|
d|}| j| t||fd	}|ddd|}n| jjd
kr|	d|
d	 }||}| j| |}|| q|S )z
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        Nr   r   rQ   r   rR   r   )r   rR   r   r0   add)r   r   rq   rX   r   rY   r   r.   r   rr   	unsqueeze	expand_asr   r"   r\   r   r   )rJ   r+   patch_heightpatch_widthoutr   hidden_staterH   ru   sequence_lengthr8   rT   feature_shapereadoutr&   r&   r'   r|   +  s,   
&
zDPTReassembleStage.forwardNN)r   r   r    r!   r5   r   r   listr"   r   r|   r   r&   r&   rN   r'   r     s    (r   c                 C   s"   | j d ur| jdu r| j jS | jS r   )backbone_configr   r9   r   r&   r&   r'   r   Q  s   r   c                       s2   e Zd Zdededef fddZdd Z  ZS )r   r.   r?   r   c                    s   t    t|}tj||dd| _|dkr#tj||||dd| _d S |dkr.t | _d S |dk rCtj||dt	d| dd| _d S d S )Nr   )in_channelsout_channelsr3   r   r3   r   paddingr   )
r4   r5   r   r   rD   rE   ConvTranspose2dresizer   r   )rJ   r.   r?   r   r9   rN   r&   r'   r5   Y  s   
"zDPTReassembleLayer.__init__c                 C   r   r}   )rE   r  )rJ   r  r&   r&   r'   r|   h  r   zDPTReassembleLayer.forward)r   r   r    r   r   r5   r|   r   r&   r&   rN   r'   r   X  s    r   c                       s*   e Zd Zdef fddZdd Z  ZS )DPTFeatureFusionStager.   c                    s<   t    t | _tt|jD ]
}| jt	| qd S r}   )
r4   r5   r   r   r   r   r@   r   r   DPTFeatureFusionLayer)rJ   r.   r   rN   r&   r'   r5   o  s
   

zDPTFeatureFusionStage.__init__c                 C   sV   |d d d }g }d }t || jD ]\}}|d u r||}n|||}|| q|S )Nr0   )r   r   r   )rJ   r+   fused_hidden_statesfused_hidden_stater  r   r&   r&   r'   r|   u  s   

zDPTFeatureFusionStage.forward)r   r   r    r   r5   r|   r   r&   r&   rN   r'   r  n  s    r  c                       r   )	DPTPreActResidualLayerz
    ResidualConvUnit, pre-activate residual unit.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    r.   c                    s   t    |j| _|jd ur|jn| j }t | _tj|j	|j	ddd|d| _
t | _tj|j	|j	ddd|d| _| jrNt|j	| _t|j	| _d S d S )Nr   r   )r3   r   r  r   )r4   r5   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr   ReLUactivation1rD   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm1batch_norm2)rJ   r.   r  rN   r&   r'   r5     s8   



		zDPTPreActResidualLayer.__init__r  rh   c                 C   sT   |}|  |}| |}| jr| |}| |}| |}| jr&| |}|| S r}   )r  r  r  r#  r   r!  r$  rJ   r  residualr&   r&   r'   r|     s   





zDPTPreActResidualLayer.forwardr   r&   r&   rN   r'   r    s    "r  c                       sN   e Zd ZdZddedef fddZddejd	e	ej d
ejfddZ
  ZS )r  a3  Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corner setting for bilinear upsample.
    Tr.   align_cornersc                    s@   t    || _tj|j|jddd| _t|| _t|| _	d S )Nr   T)r3   r   )
r4   r5   r'  r   rD   r  rE   r  residual_layer1residual_layer2)rJ   r.   r'  rN   r&   r'   r5     s
   

zDPTFeatureFusionLayer.__init__Nr  r&  rh   c                 C   st   |d ur#|j |j krtjj||j d |j d fddd}|| | }| |}tjj|dd| jd}| |}|S )NrR   r   rS   FrT   rU   r'  scale_factorrU   r'  )rq   r   rZ   r[   r(  r)  r'  rE   r%  r&   r&   r'   r|     s   


zDPTFeatureFusionLayer.forwardTr}   )r   r   r    r!   r   r   r5   r"   r   r   r|   r   r&   r&   rN   r'   r    s    	*
r  c                   @   sB   e Zd ZU eed< dZdZdZdZdZ	dZ
dZdeiZdd ZdS )	DPTPreTrainedModelr.   dptrf   Tr,   c                 C   s   t |tjtjtjfr"|jjjd| jj	d |j
dur!|j
j  nt |tjtjfr8|j
j  |jjd t |ttfrM|jj  |jj  dS dS )zInitialize the weightsr   )meanstdNg      ?)r:   r   r   rD   r  weightdatanormal_r.   initializer_ranger   zero_r   r"  fill_r   r-   rH   rI   )rJ   r   r&   r&   r'   _init_weights  s   
z DPTPreTrainedModel._init_weightsN)r   r   r    r   r$   base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   _can_record_outputsr8  r&   r&   r&   r'   r.    s   
 r.  c                       sr   e Zd Zddedef fddZdd Zdd	 Zed
de			dde
jdee
j dee defddZ  ZS )DPTModelTr.   add_pooling_layerc                    sj   t  | || _|jrt|| _nt|| _t|| _t	j
|j|jd| _|r,t|nd| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r4   r5   r.   r   r-   rz   r   r   encoderr   r   r9   r   	layernormDPTViTPoolerpooler	post_init)rJ   r.   rB  rN   r&   r'   r5     s   

zDPTModel.__init__c                 C   s   | j jr| jS | jjS r}   )r.   r   rz   r   rJ   r&   r&   r'   get_input_embeddings  s   zDPTModel.get_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrC  r   r   r   )rJ   heads_to_pruner   r   r&   r&   r'   _prune_heads#  s   zDPTModel._prune_headsF)tie_last_hidden_statesNrf   r   ry   rh   c           
      K   s|   |d u r| j j}| || j j}| |}|j}| j|||d}|j}| |}| j	d ur2| 	|nd }	t
||	|j|jdS )Nr   ry   )r)   r*   r   r+   )r.   ry   get_head_maskr   rz   r   rC  r)   rD  rF  r(   r   r+   )
rJ   rf   r   ry   r   embedding_outputembedding_last_hidden_statesencoder_outputssequence_outputpooled_outputr&   r&   r'   r|   +  s"   	

zDPTModel.forwardr-  r  )r   r   r    r   r   r5   rI  rL  r   r   r"   r#   r   r(   r|   r   r&   r&   rN   r'   rA    s"    rA  c                       r   )rE  r.   c                    s,   t    t|j|j| _t|j | _	d S r}   )
r4   r5   r   r   r9   pooler_output_sizer   r   
pooler_act
activationr   rN   r&   r'   r5   S  s   
zDPTViTPooler.__init__r+   rh   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   rW  )rJ   r+   first_token_tensorrT  r&   r&   r'   r|   X  s   

zDPTViTPooler.forwardr   r&   r&   rN   r'   rE  R  s    rE  c                
       sZ   e Zd ZdZdef fddZ		ddeej de	e
 de	e
 d	eej fd
dZ  ZS )DPTNecka;  
    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For DPT, it includes 2 stages:

    * DPTReassembleStage
    * DPTFeatureFusionStage.

    Args:
        config (dict): config dict.
    r.   c              
      sz   t    || _|jd ur|jjdkrd | _nt|| _t | _	|j
D ]}| j	tj||jdddd q$t|| _d S )Nswinv2r   r   Fr3   r  r   )r4   r5   r.   r  
model_typereassemble_stager   r   r   convsr   r   rD   r  r  fusion_stage)rJ   r.   channelrN   r&   r'   r5   m  s   



 zDPTNeck.__init__Nr+   r  r  rh   c                    sn   t |ttfstdt|t jjkrtd jdur% |||} fddt	|D } 
|}|S )z
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        z2hidden_states should be a tuple or list of tensorszOThe number of hidden states should be equal to the number of neck hidden sizes.Nc                    s   g | ]\}} j | |qS r&   )r^  )rl   r   featurerH  r&   r'   rp     s    z#DPTNeck.forward.<locals>.<listcomp>)r:   r%   r  	TypeErrorr@   r.   r   rA   r]  r   r_  )rJ   r+   r  r  rx   r   r&   rH  r'   r|   ~  s   

zDPTNeck.forwardr  )r   r   r    r!   r   r5   r  r"   r   r   r   r|   r   r&   r&   rN   r'   rY  a  s    rY  c                       s@   e Zd ZdZdef fddZdeej dejfddZ	  Z
S )	DPTDepthEstimationHeada	  
    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
    supplementary material).
    r.   c                    s   t    || _d | _|jrtjdddddd| _|j}ttj||d ddddtj	ddd	d
tj|d dddddt
 tjddddddt
 | _d S )N   )r   r   )r   r   r  rR   r   r   rS   Tr+      r   )r4   r5   r.   rE   add_projectionr   rD   r  r   Upsampler  headrJ   r.   rx   rN   r&   r'   r5     s   

zDPTDepthEstimationHead.__init__r+   rh   c                 C   sF   || j j }| jd ur| |}t |}| |}|jdd}|S )Nr   rV   )r.   head_in_indexrE   r   r  rh  squeeze)rJ   r+   predicted_depthr&   r&   r'   r|     s   


zDPTDepthEstimationHead.forward)r   r   r    r!   r   r5   r  r"   r   r|   r   r&   r&   rN   r'   rc    s    "rc  zu
    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    c                       s\   e Zd Z fddZee			ddejdeej deej	 dee
 def
d	d
Z  ZS )DPTForDepthEstimationc                    sj   t  | d | _|jdu r|jd us|jd urt|| _nt|dd| _t|| _	t
|| _|   d S NF)rB  )r4   r5   r>   r   r  r   rA  r/  rY  neckrc  rh  rG  r   rN   r&   r'   r5     s   

zDPTForDepthEstimation.__init__Nrf   r   labelsry   rh   c                    s>  |du r j j}d}|durtd jdur' jj|fddi|}|j}n: j|f|dd|}|j} j jsJ fddt	|dd D }n|j
}	|	 fd	d
t	|dd D  |	}d\}
} j jdur j jdu r|j\}}}} j jj}|| }
|| } ||
|} |}t|||r|jnd|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
        >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NzTraining is not implemented yetry   TrN  c                        g | ]\}}| j jv r|qS r&   r.   backbone_out_indicesrl   idxra  rH  r&   r'   rp         z1DPTForDepthEstimation.forward.<locals>.<listcomp>r   c                 3   ,    | ]\}}| j jd d v r|V  qdS rR   Nrr  rt  rH  r&   r'   	<genexpr>  s    z0DPTForDepthEstimation.forward.<locals>.<genexpr>r  F)lossrl  r+   r,   )r.   ry   NotImplementedErrorr>   forward_with_filtered_kwargsrk   r/  r+   r   r   r   extendr  rq   r7   ro  rh  r   r,   )rJ   rf   r   rp  ry   r   rz  outputsr+   backbone_hidden_statesr  r  r   rv   rw   r7   rl  r&   rH  r'   r|     sB   /



zDPTForDepthEstimation.forward)NNN)r   r   r    r5   r   r   r"   r#   r   
LongTensorr   r   r|   r   r&   r&   rN   r'   rm    s$    rm  c                       s<   e Zd Zdef fddZdeej dejfddZ  Z	S )DPTSemanticSegmentationHeadr.   c                    sl   t    || _|j}ttj||ddddt|t t	|j
tj||jddtjdddd	| _d S )
Nr   r   Fr[  r2   rR   rS   Tr+  )r4   r5   r.   r  r   r   rD   r"  r  r   semantic_classifier_dropout
num_labelsrg  rh  ri  rN   r&   r'   r5   9  s   


z$DPTSemanticSegmentationHead.__init__r+   rh   c                 C   s   || j j }| |}|S r}   )r.   rj  rh  rJ   r+   logitsr&   r&   r'   r|   G  s   
z#DPTSemanticSegmentationHead.forward)
r   r   r    r   r5   r  r"   r   r|   r   r&   r&   rN   r'   r  8  s    "r  c                       r   )DPTAuxiliaryHeadr.   c                    sX   t    |j}ttj||ddddt|t tddtj||j	dd| _
d S )Nr   r   Fr[  g?r2   )r4   r5   r  r   r   rD   r"  r  r   r  rh  ri  rN   r&   r'   r5   O  s   


zDPTAuxiliaryHead.__init__r+   rh   c                 C   s   |  |}|S r}   )rh  r  r&   r&   r'   r|   [  s   
zDPTAuxiliaryHead.forwardr   r&   r&   rN   r'   r  N  s    r  c                       sh   e Zd Zdef fddZee				ddeej	 deej	 deej
 dee d	ef
d
dZ  ZS )DPTForSemanticSegmentationr.   c                    sN   t  | t|dd| _t|| _t|| _|jrt	|nd | _
|   d S rn  )r4   r5   rA  r/  rY  ro  r  rh  use_auxiliary_headr  auxiliary_headrG  r   rN   r&   r'   r5   b  s   

z#DPTForSemanticSegmentation.__init__Nrf   r   rp  ry   rh   c                    sp  |du r j j}|dur j jdkrtd j|f|dd|}|j} j js9 fddt|dd D }n|j}|	 fdd	t|dd D  |} j
|d
} |}	d}
 jduri |d }
d}|durtjj|	|jdd ddd}|
durtjj|
|jdd ddd}t j jd}|||}|||}| j j|  }t||	|r|jnd|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
        >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTrN  c                    rq  r&   rr  rt  rH  r&   r'   rp     rv  z6DPTForSemanticSegmentation.forward.<locals>.<listcomp>c                 3   rw  rx  rr  rt  rH  r&   r'   ry    s    "z5DPTForSemanticSegmentation.forward.<locals>.<genexpr>)r+   r0   r1   rS   Fr*  )ignore_index)rz  r  r+   r,   )r.   ry   r  rA   r/  r+   r   r   r   r}  ro  rh  r  r   rZ   r[   rq   r   semantic_loss_ignore_indexauxiliary_loss_weightr   r,   )rJ   rf   r   rp  ry   r   r~  r+   r  r  auxiliary_logitsrz  upsampled_logitsupsampled_auxiliary_logitsloss_fct	main_lossauxiliary_lossr&   rH  r'   r|   q  sZ    




z"DPTForSemanticSegmentation.forward)NNNN)r   r   r    r   r5   r   r   r   r"   r#   r  r   r   r|   r   r&   r&   rN   r'   r  `  s&    r  )rm  r  rA  r.  )r   )Ir!   collections.abcr;   dataclassesr   typingr   r   r"   r   torch.nnr   activationsr   modeling_layersr	   modeling_outputsr
   r   r   modeling_utilsr   r   pytorch_utilsr   r   utilsr   r   r   r   utils.backbone_utilsr   utils.genericr   r   configuration_dptr   
get_loggerr   loggerr   r(   Moduler-   r   r   r   floatr   r   r   r   r   r   r   r   r   r   r   r  r  r  r.  rA  rE  rY  rc  rm  r  r  r  __all__r&   r&   r&   r'   <module>   s   
`7'
5 h=%K:(ph