o
    ei                     @   sD  d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( e )e*Z+eeddG dd deZ,eeddG dd deZ-G dd dej.Z/G dd dej.Z0G dd  d ej.Z1		!dZd"ej.d#ej2d$ej2d%ej2d&ej2dB d'e3dB d(e3d)ee fd*d+Z4G d,d- d-ej.Z5G d.d/ d/ej.Z6G d0d1 d1ej.Z7G d2d3 d3ej.Z8G d4d5 d5ej.Z9G d6d7 d7eZ:G d8d9 d9ej.Z;G d:d; d;ej.Z<d<d= Z=G d>d? d?ej.Z>G d@dA dAej.Z?G dBdC dCej.Z@G dDdE dEej.ZAeG dFdG dGeZBeG dHdI dIeBZCG dJdK dKej.ZDG dLdM dMej.ZEG dNdO dOej.ZFedPdG dQdR dReBZGG dSdT dTej.ZHG dUdV dVej.ZIeG dWdX dXeBZJg dYZKdS )[zPyTorch DPT (Dense Prediction Transformers) model.

This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.

    N)Callable)	dataclass)nn)CrossEntropyLoss   )initialization)ACT2FN)load_backbone)GradientCheckpointingLayer)BaseModelOutputDepthEstimatorOutputSemanticSegmenterOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )	DPTConfigz
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.:
    )custom_introc                   @   s>   e Zd ZU dZdZejdB ed< dZe	ejdf dB ed< dS )*BaseModelOutputWithIntermediateActivationsak  
    last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_states.intermediate_activations)
__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   tuple r'   r'   b/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/dpt/modeling_dpt.pyr   -   s   
 r   z
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used by the model at later stages.
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ejdf dB ed< dZe
ejdf dB ed< dZe
ejdf dB ed< dS )	4BaseModelOutputWithPoolingAndIntermediateActivationsa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) after further processing
        through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
        the classification token after processing through a linear layer and a tanh activation function. The linear
        layer weights are trained from the next sentence prediction (classification) objective during pretraining.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_statepooler_output.hidden_states
attentionsr   )r   r    r!   r"   r*   r#   r$   r%   r+   r,   r&   r-   r   r'   r'   r'   r(   r)   @   s   
 
r)   c                       s^   e Zd ZdZddedeeef dB f fddZddd	Z	
dde	j
dedefddZ  ZS )DPTViTHybridEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    Nconfigfeature_sizec           
         sj  t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }t
|| _| jjd }t| jjdkr[tdt| jj ddg| _|d u rr|j}	|	dd  }|	d }nt|tjj	r{|n||f}| jjd }|| _|d | _|| _tj||dd| _ttdd|j| _ttd|d |j| _d S )Nr   r   r   z1Expected backbone to have 3 output features, got kernel_size)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterabler	   backbonechannelslen
ValueErrorresidual_feature_map_indexbackbone_featmap_shaper   Conv2d
projection	Parameterr#   zeros	cls_tokenposition_embeddings)
selfr/   r0   r7   r8   r9   r:   num_patchesfeature_dimfeat_map_shape	__class__r'   r(   r6   `   s0   
 



 zDPTViTHybridEmbeddings.__init__r   c                 C   s   |d d d |f }|d|d f }t t|d }|d||ddddd}tjj|||fdd}|ddddd|| d}tj||gdd	}|S 
Nr         ?r   r1   r      bilinear)sizemodedim)	r   rA   reshapepermuter   
functionalinterpolater#   catrK   posembgrid_size_heightgrid_size_widthstart_index
posemb_tokposemb_gridold_grid_sizer'   r'   r(   _resize_pos_embed   s   z(DPTViTHybridEmbeddings._resize_pos_embedFpixel_valuesinterpolate_pos_encodingreturnc              
      s   |j \}}}}|| jkrtd|s7|| jd ks || jd kr7td| d| d| jd  d| jd  d	| | j|| j || j }| |  jd } fd	d
| j	D }	| 
|ddd}
| j|dd}tj||
fdd}
|
| }
t|
|	dS )NeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r1   c                    s   g | ]} j | qS r'   )feature_maps).0indexbackbone_outputr'   r(   
<listcomp>   s    z2DPTViTHybridEmbeddings.forward.<locals>.<listcomp>rS   rW   )r   r   )shaper9   rB   r7   rf   rJ   r8   r?   rl   rC   rF   flatten	transposerI   expandr#   r]   r   )rK   rg   rh   
batch_sizer9   heightwidthrJ   featuresoutput_hidden_states
embeddings
cls_tokensr'   ro   r(   forward   s8   


zDPTViTHybridEmbeddings.forwardNr   F)r   r    r!   r"   r   r&   intr6   rf   r#   Tensorboolr   r}   __classcell__r'   r'   rO   r(   r.   Y   s    $
"r.   c                       s>   e Zd ZdZ fddZdddZdejdefd	d
Z	  Z
S )DPTViTEmbeddingszB
    Construct the CLS token, position and patch embeddings.

    c                    sh   t    ttdd|j| _t|| _	| j	j
}ttd|d |j| _t|j| _|| _d S )Nr   )r5   r6   r   rG   r#   rH   r:   rI   DPTViTPatchEmbeddingspatch_embeddingsrL   rJ   Dropouthidden_dropout_probdropoutr/   )rK   r/   rL   rO   r'   r(   r6      s   


zDPTViTEmbeddings.__init__r   c                 C   s   |d d d |f }|d|d f }t |dd }|d||ddddd}tjj|||fdd}|ddddd|| d}tj||gdd	}|S rQ   )	r   rU   rY   rZ   r   r[   r\   r#   r]   r^   r'   r'   r(   rf      s   z"DPTViTEmbeddings._resize_pos_embedrg   ri   c                 C   s   |j \}}}}| jj}| | j|| || }| |}| \}}	}
| j|dd}t	j
||fdd}|| }| |}t|dS )Nr1   r   rW   )r   )rr   r/   r8   rf   rJ   r   rU   rI   ru   r#   r]   r   r   )rK   rg   rv   r9   rw   rx   r8   rJ   r{   seq_len_r|   r'   r'   r(   r}      s   


zDPTViTEmbeddings.forwardr   )r   r    r!   r"   r6   rf   r#   r   r   r}   r   r'   r'   rO   r(   r      s
    

r   c                       <   e Zd ZdZdef fddZdejdejfddZ  Z	S )	r   z$
    Image to Patch Embedding.

    r/   c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )r4   stride)r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   rL   r   rE   rF   )rK   r/   r7   r8   r9   r:   rL   rO   r'   r(   r6      s   
 zDPTViTPatchEmbeddings.__init__rg   ri   c                 C   s<   |j \}}}}|| jkrtd| |ddd}|S )Nrj   rS   r   )rr   r9   rB   rF   rs   rt   )rK   rg   rv   r9   rw   rx   r{   r'   r'   r(   r}     s   
zDPTViTPatchEmbeddings.forward
r   r    r!   r"   r   r6   r#   r   r}   r   r'   r'   rO   r(   r      s    r           modulequerykeyvalueattention_maskscalingr   kwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )Nr1         rS   r   rW   )ptrainingr   )
rU   r#   matmulrt   r   r[   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputr'   r'   r(   eager_attention_forward  s   
r   c                       sB   e Zd Zdef fddZdejdeejejf fddZ  Z	S )DPTSelfAttentionr/   c                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .r   F)bias)r5   r6   r:   num_attention_headshasattrrB   r/   r   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   rK   r/   rO   r'   r(   r6   .  s"   

zDPTSelfAttention.__init__r,   ri   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t	| j
jt}|| |||d | j| j| jsHdn| jd\}}	| d d | jf }
||
}||	fS )Nr   r1   r   rS   r   )r   r   r   r2   )rr   r   r   r   viewrt   r   r   r   get_interfacer/   _attn_implementationr   r   r   r   r   rU   r   rY   )rK   r,   rv   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper'   r'   r(   r}   B  s*   


zDPTSelfAttention.forward)
r   r    r!   r   r6   r#   r   r&   r}   r   r'   r'   rO   r(   r   -  s    (r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
DPTViTSelfOutputz
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r/   c                    s.   t    t|j|j| _t|j| _d S r~   )	r5   r6   r   r   r:   denser   r   r   r   rO   r'   r(   r6   f     
zDPTViTSelfOutput.__init__r,   input_tensorri   c                 C      |  |}| |}|S r~   r   r   rK   r,   r   r'   r'   r(   r}   k     

zDPTViTSelfOutput.forwardr   r'   r'   rO   r(   r   `  s    $r   c                       8   e Zd Zdef fddZdejdejfddZ  ZS )DPTViTAttentionr/   c                    s"   t    t|| _t|| _d S r~   )r5   r6   r   	attentionr   outputr   rO   r'   r(   r6   s  s   

zDPTViTAttention.__init__r,   ri   c                 C   s   |  |\}}| ||}|S r~   )r   r   )rK   r,   self_attn_outputr   r   r'   r'   r(   r}   x  s   zDPTViTAttention.forward	r   r    r!   r   r6   r#   r   r}   r   r'   r'   rO   r(   r   r      r   c                       r   )DPTViTIntermediater/   c                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r~   )r5   r6   r   r   r:   intermediate_sizer   r;   
hidden_actstrr   intermediate_act_fnr   rO   r'   r(   r6     s
   
zDPTViTIntermediate.__init__r,   ri   c                 C   r   r~   )r   r   )rK   r,   r'   r'   r(   r}     r   zDPTViTIntermediate.forwardr   r'   r'   rO   r(   r     s    r   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	DPTViTOutputr/   c                    s.   t    t|j|j| _t|j| _	d S r~   )
r5   r6   r   r   r   r:   r   r   r   r   r   rO   r'   r(   r6     r   zDPTViTOutput.__init__r,   r   ri   c                 C   s    |  |}| |}|| }|S r~   r   r   r'   r'   r(   r}     s   

zDPTViTOutput.forwardr   r'   r'   rO   r(   r     s    $r   c                       r   )	DPTViTLayerz?This corresponds to the Block class in the timm implementation.r/   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r5   r6   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr:   layer_norm_epslayernorm_beforelayernorm_afterr   rO   r'   r(   r6     s   



zDPTViTLayer.__init__r,   ri   c                 C   s@   |  |}| |}|| }| |}| |}| ||}|S r~   )r   r   r   r   r   )rK   r,   hidden_states_normattention_outputlayer_outputr'   r'   r(   r}     s   



zDPTViTLayer.forwardr   r'   r'   rO   r(   r     s    
r   c                       s<   e Zd Zdef fddZd
dejdedefdd	Z	  Z
S )DPTViTEncoderr/   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r'   )r   )rm   r   r/   r'   r(   rq     s    z*DPTViTEncoder.__init__.<locals>.<listcomp>F)	r5   r6   r/   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   rO   r   r(   r6     s   
 
zDPTViTEncoder.__init__Fr,   rz   ri   c                 C   sT   |r|gnd }t | jD ]\}}||}|r|| qt||r&t|dS d dS )N)r*   r,   )	enumerater   appendr   r&   )rK   r,   rz   all_hidden_statesilayer_moduler'   r'   r(   r}     s   

zDPTViTEncoder.forwardr   )r   r    r!   r   r6   r#   r   r   r   r}   r   r'   r'   rO   r(   r     s    "r   c                       sP   e Zd ZdZ fddZdd Zdd Zdd	eej	 d
eej	 fddZ
  ZS )DPTReassembleStagea@  
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                    sB   t    || _t | _|jr| | n| | |j	| _	d S r~   )
r5   r6   r/   r   r   layers	is_hybrid_init_reassemble_dpt_hybrid_init_reassemble_dptneck_ignore_stagesr   rO   r'   r(   r6     s   


zDPTReassembleStage.__init__c              	   C   s   t tt|j|jD ]#\}}|dkr| jt  q|dkr.| jt	||j| |d q|j
dkr=td|j
 dt | _t|}tt|jD ])}|dkr_| jtt  qM|dkrv| jttd| |t|j  qMdS )a   "
        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
        for more details.
        r   r@   factorprojectzReadout type z! is not supported for DPT-Hybrid.rS   N)zipr   rA   neck_hidden_sizesreassemble_factorsr   r   r   IdentityDPTReassembleLayerreadout_typerB   r   readout_projects_get_backbone_hidden_size
Sequentialr   r   r   )rK   r/   r   r   r:   r'   r'   r(   r     s&   

z.DPTReassembleStage._init_reassemble_dpt_hybridc              	   C   s   t tt|j|jD ]\}}| jt||j| |d q|jdkrIt	
 | _t|}tt|jD ]}| jt	t	d| |t|j  q3d S d S )Nr   r   rS   )r   r   rA   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )rK   r/   r   r   r:   r   r'   r'   r(   r     s   

z'DPTReassembleStage._init_reassemble_dptNr,   ri   c                 C   sL  g }t |D ]\}}|| jvr|dddf |ddddf }}|j\}}	}
|dur9|dur9|||||
}nt|	d }|||||
}|dddd }|j}| jjdkr|	dd}|
d|}| j| t||fd	}|ddd|}n| jjd
kr|	d|
d	 }||}| j| |}|| q|S )z
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        Nr   r   rR   r   rS   r   )r   rS   r   r1   add)r   r   rr   rY   r   rZ   r   r/   r   rs   	unsqueeze	expand_asr   r#   r]   r   r   )rK   r,   patch_heightpatch_widthoutr   hidden_staterI   rv   sequence_lengthr9   rU   feature_shapereadoutr'   r'   r(   r}     s,   
&
zDPTReassembleStage.forwardNN)r   r    r!   r"   r6   r   r   listr#   r   r}   r   r'   r'   rO   r(   r     s    (r   c                 C   s$   | j d urt| j dr| j jS | jS )Nr:   )backbone_configr   r:   r   r'   r'   r(   r   8  s   r   c                       s2   e Zd Zdededef fddZdd Z  ZS )r   r/   r@   r   c                    s   t    t|}tj||dd| _|dkr#tj||||dd| _d S |dkr.t | _d S |dk rCtj||dt	d| dd| _d S d S )Nr   )in_channelsout_channelsr4   r   r4   r   paddingr   )
r5   r6   r   r   rE   rF   ConvTranspose2dresizer   r   )rK   r/   r@   r   r:   rO   r'   r(   r6   @  s   
"zDPTReassembleLayer.__init__c                 C   r   r~   )rF   r  )rK   r   r'   r'   r(   r}   O  r   zDPTReassembleLayer.forward)r   r    r!   r   r   r6   r}   r   r'   r'   rO   r(   r   ?  s    r   c                       s*   e Zd Zdef fddZdd Z  ZS )DPTFeatureFusionStager/   c                    s<   t    t | _tt|jD ]
}| jt	| qd S r~   )
r5   r6   r   r   r   r   rA   r   r   DPTFeatureFusionLayer)rK   r/   r   rO   r'   r(   r6   V  s
   

zDPTFeatureFusionStage.__init__c                 C   sV   |d d d }g }d }t || jD ]\}}|d u r||}n|||}|| q|S )Nr1   )r   r   r   )rK   r,   fused_hidden_statesfused_hidden_stater   r   r'   r'   r(   r}   \  s   

zDPTFeatureFusionStage.forward)r   r    r!   r   r6   r}   r   r'   r'   rO   r(   r  U  s    r  c                       r   )	DPTPreActResidualLayerz
    ResidualConvUnit, pre-activate residual unit.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    r/   c                    s   t    |j| _|jd ur|jn| j }t | _tj|j	|j	ddd|d| _
t | _tj|j	|j	ddd|d| _| jrNt|j	| _t|j	| _d S d S )Nr   r   )r4   r   r
  r   )r5   r6   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr   ReLUactivation1rE   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm1batch_norm2)rK   r/   r  rO   r'   r(   r6   v  s8   



		zDPTPreActResidualLayer.__init__r   ri   c                 C   sT   |}|  |}| |}| jr| |}| |}| |}| jr&| |}|| S r~   )r  r  r  r  r  r  r  rK   r   residualr'   r'   r(   r}     s   





zDPTPreActResidualLayer.forwardr   r'   r'   rO   r(   r  m  s    "r  c                       sN   e Zd ZdZddedef fddZddejd	ejdB d
ejfddZ	  Z
S )r  a3  Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corner setting for bilinear upsample.
    Tr/   align_cornersc                    s@   t    || _tj|j|jddd| _t|| _t|| _	d S )Nr   T)r4   r   )
r5   r6   r   r   rE   r  rF   r  residual_layer1residual_layer2)rK   r/   r   rO   r'   r(   r6     s
   

zDPTFeatureFusionLayer.__init__Nr   r  ri   c                 C   st   |d ur#|j |j krtjj||j d |j d fddd}|| | }| |}tjj|dd| jd}| |}|S )NrS   r   rT   FrU   rV   r   scale_factorrV   r   )rr   r   r[   r\   r!  r"  r   rF   r  r'   r'   r(   r}     s   


zDPTFeatureFusionLayer.forwardTr~   )r   r    r!   r"   r   r   r6   r#   r   r}   r   r'   r'   rO   r(   r    s    	*
r  c                       sV   e Zd ZU eed< dZdZdZdZdZ	dZ
dZdZdeiZe  fddZ  ZS )	DPTPreTrainedModelr/   dptrg   )imageTr-   c                    s:   t  | t|ttfrt|j t|j dS dS )zInitialize the weightsN)	r5   _init_weightsr;   r   r.   initzeros_rI   rJ   )rK   r   rO   r'   r(   r*    s
   z DPTPreTrainedModel._init_weights)r   r    r!   r   r%   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   _can_record_outputsr#   no_gradr*  r   r'   r'   rO   r(   r'    s   
 r'  c                
       sb   e Zd Zddedef fddZdd Zeedd	e		
dde
jded
B defddZ  ZS )DPTModelTr/   add_pooling_layerc                    sj   t  | || _|jrt|| _nt|| _t|| _t	j
|j|jd| _|r,t|nd| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r5   r6   r/   r   r.   r{   r   r   encoderr   r   r:   r   	layernormDPTViTPoolerpooler	post_init)rK   r/   r8  rO   r'   r(   r6     s   

zDPTModel.__init__c                 C   s   | j jr| jS | jjS r~   )r/   r   r{   r   rK   r'   r'   r(   get_input_embeddings  s   zDPTModel.get_input_embeddingsF)tie_last_hidden_statesNrg   rz   ri   c           	      K   sj   |d u r| j j}| |}|j}| j||d}|j}| |}| jd ur)| |nd }t|||j	|j
dS )N)rz   )r*   r+   r   r,   )r/   rz   r{   r   r9  r*   r:  r<  r)   r   r,   )	rK   rg   rz   r   embedding_outputembedding_last_hidden_statesencoder_outputssequence_outputpooled_outputr'   r'   r(   r}     s    	

zDPTModel.forwardr&  r~   )r   r    r!   r   r   r6   r?  r   r   r   r#   r$   r)   r}   r   r'   r'   rO   r(   r7    s    r7  c                       r   )r;  r/   c                    s,   t    t|j|j| _t|j | _	d S r~   )
r5   r6   r   r   r:   pooler_output_sizer   r   
pooler_act
activationr   rO   r'   r(   r6   %  s   
zDPTViTPooler.__init__r,   ri   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   rH  )rK   r,   first_token_tensorrE  r'   r'   r(   r}   *  s   

zDPTViTPooler.forwardr   r'   r'   rO   r(   r;  $  r   r;  c                
       sZ   e Zd ZdZdef fddZ		ddeej de	dB de	dB d	eej fd
dZ
  ZS )DPTNecka;  
    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For DPT, it includes 2 stages:

    * DPTReassembleStage
    * DPTFeatureFusionStage.

    Args:
        config (dict): config dict.
    r/   c              
      sz   t    || _|jd ur|jjdkrd | _nt|| _t | _	|j
D ]}| j	tj||jdddd q$t|| _d S )Nswinv2r   r   Fr4   r
  r   )r5   r6   r/   r  
model_typereassemble_stager   r   r   convsr   r   rE   r  r  fusion_stage)rK   r/   channelrO   r'   r(   r6   ?  s   



 zDPTNeck.__init__Nr,   r   r   ri   c                    sn   t |ttfstdt|t jjkrtd jdur% |||} fddt	|D } 
|}|S )z
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        z2hidden_states should be a tuple or list of tensorszOThe number of hidden states should be equal to the number of neck hidden sizes.Nc                    s   g | ]\}} j | |qS r'   )rO  )rm   r   featurer>  r'   r(   rq   e  s    z#DPTNeck.forward.<locals>.<listcomp>)r;   r&   r  	TypeErrorrA   r/   r   rB   rN  r   rP  )rK   r,   r   r   ry   r   r'   r>  r(   r}   P  s   

zDPTNeck.forwardr  )r   r    r!   r"   r   r6   r  r#   r   r   r}   r   r'   r'   rO   r(   rJ  3  s    rJ  c                       s@   e Zd ZdZdef fddZdeej dejfddZ	  Z
S )	DPTDepthEstimationHeada	  
    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
    supplementary material).
    r/   c                    s   t    || _d | _|jrtjdddddd| _|j}ttj||d ddddtj	ddd	d
tj|d dddddt
 tjddddddt
 | _d S )N   )r   r   )r   r   r	  rS   r   r   rT   Tr$      r   )r5   r6   r/   rF   add_projectionr   rE   r  r   Upsampler  headrK   r/   ry   rO   r'   r(   r6   t  s   

zDPTDepthEstimationHead.__init__r,   ri   c                 C   sF   || j j }| jd ur| |}t |}| |}|jdd}|S )Nr   rW   )r/   head_in_indexrF   r   r  rY  squeeze)rK   r,   predicted_depthr'   r'   r(   r}     s   


zDPTDepthEstimationHead.forward)r   r    r!   r"   r   r6   r  r#   r   r}   r   r'   r'   rO   r(   rT  m  s    "rT  zu
    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    c                       sP   e Zd Z fddZee		d
dejdejdB de	dB de
fdd	Z  ZS )DPTForDepthEstimationc                    s`   t  | d | _|jdu r|jd urt|| _nt|dd| _t|| _	t
|| _|   d S NF)r8  )r5   r6   r?   r   r  r	   r7  r(  rJ  neckrT  rY  r=  r   rO   r'   r(   r6     s   

zDPTForDepthEstimation.__init__Nrg   labelsrz   ri   c                    s<  |du r j j}d}|durtdd|d<  jdur) jj|fi |}|j}n7 j|fi |}|j} j jsI fddt	|dd D }n|j
}| fdd	t	|dd D  |}d
\}	}
 j jdur j jdu r|j\}}}} j jj}|| }	|| }
 ||	|
} |}t|||r|jnd|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
        >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NzTraining is not implemented yetTrz   c                        g | ]\}}| j jv r|qS r'   r/   backbone_out_indicesrm   idxrR  r>  r'   r(   rq         z1DPTForDepthEstimation.forward.<locals>.<listcomp>r   c                 3   ,    | ]\}}| j jd d v r|V  qdS rS   Nrc  re  r>  r'   r(   	<genexpr>  s    z0DPTForDepthEstimation.forward.<locals>.<genexpr>r  F)lossr]  r,   r-   )r/   rz   NotImplementedErrorr?   forward_with_filtered_kwargsrl   r(  r,   r   r   r   extendr  rr   r8   r`  rY  r   r-   )rK   rg   ra  rz   r   rk  outputsr,   backbone_hidden_statesr   r   r   rw   rx   r8   r]  r'   r>  r(   r}     sD   0



zDPTForDepthEstimation.forwardr  )r   r    r!   r6   r   r   r#   r$   
LongTensorr   r   r}   r   r'   r'   rO   r(   r^    s    r^  c                       s<   e Zd Zdef fddZdeej dejfddZ  Z	S )DPTSemanticSegmentationHeadr/   c                    sl   t    || _|j}ttj||ddddt|t t	|j
tj||jddtjdddd	| _d S )
Nr   r   FrL  r3   rS   rT   Tr$  )r5   r6   r/   r  r   r   rE   r  r  r   semantic_classifier_dropout
num_labelsrX  rY  rZ  rO   r'   r(   r6     s   


z$DPTSemanticSegmentationHead.__init__r,   ri   c                 C   s   || j j }| |}|S r~   )r/   r[  rY  rK   r,   logitsr'   r'   r(   r}     s   
z#DPTSemanticSegmentationHead.forward)
r   r    r!   r   r6   r  r#   r   r}   r   r'   r'   rO   r(   rr    s    "rr  c                       r   )DPTAuxiliaryHeadr/   c                    sX   t    |j}ttj||ddddt|t tddtj||j	dd| _
d S )Nr   r   FrL  g?r3   )r5   r6   r  r   r   rE   r  r  r   rt  rY  rZ  rO   r'   r(   r6   #  s   


zDPTAuxiliaryHead.__init__r,   ri   c                 C   s   |  |}|S r~   )rY  ru  r'   r'   r(   r}   /  s   
zDPTAuxiliaryHead.forwardr   r'   r'   rO   r(   rw  "  s    rw  c                       s\   e Zd Zdef fddZee			ddejdB dej	dB de
dB defd	d
Z  ZS )DPTForSemanticSegmentationr/   c                    sN   t  | t|dd| _t|| _t|| _|jrt	|nd | _
|   d S r_  )r5   r6   r7  r(  rJ  r`  rr  rY  use_auxiliary_headrw  auxiliary_headr=  r   rO   r'   r(   r6   6  s   

z#DPTForSemanticSegmentation.__init__Nrg   ra  rz   ri   c                    sr  |du r j j}|dur j jdkrtdd|d<  j|fi |}|j} j js: fddt|dd D }n|j}|	 fdd	t|dd D  |} j
|d
} |}d}	 jdurj |d }	d}
|durtjj||jdd ddd}|	durtjj|	|jdd ddd}t j jd}|||}|||}| j j|  }
t|
||r|jnd|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
        >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTrz   c                    rb  r'   rc  re  r>  r'   r(   rq   s  rg  z6DPTForSemanticSegmentation.forward.<locals>.<listcomp>c                 3   rh  ri  rc  re  r>  r'   r(   rj  x  s    "z5DPTForSemanticSegmentation.forward.<locals>.<genexpr>)r,   r1   r2   rT   Fr#  )ignore_index)rk  rv  r,   r-   )r/   rz   rt  rB   r(  r,   r   r   r   rn  r`  rY  rz  r   r[   r\   rr   r   semantic_loss_ignore_indexauxiliary_loss_weightr   r-   )rK   rg   ra  rz   r   ro  r,   rp  rv  auxiliary_logitsrk  upsampled_logitsupsampled_auxiliary_logitsloss_fct	main_lossauxiliary_lossr'   r>  r(   r}   E  sP   !




z"DPTForSemanticSegmentation.forward)NNN)r   r    r!   r   r6   r   r   r#   r$   rq  r   r   r}   r   r'   r'   rO   r(   rx  4  s     rx  )r^  rx  r7  r'  )Nr   )Lr"   collections.abcr<   r   dataclassesr   r#   r   torch.nnr    r   r+  activationsr   backbone_utilsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_dptr   
get_loggerr   loggerr   r)   Moduler.   r   r   r   floatr   r   r   r   r   r   r   r   r   r   r   r  r  r  r'  r7  r;  rJ  rT  r^  rr  rw  rx  __all__r'   r'   r'   r(   <module>   s   
`7&
3h=%<:(rh