o
    eig                  	   @   s  d Z ddlZddlZddlmZ ddlmZ ddlmZmZ ddl	m
Z
 ddlmZmZ d	d
lmZ eeZd8dejdededejfddZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd  d ejZG d!d" d"ejZG d#d$ d$ejZ eG d%d& d&e
Z!eG d'd( d(e!Z"G d)d* d*ejZ#G d+d, d,ejZ$G d-d. d.ejZ%G d/d0 d0ejZ&G d1d2 d2ejZ'ed3d4G d5d6 d6e!Z(g d7Z)dS )9zPyTorch GLPN model.    N)nn   )ACT2FN)BaseModelOutputDepthEstimatorOutput)PreTrainedModel)auto_docstringlogging   )
GLPNConfig        Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r   r
   r   )r
   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutput r   d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/glpn/modeling_glpn.py	drop_path    s   r   c                       sT   e Zd ZdZddedB ddf fddZdejdejfdd	Zde	fd
dZ
  ZS )GLPNDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                       t    || _d S N)super__init__r   )selfr   	__class__r   r   r#   3      

zGLPNDropPath.__init__hidden_statesc                 C   s   t || j| jS r!   )r   r   r   )r$   r(   r   r   r   forward7   s   zGLPNDropPath.forwardc                 C   s   d| j  S )Nzp=)r   )r$   r   r   r   
extra_repr:   s   zGLPNDropPath.extra_reprr!   )__name__
__module____qualname____doc__floatr#   r   Tensorr)   strr*   __classcell__r   r   r%   r   r   0   s
    r   c                       s(   e Zd ZdZ fddZdd Z  ZS )GLPNOverlapPatchEmbeddingsz+Construct the overlapping patch embeddings.c                    s4   t    tj|||||d d| _t|| _d S )N   kernel_sizestridepadding)r"   r#   r   Conv2dproj	LayerNorm
layer_norm)r$   
patch_sizer7   num_channelshidden_sizer%   r   r   r#   B   s   
z#GLPNOverlapPatchEmbeddings.__init__c                 C   s>   |  |}|j\}}}}|ddd}| |}|||fS )Nr4   r
   )r:   r   flatten	transposer<   )r$   pixel_values
embeddings_heightwidthr   r   r   r)   N   s
   


z"GLPNOverlapPatchEmbeddings.forwardr+   r,   r-   r.   r#   r)   r2   r   r   r%   r   r3   ?   s    r3   c                       s,   e Zd ZdZ fddZ	dddZ  ZS )GLPNEfficientSelfAttentionzSegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
    paper](https://huggingface.co/papers/2102.12122).c                    s   t    || _|| _| j| j dkr td| j d| j dt| j| j | _| j| j | _t	| j| j| _
t	| j| j| _t	| j| j| _t|j| _|| _|dkrktj||||d| _t|| _d S d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r
   )r6   r7   )r"   r#   r?   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   LinearquerykeyvalueDropoutattention_probs_dropout_probdropoutsr_ratior9   srr;   r<   r$   configr?   rJ   sequence_reduction_ratior%   r   r   r#   ]   s,   

z#GLPNEfficientSelfAttention.__init__Fc                 C   sh  |j \}}}| ||d| j| jdd}| jdkrE|j \}}	}
|ddd||
||}| 	|}|||
dddd}| 
|}| ||d| j| jdd}| ||d| j| jdd}t||dd}|t| j }tjj|dd}| |}t||}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr
   r4   r   dimr   )r   rP   viewrJ   rM   rA   rV   permutereshaperW   r<   rQ   rR   r   matmulmathsqrtr   
functionalsoftmaxrU   
contiguoussizerN   )r$   r(   rE   rF   output_attentions
batch_size
seq_lengthrD   query_layerseq_lenr>   	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr   r   r   r)   x   s>   




z"GLPNEfficientSelfAttention.forwardFrG   r   r   r%   r   rH   Y   s
     rH   c                       s$   e Zd Z fddZdd Z  ZS )GLPNSelfOutputc                    s*   t    t||| _t|j| _d S r!   )r"   r#   r   rO   denserS   hidden_dropout_probrU   )r$   rY   r?   r%   r   r   r#      s   
zGLPNSelfOutput.__init__c                 C   s   |  |}| |}|S r!   )rw   rU   )r$   r(   input_tensorr   r   r   r)      s   

zGLPNSelfOutput.forwardr+   r,   r-   r#   r)   r2   r   r   r%   r   rv      s    rv   c                       &   e Zd Z fddZdddZ  ZS )GLPNAttentionc                    s.   t    t||||d| _t||d| _d S )N)rY   r?   rJ   rZ   )r?   )r"   r#   rH   r$   rv   r   rX   r%   r   r   r#      s   
zGLPNAttention.__init__Fc                 C   s6   |  ||||}| |d |}|f|dd   }|S )Nr   r
   )r$   r   )r$   r(   rE   rF   ri   self_outputsattention_outputrt   r   r   r   r)      s   zGLPNAttention.forwardru   rz   r   r   r%   r   r|      s    
r|   c                       &   e Zd Zd fdd	Zdd Z  ZS )
GLPNDWConv   c              	      s(   t    tj||dddd|d| _d S )Nr   r
   T)biasgroups)r"   r#   r   r9   dwconv)r$   r^   r%   r   r   r#      s   
zGLPNDWConv.__init__c                 C   sD   |j \}}}|dd||||}| |}|ddd}|S )Nr
   r4   )r   rA   r_   r   r@   )r$   r(   rE   rF   rj   rm   r>   r   r   r   r)      s
   
zGLPNDWConv.forward)r   rz   r   r   r%   r   r      s    r   c                       r   )
GLPNMixFFNNc                    sl   t    |p|}t||| _t|| _t|jt	r"t
|j | _n|j| _t||| _t|j| _d S r!   )r"   r#   r   rO   dense1r   r   
isinstance
hidden_actr1   r   intermediate_act_fndense2rS   rx   rU   )r$   rY   in_featureshidden_featuresout_featuresr%   r   r   r#      s   

zGLPNMixFFN.__init__c                 C   sD   |  |}| |||}| |}| |}| |}| |}|S r!   )r   r   r   rU   r   )r$   r(   rE   rF   r   r   r   r)      s   




zGLPNMixFFN.forward)NNrz   r   r   r%   r   r      s    r   c                       s*   e Zd ZdZ fddZdddZ  ZS )	GLPNLayerzCThis corresponds to the Block class in the original implementation.c                    sn   t    t|| _t||||d| _|dkrt|nt | _	t|| _
t|| }t|||d| _d S )N)r?   rJ   rZ   r   )r   r   )r"   r#   r   r;   layer_norm_1r|   	attentionr   Identityr   layer_norm_2rL   r   mlp)r$   rY   r?   rJ   r   rZ   	mlp_ratiomlp_hidden_sizer%   r   r   r#      s   
zGLPNLayer.__init__Fc           
      C   sr   | j | ||||d}|d }|dd  }| |}|| }| | |||}| |}|| }	|	f| }|S )N)ri   r   r
   )r   r   r   r   r   )
r$   r(   rE   rF   ri   self_attention_outputsr~   rt   
mlp_outputlayer_outputr   r   r   r)     s   


zGLPNLayer.forwardru   rG   r   r   r%   r   r      s    r   c                       s,   e Zd Z fddZ			dddZ  ZS )GLPNEncoderc           	         sX  t     | _dd tjd jt jddD }g }t j	D ]"}|
t j|  j| |dkr5 jn j|d   j| d q!t|| _g }d}t j	D ]@}g }|dkrd| j|d  7 }t j| D ]}|
t  j|  j| |||   j|  j| d qk|
t| qSt|| _t fd	dt j	D | _d S )
Nc                 S   s   g | ]}|  qS r   )item).0xr   r   r   
<listcomp>,  s    z(GLPNEncoder.__init__.<locals>.<listcomp>r   cpu)r   r
   )r=   r7   r>   r?   )r?   rJ   r   rZ   r   c                    s   g | ]
}t  j| qS r   )r   r;   hidden_sizes)r   irY   r   r   r   T  s    )r"   r#   rY   r   linspacedrop_path_ratesumdepthsrangenum_encoder_blocksappendr3   patch_sizesstridesr>   r   r   
ModuleListpatch_embeddingsr   rJ   	sr_ratios
mlp_ratiosblockr<   )	r$   rY   dprrC   r   blockscurlayersjr%   r   r   r#   '  sH   
$


zGLPNEncoder.__init__FTc                 C   s   |rdnd }|r
dnd }|j d }|}tt| j| j| jD ]H\}	}
|
\}}}||\}}}t|D ]\}}|||||}|d }|rJ||d f }q2||}||||ddddd }|rf||f }q|sut	dd |||fD S t
|||d	S )
Nr   r   r
   r[   r   r4   c                 s   s    | ]	}|d ur|V  qd S r!   r   )r   vr   r   r   	<genexpr>v  s    z&GLPNEncoder.forward.<locals>.<genexpr>last_hidden_stater(   
attentions)r   	enumeratezipr   r   r<   ra   r`   rg   tupler   )r$   rB   ri   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsrj   r(   idxr   embedding_layerblock_layer
norm_layerrE   rF   r   blklayer_outputsr   r   r   r)   W  s2   

 
zGLPNEncoder.forward)FFTrz   r   r   r%   r   r   &  s    3r   c                   @   s&   e Zd ZU eed< dZdZdZg ZdS )GLPNPreTrainedModelrY   glpnrB   )imageN)	r+   r,   r-   r   __annotations__base_model_prefixmain_input_nameinput_modalities_no_split_modulesr   r   r   r   r   ~  s   
 r   c                       sX   e Zd Z fddZe			ddejdedB dedB dedB dee	B f
d	d
Z
  ZS )	GLPNModelc                    s(   t  | || _t|| _|   d S r!   )r"   r#   rY   r   encoder	post_initr$   rY   r%   r   r   r#     s   
zGLPNModel.__init__NrB   ri   r   r   r   c                 K   s~   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||||d}|d }|s6|f|dd   S t||j|jdS )Nri   r   r   r   r
   r   )rY   ri   r   use_return_dictr   r   r(   r   )r$   rB   ri   r   r   kwargsencoder_outputssequence_outputr   r   r   r)     s$   
zGLPNModel.forward)NNN)r+   r,   r-   r#   r   r   FloatTensorboolr   r   r)   r2   r   r   r%   r   r     s"    
r   c                       *   e Zd ZdZd fdd	Zdd Z  ZS )GLPNSelectiveFeatureFusionz
    Selective Feature Fusion module, as explained in the [paper](https://huggingface.co/papers/2201.07436) (section 3.4). This
    module adaptively selects and integrates local and global features by attaining an attention map for each feature.
    @   c              	      s   t    ttjt|d |ddddt|t | _ttj|t|d ddddtt|d t | _	tjt|d ddddd| _
t | _d S )Nr4   r   r
   )in_channelsout_channelsr6   r7   r8   )r"   r#   r   
Sequentialr9   rL   BatchNorm2dReLUconvolutional_layer1convolutional_layer2convolutional_layer3Sigmoidsigmoid)r$   
in_channelr%   r   r   r#     s   
z#GLPNSelectiveFeatureFusion.__init__c                 C   s   t j||fdd}| |}| |}| |}| |}||d d dd d d d f d ||d d dd d d d f d  }|S )Nr
   r]   r   )r   catr   r   r   r   	unsqueeze)r$   local_featuresglobal_featuresfeaturesattnhybrid_featuresr   r   r   r)     s   



(z"GLPNSelectiveFeatureFusion.forward)r   rG   r   r   r%   r   r     s    r   c                       r{   )GLPNDecoderStagec                    sP   t    ||k}|stj||ddnt | _t|| _tjdddd| _	d S )Nr
   )r6   r4   bilinearFscale_factormodealign_corners)
r"   r#   r   r9   r   convolutionr   fusionUpsampleupsample)r$   r   r   should_skipr%   r   r   r#     s
   

zGLPNDecoderStage.__init__Nc                 C   s,   |  |}|d ur| ||}| |}|S r!   )r   r   r   )r$   hidden_stateresidualr   r   r   r)     s
   

zGLPNDecoderStage.forwardr!   rz   r   r   r%   r   r     s    r   c                       s:   e Zd Z fddZdeej deej fddZ  ZS )GLPNDecoderc                    s\   t    |jd d d }|j t fdd|D | _d | jd _tjdddd| _	d S )	Nr[   c                    s   g | ]}t | qS r   )r   )r   r?   r   r   r   r     s    z(GLPNDecoder.__init__.<locals>.<listcomp>r   r4   r   Fr   )
r"   r#   r   decoder_hidden_sizer   r   stagesr   r   final_upsample)r$   rY   reserved_hidden_sizesr%   r   r   r#     s   
zGLPNDecoder.__init__r(   r   c                 C   sN   g }d }t |d d d | jD ]\}}|||}|| q| ||d< |S )Nr[   )r   r   r   r   )r$   r(   stage_hidden_statesstage_hidden_stater   stager   r   r   r)     s   
zGLPNDecoder.forward	r+   r,   r-   r#   listr   r0   r)   r2   r   r   r%   r   r     s    &r   c                       r   )	SiLogLossz
    Implements the Scale-invariant log scale loss [Eigen et al., 2014](https://huggingface.co/papers/1406.2283).

    $$L=\frac{1}{n} \sum_{i} d_{i}^{2}-\frac{1}{2 n^{2}}\left(\sum_{i} d_{i}^{2}\right)$$ where $d_{i}=\log y_{i}-\log
    y_{i}^{*}$.

          ?c                    r    r!   )r"   r#   lambd)r$   r	  r%   r   r   r#     r'   zSiLogLoss.__init__c                 C   sX   |dk  }t|| t||  }tt|d | jt| d  }|S )Nr   r4   )detachr   logrd   powmeanr	  )r$   predtarget
valid_maskdiff_loglossr   r   r   r)     s   ,zSiLogLoss.forward)r  rG   r   r   r%   r   r    s    r  c                       s6   e Zd Z fddZdeej dejfddZ  ZS )GLPNDepthEstimationHeadc                    sR   t    || _|j}ttj||ddddtjddtj|ddddd| _d S )Nr   r
   r5   F)inplace)	r"   r#   rY   r   r   r   r9   r   head)r$   rY   channelsr%   r   r   r#   '  s   


z GLPNDepthEstimationHead.__init__r(   r   c                 C   s8   || j j }| |}t|| j j }|jdd}|S )Nr
   r]   )rY   head_in_indexr  r   r   	max_depthsqueeze)r$   r(   predicted_depthr   r   r   r)   3  s
   
zGLPNDepthEstimationHead.forwardr  r   r   r%   r   r  &  s    "r  zg
    GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
    )custom_introc                       sj   e Zd Z fddZe				ddejdejdB dedB dedB dedB d	eej	 e
B fd
dZ  ZS )GLPNForDepthEstimationc                    s6   t  | t|| _t|| _t|| _|   d S r!   )	r"   r#   r   r   r   decoderr  r  r   r   r%   r   r   r#   E  s
   


zGLPNForDepthEstimation.__init__NrB   labelsri   r   r   r   c                 K   s   |dur|n| j j}|dur|n| j j}| j||d|d}|r"|jn|d }| |}	| |	}
d}|dur>t }||
|}|s`|rL|
f|dd  }n	|
f|dd  }|dur^|f| S |S t||
|rh|jnd|j	dS )a  
        labels (`torch.FloatTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, GLPNForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti")
        >>> model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NTr   r
   r4   )r  r  r(   r   )
rY   r   r   r   r(   r  r  r  r   r   )r$   rB   r  ri   r   r   r   rt   r(   outr  r  loss_fctr   r   r   r   r)   O  s6   1


zGLPNForDepthEstimation.forward)NNNN)r+   r,   r-   r#   r   r   r   r   r   r0   r   r)   r2   r   r   r%   r   r  ?  s(    
r  )r  r   r   r   )r   F)*r.   rc   r   r   activationsr   modeling_outputsr   r   modeling_utilsr   utilsr   r	   configuration_glpnr   
get_loggerr+   loggerr0   r/   r   r   Moduler   r3   rH   rv   r|   r   r   r   r   r   r   r   r   r   r  r  r  __all__r   r   r   r   <module>   sD   
 Z+X.,a