o
    wiq                  	   @   s   d Z ddlZddlmZmZ ddlZddlZddlmZ ddlm	Z	 ddl
mZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ eeZd:dejdededejfddZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#G dd  d ejZ$G d!d" d"ejZ%G d#d$ d$ejZ&G d%d& d&ejZ'eG d'd( d(eZ(eG d)d* d*e(Z)G d+d, d,ejZ*G d-d. d.ejZ+G d/d0 d0ejZ,G d1d2 d2ejZ-G d3d4 d4ejZ.ed5d6G d7d8 d8e(Z/g d9Z0dS );zPyTorch GLPN model.    N)OptionalUnion)nn   )ACT2FN)BaseModelOutputDepthEstimatorOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )
GLPNConfig        Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutput r    c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/glpn/modeling_glpn.py	drop_path$   s   
r"   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )GLPNDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                       t    || _d S N)super__init__r   )selfr   	__class__r    r!   r'   <      

zGLPNDropPath.__init__hidden_statesc                 C   s   t || j| jS r%   )r"   r   r   )r(   r,   r    r    r!   forward@   s   zGLPNDropPath.forwardc                 C   s   d| j  S )Nzp=)r   )r(   r    r    r!   
extra_reprC   s   zGLPNDropPath.extra_reprr%   )__name__
__module____qualname____doc__r   floatr'   r   Tensorr-   strr.   __classcell__r    r    r)   r!   r#   9   s
    r#   c                       s(   e Zd ZdZ fddZdd Z  ZS )GLPNOverlapPatchEmbeddingsz+Construct the overlapping patch embeddings.c                    s4   t    tj|||||d d| _t|| _d S )N   kernel_sizestridepadding)r&   r'   r   Conv2dproj	LayerNorm
layer_norm)r(   
patch_sizer;   num_channelshidden_sizer)   r    r!   r'   K   s   
z#GLPNOverlapPatchEmbeddings.__init__c                 C   s>   |  |}|j\}}}}|ddd}| |}|||fS )Nr8   r   )r>   r   flatten	transposer@   )r(   pixel_values
embeddings_heightwidthr    r    r!   r-   W   s
   


z"GLPNOverlapPatchEmbeddings.forwardr/   r0   r1   r2   r'   r-   r6   r    r    r)   r!   r7   H   s    r7   c                       s4   e Zd ZdZ fddZdd Z	d	ddZ  ZS )
GLPNEfficientSelfAttentionzSegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
    paper](https://huggingface.co/papers/2102.12122).c                    s   t    || _|| _| j| j dkr td| j d| j dt| j| j | _| j| j | _t	| j| j| _
t	| j| j| _t	| j| j| _t|j| _|| _|dkrktj||||d| _t|| _d S d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   )r:   r;   )r&   r'   rC   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   LinearquerykeyvalueDropoutattention_probs_dropout_probdropoutsr_ratior=   srr?   r@   r(   configrC   rN   sequence_reduction_ratior)   r    r!   r'   f   s,   

z#GLPNEfficientSelfAttention.__init__c                 C   s6   |  d d | j| jf }||}|ddddS )Nr   r8   r   r   )sizerN   rQ   viewpermute)r(   r,   	new_shaper    r    r!   transpose_for_scores   s   
z/GLPNEfficientSelfAttention.transpose_for_scoresFc                 C   s&  |  | |}| jdkr6|j\}}}|ddd||||}| |}|||dddd}| |}|  | |}	|  | 	|}
t
||	dd}|t| j }tjj|dd}| |}t
||
}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr   r   r8   r_   dimr   )rd   rT   rZ   r   rb   reshaper[   r@   rU   rV   r   matmulrE   mathsqrtrQ   r   
functionalsoftmaxrY   
contiguousr`   rR   ra   )r(   r,   rI   rJ   output_attentionsquery_layer
batch_sizeseq_lenrB   	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr    r    r!   r-      s*   




z"GLPNEfficientSelfAttention.forwardF)r/   r0   r1   r2   r'   rd   r-   r6   r    r    r)   r!   rL   b   s    
rL   c                       s$   e Zd Z fddZdd Z  ZS )GLPNSelfOutputc                    s*   t    t||| _t|j| _d S r%   )r&   r'   r   rS   denserW   hidden_dropout_probrY   )r(   r]   rC   r)   r    r!   r'      s   
zGLPNSelfOutput.__init__c                 C   s   |  |}| |}|S r%   )r|   rY   )r(   r,   input_tensorr    r    r!   r-      s   

zGLPNSelfOutput.forwardr/   r0   r1   r'   r-   r6   r    r    r)   r!   r{      s    r{   c                       s.   e Zd Z fddZdd ZdddZ  ZS )	GLPNAttentionc                    s6   t    t||||d| _t||d| _t | _d S )N)r]   rC   rN   r^   )rC   )r&   r'   rL   r(   r{   r   setpruned_headsr\   r)   r    r!   r'      s   
zGLPNAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rf   )lenr
   r(   rN   rQ   r   r   rT   rU   rV   r   r|   rR   union)r(   headsindexr    r    r!   prune_heads   s   zGLPNAttention.prune_headsFc                 C   s6   |  ||||}| |d |}|f|dd   }|S )Nr   r   )r(   r   )r(   r,   rI   rJ   ro   self_outputsattention_outputry   r    r    r!   r-      s   zGLPNAttention.forwardrz   )r/   r0   r1   r'   r   r-   r6   r    r    r)   r!   r      s    r   c                       &   e Zd Zd fdd	Zdd Z  ZS )
GLPNDWConv   c              	      s(   t    tj||dddd|d| _d S )Nr   r   T)biasgroups)r&   r'   r   r=   dwconv)r(   rg   r)   r    r!   r'      s   
zGLPNDWConv.__init__c                 C   sD   |j \}}}|dd||||}| |}|ddd}|S )Nr   r8   )r   rE   ra   r   rD   )r(   r,   rI   rJ   rq   rr   rB   r    r    r!   r-      s
   
zGLPNDWConv.forward)r   r   r    r    r)   r!   r      s    r   c                       r   )
GLPNMixFFNNc                    sl   t    |p|}t||| _t|| _t|jt	r"t
|j | _n|j| _t||| _t|j| _d S r%   )r&   r'   r   rS   dense1r   r   
isinstance
hidden_actr5   r   intermediate_act_fndense2rW   r}   rY   )r(   r]   in_featureshidden_featuresout_featuresr)   r    r!   r'      s   

zGLPNMixFFN.__init__c                 C   sD   |  |}| |||}| |}| |}| |}| |}|S r%   )r   r   r   rY   r   )r(   r,   rI   rJ   r    r    r!   r-     s   




zGLPNMixFFN.forward)NNr   r    r    r)   r!   r      s    r   c                       s*   e Zd ZdZ fddZdddZ  ZS )	GLPNLayerzCThis corresponds to the Block class in the original implementation.c                    sn   t    t|| _t||||d| _|dkrt|nt | _	t|| _
t|| }t|||d| _d S )N)rC   rN   r^   r   )r   r   )r&   r'   r   r?   layer_norm_1r   	attentionr#   Identityr"   layer_norm_2rP   r   mlp)r(   r]   rC   rN   r"   r^   	mlp_ratiomlp_hidden_sizer)   r    r!   r'     s   
zGLPNLayer.__init__Fc           
      C   sr   | j | ||||d}|d }|dd  }| |}|| }| | |||}| |}|| }	|	f| }|S )N)ro   r   r   )r   r   r"   r   r   )
r(   r,   rI   rJ   ro   self_attention_outputsr   ry   
mlp_outputlayer_outputr    r    r!   r-      s   


zGLPNLayer.forwardrz   rK   r    r    r)   r!   r     s    r   c                       s,   e Zd Z fddZ			dddZ  ZS )GLPNEncoderc           	         sX  t     | _dd tjd jt jddD }g }t j	D ]"}|
t j|  j| |dkr5 jn j|d   j| d q!t|| _g }d}t j	D ]@}g }|dkrd| j|d  7 }t j| D ]}|
t  j|  j| |||   j|  j| d qk|
t| qSt|| _t fd	dt j	D | _d S )
Nc                 S   s   g | ]}|  qS r    )item).0xr    r    r!   
<listcomp>@  s    z(GLPNEncoder.__init__.<locals>.<listcomp>r   cpu)r   r   )rA   r;   rB   rC   )rC   rN   r"   r^   r   c                    s   g | ]
}t  j| qS r    )r   r?   hidden_sizes)r   ir]   r    r!   r   h  s    )r&   r'   r]   r   linspacedrop_path_ratesumdepthsrangenum_encoder_blocksappendr7   patch_sizesstridesrB   r   r   
ModuleListpatch_embeddingsr   rN   	sr_ratios
mlp_ratiosblockr@   )	r(   r]   dprrG   r   blockscurlayersjr)   r   r!   r'   ;  sH   
$


zGLPNEncoder.__init__FTc                 C   s   |rdnd }|r
dnd }|j d }|}tt| j| j| jD ]H\}	}
|
\}}}||\}}}t|D ]\}}|||||}|d }|rJ||d f }q2||}||||ddddd }|rf||f }q|sut	dd |||fD S t
|||d	S )
Nr    r   r   r_   r   r8   c                 s   s    | ]	}|d ur|V  qd S r%   r    )r   vr    r    r!   	<genexpr>  s    z&GLPNEncoder.forward.<locals>.<genexpr>last_hidden_stater,   
attentions)r   	enumeratezipr   r   r@   rh   rb   rn   tupler   )r(   rF   ro   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsrq   r,   idxr   embedding_layerblock_layer
norm_layerrI   rJ   r   blklayer_outputsr    r    r!   r-   k  s2   

 
zGLPNEncoder.forward)FFTr   r    r    r)   r!   r   :  s    3r   c                   @   s$   e Zd ZeZdZdZg Zdd ZdS )GLPNPreTrainedModelglpnrF   c                 C   s   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjrF|jjjd| jjd |jdurD|jj|j 
  dS dS t |tjtjfr^|j	j
  |jjd dS dS )zInitialize the weightsr   )meanstdNg      ?)r   r   rS   r=   weightdatanormal_r]   initializer_ranger   zero_	Embeddingpadding_idxr?   BatchNorm2dfill_)r(   moduler    r    r!   _init_weights  s   

z!GLPNPreTrainedModel._init_weightsN)	r/   r0   r1   r   config_classbase_model_prefixmain_input_name_no_split_modulesr   r    r    r    r!   r     s    r   c                       sd   e Zd Z fddZdd Ze			ddejdee	 dee	 d	ee	 d
e
eef f
ddZ  ZS )	GLPNModelc                    s(   t  | || _t|| _|   d S r%   )r&   r'   r]   r   encoder	post_initr(   r]   r)   r    r!   r'     s   
zGLPNModel.__init__c                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   layerr   r   )r(   heads_to_pruner   r   r    r    r!   _prune_heads  s   zGLPNModel._prune_headsNrF   ro   r   r   r   c                 C   s~   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||||d}|d }|s6|f|dd   S t||j|jdS )Nro   r   r   r   r   r   )r]   ro   r   use_return_dictr   r   r,   r   )r(   rF   ro   r   r   encoder_outputssequence_outputr    r    r!   r-     s$   	zGLPNModel.forward)NNN)r/   r0   r1   r'   r   r   r   FloatTensorr   boolr   r   r   r-   r6   r    r    r)   r!   r     s$    

r   c                       *   e Zd ZdZd fdd	Zdd Z  ZS )GLPNSelectiveFeatureFusionz
    Selective Feature Fusion module, as explained in the [paper](https://huggingface.co/papers/2201.07436) (section 3.4). This
    module adaptively selects and integrates local and global features by attaining an attention map for each feature.
    @   c              	      s   t    ttjt|d |ddddt|t | _ttj|t|d ddddtt|d t | _	tjt|d ddddd| _
t | _d S )Nr8   r   r   )in_channelsout_channelsr:   r;   r<   )r&   r'   r   
Sequentialr=   rP   r   ReLUconvolutional_layer1convolutional_layer2convolutional_layer3Sigmoidsigmoid)r(   
in_channelr)   r    r!   r'     s   
z#GLPNSelectiveFeatureFusion.__init__c                 C   s   t j||fdd}| |}| |}| |}| |}||d d dd d d d f d ||d d dd d d d f d  }|S )Nr   rf   r   )r   catr   r   r   r   	unsqueeze)r(   local_featuresglobal_featuresfeaturesattnhybrid_featuresr    r    r!   r-     s   



(z"GLPNSelectiveFeatureFusion.forward)r   rK   r    r    r)   r!   r     s    r   c                       s&   e Zd Z fddZdddZ  ZS )GLPNDecoderStagec                    sP   t    ||k}|stj||ddnt | _t|| _tjdddd| _	d S )Nr   )r:   r8   bilinearFscale_factormodealign_corners)
r&   r'   r   r=   r   convolutionr   fusionUpsampleupsample)r(   r   r   should_skipr)   r    r!   r'     s
   

zGLPNDecoderStage.__init__Nc                 C   s,   |  |}|d ur| ||}| |}|S r%   )r  r  r  )r(   hidden_stateresidualr    r    r!   r-     s
   

zGLPNDecoderStage.forwardr%   r   r    r    r)   r!   r    s    r  c                       s:   e Zd Z fddZdeej deej fddZ  ZS )GLPNDecoderc                    s\   t    |jd d d }|j t fdd|D | _d | jd _tjdddd| _	d S )	Nr_   c                    s   g | ]}t | qS r    )r  )r   rC   r   r    r!   r   )  s    z(GLPNDecoder.__init__.<locals>.<listcomp>r   r8   r  Fr	  )
r&   r'   r   decoder_hidden_sizer   r   stagesr  r  final_upsample)r(   r]   reserved_hidden_sizesr)   r  r!   r'   "  s   
zGLPNDecoder.__init__r,   r   c                 C   sN   g }d }t |d d d | jD ]\}}|||}|| q| ||d< |S )Nr_   )r   r  r   r  )r(   r,   stage_hidden_statesstage_hidden_stater  stager    r    r!   r-   0  s   
zGLPNDecoder.forward	r/   r0   r1   r'   listr   r4   r-   r6   r    r    r)   r!   r  !  s    &r  c                       r   )	SiLogLossz
    Implements the Scale-invariant log scale loss [Eigen et al., 2014](https://huggingface.co/papers/1406.2283).

    $$L=\frac{1}{n} \sum_{i} d_{i}^{2}-\frac{1}{2 n^{2}}\left(\sum_{i} d_{i}^{2}\right)$$ where $d_{i}=\log y_{i}-\log
    y_{i}^{*}$.

          ?c                    r$   r%   )r&   r'   lambd)r(   r!  r)   r    r!   r'   E  r+   zSiLogLoss.__init__c                 C   sX   |dk  }t|| t||  }tt|d | jt| d  }|S )Nr   r8   )detachr   logrk   powr   r!  )r(   predtarget
valid_maskdiff_loglossr    r    r!   r-   I  s   ,zSiLogLoss.forward)r   rK   r    r    r)   r!   r  <  s    r  c                       s6   e Zd Z fddZdeej dejfddZ  ZS )GLPNDepthEstimationHeadc                    sR   t    || _|j}ttj||ddddtjddtj|ddddd| _d S )Nr   r   r9   F)inplace)	r&   r'   r]   r  r   r   r=   r   head)r(   r]   channelsr)   r    r!   r'   R  s   


z GLPNDepthEstimationHead.__init__r,   r   c                 C   s8   || j j }| |}t|| j j }|jdd}|S )Nr   rf   )r]   head_in_indexr,  r   r   	max_depthsqueeze)r(   r,   predicted_depthr    r    r!   r-   ^  s
   
zGLPNDepthEstimationHead.forwardr  r    r    r)   r!   r*  Q  s    "r*  zg
    GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
    )custom_introc                       sn   e Zd Z fddZe				ddejdeej dee dee dee d	e	e
ej ef fd
dZ  ZS )GLPNForDepthEstimationc                    s6   t  | t|| _t|| _t|| _|   d S r%   )	r&   r'   r   r   r  decoderr*  r,  r   r   r)   r    r!   r'   p  s
   


zGLPNForDepthEstimation.__init__NrF   labelsro   r   r   r   c                 C   s   |dur|n| j j}|dur|n| j j}| j||d|d}|r"|jn|d }| |}| |}	d}
|dur>t }||	|}
|s`|rL|	f|dd  }n	|	f|dd  }|
dur^|
f| S |S t|
|	|rh|jnd|j	dS )a  
        labels (`torch.FloatTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, GLPNForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti")
        >>> model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NTr   r   r8   )r)  r1  r,   r   )
r]   r   r   r   r,   r4  r,  r  r   r   )r(   rF   r5  ro   r   r   ry   r,   outr1  r)  loss_fctr   r    r    r!   r-   z  s6   .


zGLPNForDepthEstimation.forward)NNNN)r/   r0   r1   r'   r   r   r   r   r   r   r   r4   r   r-   r6   r    r    r)   r!   r3  j  s(    
r3  )r3  r   r   r   )r   F)1r2   rj   typingr   r   r   torch.utils.checkpointr   activationsr   modeling_outputsr   r   modeling_utilsr	   pytorch_utilsr
   r   utilsr   r   configuration_glpnr   
get_loggerr/   loggerr4   r3   r   r"   Moduler#   r7   rL   r{   r   r   r   r   r   r   r   r   r  r  r  r*  r3  __all__r    r    r    r!   <module>   sJ   
 R'+X5,^