o
    iq                  	   @   s  d Z ddlZddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
mZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ eeZd:dejdededejfddZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd  d ejZ#G d!d" d"ejZ$G d#d$ d$ejZ%G d%d& d&ejZ&eG d'd( d(eZ'eG d)d* d*e'Z(G d+d, d,ejZ)G d-d. d.ejZ*G d/d0 d0ejZ+G d1d2 d2ejZ,G d3d4 d4ejZ-ed5d6G d7d8 d8e'Z.g d9Z/dS );zPyTorch GLPN model.    N)OptionalUnion)nn   )ACT2FN)BaseModelOutputDepthEstimatorOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging   )
GLPNConfig        Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutput r    c/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/glpn/modeling_glpn.py	drop_path#   s   
r"   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )GLPNDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                       t    || _d S N)super__init__r   )selfr   	__class__r    r!   r'   ;      

zGLPNDropPath.__init__hidden_statesc                 C   s   t || j| jS r%   )r"   r   r   )r(   r,   r    r    r!   forward?   s   zGLPNDropPath.forwardc                 C   s   d| j  S )Nzp=)r   )r(   r    r    r!   
extra_reprB   s   zGLPNDropPath.extra_reprr%   )__name__
__module____qualname____doc__r   floatr'   r   Tensorr-   strr.   __classcell__r    r    r)   r!   r#   8   s
    r#   c                       s(   e Zd ZdZ fddZdd Z  ZS )GLPNOverlapPatchEmbeddingsz+Construct the overlapping patch embeddings.c                    s4   t    tj|||||d d| _t|| _d S )N   kernel_sizestridepadding)r&   r'   r   Conv2dproj	LayerNorm
layer_norm)r(   
patch_sizer;   num_channelshidden_sizer)   r    r!   r'   J   s   
z#GLPNOverlapPatchEmbeddings.__init__c                 C   s>   |  |}|j\}}}}|ddd}| |}|||fS )Nr8   r   )r>   r   flatten	transposer@   )r(   pixel_values
embeddings_heightwidthr    r    r!   r-   V   s
   


z"GLPNOverlapPatchEmbeddings.forwardr/   r0   r1   r2   r'   r-   r6   r    r    r)   r!   r7   G   s    r7   c                       s,   e Zd ZdZ fddZ	dddZ  ZS )GLPNEfficientSelfAttentionzSegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
    paper](https://huggingface.co/papers/2102.12122).c                    s   t    || _|| _| j| j dkr td| j d| j dt| j| j | _| j| j | _t	| j| j| _
t	| j| j| _t	| j| j| _t|j| _|| _|dkrktj||||d| _t|| _d S d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   )r:   r;   )r&   r'   rC   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   LinearquerykeyvalueDropoutattention_probs_dropout_probdropoutsr_ratior=   srr?   r@   r(   configrC   rN   sequence_reduction_ratior)   r    r!   r'   e   s,   

z#GLPNEfficientSelfAttention.__init__Fc                 C   sh  |j \}}}| ||d| j| jdd}| jdkrE|j \}}	}
|ddd||
||}| 	|}|||
dddd}| 
|}| ||d| j| jdd}| ||d| j| jdd}t||dd}|t| j }tjj|dd}| |}t||}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr   r8   r   dimr   )r   rT   viewrN   rQ   rE   rZ   permutereshaper[   r@   rU   rV   r   matmulmathsqrtr   
functionalsoftmaxrY   
contiguoussizerR   )r(   r,   rI   rJ   output_attentions
batch_size
seq_lengthrH   query_layerseq_lenrB   	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr    r    r!   r-      s>   




z"GLPNEfficientSelfAttention.forwardFrK   r    r    r)   r!   rL   a   s
     rL   c                       s$   e Zd Z fddZdd Z  ZS )GLPNSelfOutputc                    s*   t    t||| _t|j| _d S r%   )r&   r'   r   rS   denserW   hidden_dropout_probrY   )r(   r]   rC   r)   r    r!   r'      s   
zGLPNSelfOutput.__init__c                 C   s   |  |}| |}|S r%   )r{   rY   )r(   r,   input_tensorr    r    r!   r-      s   

zGLPNSelfOutput.forwardr/   r0   r1   r'   r-   r6   r    r    r)   r!   rz      s    rz   c                       s.   e Zd Z fddZdd ZdddZ  ZS )	GLPNAttentionc                    s6   t    t||||d| _t||d| _t | _d S )N)r]   rC   rN   r^   )rC   )r&   r'   rL   r(   rz   r   setpruned_headsr\   r)   r    r!   r'      s   
zGLPNAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   ra   )lenr
   r(   rN   rQ   r   r   rT   rU   rV   r   r{   rR   union)r(   headsindexr    r    r!   prune_heads   s   zGLPNAttention.prune_headsFc                 C   s6   |  ||||}| |d |}|f|dd   }|S )Nr   r   )r(   r   )r(   r,   rI   rJ   rm   self_outputsattention_outputrx   r    r    r!   r-      s   zGLPNAttention.forwardry   )r/   r0   r1   r'   r   r-   r6   r    r    r)   r!   r      s    r   c                       &   e Zd Zd fdd	Zdd Z  ZS )
GLPNDWConv   c              	      s(   t    tj||dddd|d| _d S )Nr   r   T)biasgroups)r&   r'   r   r=   dwconv)r(   rb   r)   r    r!   r'      s   
zGLPNDWConv.__init__c                 C   sD   |j \}}}|dd||||}| |}|ddd}|S )Nr   r8   )r   rE   rc   r   rD   )r(   r,   rI   rJ   rn   rq   rB   r    r    r!   r-      s
   
zGLPNDWConv.forward)r   r~   r    r    r)   r!   r      s    r   c                       r   )
GLPNMixFFNNc                    sl   t    |p|}t||| _t|| _t|jt	r"t
|j | _n|j| _t||| _t|j| _d S r%   )r&   r'   r   rS   dense1r   r   
isinstance
hidden_actr5   r   intermediate_act_fndense2rW   r|   rY   )r(   r]   in_featureshidden_featuresout_featuresr)   r    r!   r'      s   

zGLPNMixFFN.__init__c                 C   sD   |  |}| |||}| |}| |}| |}| |}|S r%   )r   r   r   rY   r   )r(   r,   rI   rJ   r    r    r!   r-     s   




zGLPNMixFFN.forward)NNr~   r    r    r)   r!   r      s    r   c                       s*   e Zd ZdZ fddZdddZ  ZS )	GLPNLayerzCThis corresponds to the Block class in the original implementation.c                    sn   t    t|| _t||||d| _|dkrt|nt | _	t|| _
t|| }t|||d| _d S )N)rC   rN   r^   r   )r   r   )r&   r'   r   r?   layer_norm_1r   	attentionr#   Identityr"   layer_norm_2rP   r   mlp)r(   r]   rC   rN   r"   r^   	mlp_ratiomlp_hidden_sizer)   r    r!   r'     s   
zGLPNLayer.__init__Fc           
      C   sr   | j | ||||d}|d }|dd  }| |}|| }| | |||}| |}|| }	|	f| }|S )N)rm   r   r   )r   r   r"   r   r   )
r(   r,   rI   rJ   rm   self_attention_outputsr   rx   
mlp_outputlayer_outputr    r    r!   r-   '  s   


zGLPNLayer.forwardry   rK   r    r    r)   r!   r     s    r   c                       s,   e Zd Z fddZ			dddZ  ZS )GLPNEncoderc           	         sX  t     | _dd tjd jt jddD }g }t j	D ]"}|
t j|  j| |dkr5 jn j|d   j| d q!t|| _g }d}t j	D ]@}g }|dkrd| j|d  7 }t j| D ]}|
t  j|  j| |||   j|  j| d qk|
t| qSt|| _t fd	dt j	D | _d S )
Nc                 S   s   g | ]}|  qS r    )item).0xr    r    r!   
<listcomp>G  s    z(GLPNEncoder.__init__.<locals>.<listcomp>r   cpu)r   r   )rA   r;   rB   rC   )rC   rN   r"   r^   r   c                    s   g | ]
}t  j| qS r    )r   r?   hidden_sizes)r   ir]   r    r!   r   o  s    )r&   r'   r]   r   linspacedrop_path_ratesumdepthsrangenum_encoder_blocksappendr7   patch_sizesstridesrB   r   r   
ModuleListpatch_embeddingsr   rN   	sr_ratios
mlp_ratiosblockr@   )	r(   r]   dprrG   r   blockscurlayersjr)   r   r!   r'   B  sH   
$


zGLPNEncoder.__init__FTc                 C   s   |rdnd }|r
dnd }|j d }|}tt| j| j| jD ]H\}	}
|
\}}}||\}}}t|D ]\}}|||||}|d }|rJ||d f }q2||}||||ddddd }|rf||f }q|sut	dd |||fD S t
|||d	S )
Nr    r   r   r_   r   r8   c                 s   s    | ]	}|d ur|V  qd S r%   r    )r   vr    r    r!   	<genexpr>  s    z&GLPNEncoder.forward.<locals>.<genexpr>last_hidden_stater,   
attentions)r   	enumeratezipr   r   r@   re   rd   rk   tupler   )r(   rF   rm   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsrn   r,   idxr   embedding_layerblock_layer
norm_layerrI   rJ   r   blklayer_outputsr    r    r!   r-   r  s2   

 
zGLPNEncoder.forward)FFTr~   r    r    r)   r!   r   A  s    3r   c                   @   s*   e Zd ZU eed< dZdZg Zdd ZdS )GLPNPreTrainedModelr]   glpnrF   c                 C   s   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjrF|jjjd| jjd |jdurD|jj|j 
  dS dS t |tjtjfr^|j	j
  |jjd dS dS )zInitialize the weightsr   )meanstdNg      ?)r   r   rS   r=   weightdatanormal_r]   initializer_ranger   zero_	Embeddingpadding_idxr?   BatchNorm2dfill_)r(   moduler    r    r!   _init_weights  s   

z!GLPNPreTrainedModel._init_weightsN)	r/   r0   r1   r   __annotations__base_model_prefixmain_input_name_no_split_modulesr   r    r    r    r!   r     s   
 r   c                       sd   e Zd Z fddZdd Ze			ddejdee	 dee	 d	ee	 d
e
eef f
ddZ  ZS )	GLPNModelc                    s(   t  | || _t|| _|   d S r%   )r&   r'   r]   r   encoder	post_initr(   r]   r)   r    r!   r'     s   
zGLPNModel.__init__c                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   layerr   r   )r(   heads_to_pruner   r   r    r    r!   _prune_heads  s   zGLPNModel._prune_headsNrF   rm   r   r   r   c                 C   s~   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||||d}|d }|s6|f|dd   S t||j|jdS )Nrm   r   r   r   r   r   )r]   rm   r   use_return_dictr   r   r,   r   )r(   rF   rm   r   r   encoder_outputssequence_outputr    r    r!   r-     s$   	zGLPNModel.forward)NNN)r/   r0   r1   r'   r   r   r   FloatTensorr   boolr   r   r   r-   r6   r    r    r)   r!   r     s$    

r   c                       *   e Zd ZdZd fdd	Zdd Z  ZS )GLPNSelectiveFeatureFusionz
    Selective Feature Fusion module, as explained in the [paper](https://huggingface.co/papers/2201.07436) (section 3.4). This
    module adaptively selects and integrates local and global features by attaining an attention map for each feature.
    @   c              	      s   t    ttjt|d |ddddt|t | _ttj|t|d ddddtt|d t | _	tjt|d ddddd| _
t | _d S )Nr8   r   r   )in_channelsout_channelsr:   r;   r<   )r&   r'   r   
Sequentialr=   rP   r   ReLUconvolutional_layer1convolutional_layer2convolutional_layer3Sigmoidsigmoid)r(   
in_channelr)   r    r!   r'     s   
z#GLPNSelectiveFeatureFusion.__init__c                 C   s   t j||fdd}| |}| |}| |}| |}||d d dd d d d f d ||d d dd d d d f d  }|S )Nr   ra   r   )r   catr   r   r   r   	unsqueeze)r(   local_featuresglobal_featuresfeaturesattnhybrid_featuresr    r    r!   r-     s   



(z"GLPNSelectiveFeatureFusion.forward)r   rK   r    r    r)   r!   r     s    r   c                       s&   e Zd Z fddZdddZ  ZS )GLPNDecoderStagec                    sP   t    ||k}|stj||ddnt | _t|| _tjdddd| _	d S )Nr   )r:   r8   bilinearFscale_factormodealign_corners)
r&   r'   r   r=   r   convolutionr   fusionUpsampleupsample)r(   r   r   should_skipr)   r    r!   r'     s
   

zGLPNDecoderStage.__init__Nc                 C   s,   |  |}|d ur| ||}| |}|S r%   )r  r  r  )r(   hidden_stateresidualr    r    r!   r-     s
   

zGLPNDecoderStage.forwardr%   r~   r    r    r)   r!   r    s    r  c                       s:   e Zd Z fddZdeej deej fddZ  ZS )GLPNDecoderc                    s\   t    |jd d d }|j t fdd|D | _d | jd _tjdddd| _	d S )	Nr_   c                    s   g | ]}t | qS r    )r  )r   rC   r   r    r!   r   0  s    z(GLPNDecoder.__init__.<locals>.<listcomp>r   r8   r  Fr  )
r&   r'   r   decoder_hidden_sizer   r   stagesr  r  final_upsample)r(   r]   reserved_hidden_sizesr)   r  r!   r'   )  s   
zGLPNDecoder.__init__r,   r   c                 C   sN   g }d }t |d d d | jD ]\}}|||}|| q| ||d< |S )Nr_   )r   r  r   r  )r(   r,   stage_hidden_statesstage_hidden_stater  stager    r    r!   r-   7  s   
zGLPNDecoder.forward	r/   r0   r1   r'   listr   r4   r-   r6   r    r    r)   r!   r  (  s    &r  c                       r   )	SiLogLossz
    Implements the Scale-invariant log scale loss [Eigen et al., 2014](https://huggingface.co/papers/1406.2283).

    $$L=\frac{1}{n} \sum_{i} d_{i}^{2}-\frac{1}{2 n^{2}}\left(\sum_{i} d_{i}^{2}\right)$$ where $d_{i}=\log y_{i}-\log
    y_{i}^{*}$.

          ?c                    r$   r%   )r&   r'   lambd)r(   r   r)   r    r!   r'   L  r+   zSiLogLoss.__init__c                 C   sX   |dk  }t|| t||  }tt|d | jt| d  }|S )Nr   r8   )detachr   logrh   powr   r   )r(   predtarget
valid_maskdiff_loglossr    r    r!   r-   P  s   ,zSiLogLoss.forward)r  rK   r    r    r)   r!   r  C  s    r  c                       s6   e Zd Z fddZdeej dejfddZ  ZS )GLPNDepthEstimationHeadc                    sR   t    || _|j}ttj||ddddtjddtj|ddddd| _d S )Nr   r   r9   F)inplace)	r&   r'   r]   r  r   r   r=   r   head)r(   r]   channelsr)   r    r!   r'   Y  s   


z GLPNDepthEstimationHead.__init__r,   r   c                 C   s8   || j j }| |}t|| j j }|jdd}|S )Nr   ra   )r]   head_in_indexr+  r   r   	max_depthsqueeze)r(   r,   predicted_depthr    r    r!   r-   e  s
   
zGLPNDepthEstimationHead.forwardr  r    r    r)   r!   r)  X  s    "r)  zg
    GLPN Model transformer with a lightweight depth estimation head on top e.g. for KITTI, NYUv2.
    )custom_introc                       sn   e Zd Z fddZe				ddejdeej dee dee dee d	e	e
ej ef fd
dZ  ZS )GLPNForDepthEstimationc                    s6   t  | t|| _t|| _t|| _|   d S r%   )	r&   r'   r   r   r  decoderr)  r+  r   r   r)   r    r!   r'   w  s
   


zGLPNForDepthEstimation.__init__NrF   labelsrm   r   r   r   c                 C   s   |dur|n| j j}|dur|n| j j}| j||d|d}|r"|jn|d }| |}| |}	d}
|dur>t }||	|}
|s`|rL|	f|dd  }n	|	f|dd  }|
dur^|
f| S |S t|
|	|rh|jnd|j	dS )a  
        labels (`torch.FloatTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, GLPNForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti")
        >>> model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NTr   r   r8   )r(  r0  r,   r   )
r]   r   r   r   r,   r3  r+  r  r   r   )r(   rF   r4  rm   r   r   rx   r,   outr0  r(  loss_fctr   r    r    r!   r-     s6   .


zGLPNForDepthEstimation.forward)NNNN)r/   r0   r1   r'   r   r   r   r   r   r   r   r4   r   r-   r6   r    r    r)   r!   r2  q  s(    
r2  )r2  r   r   r   )r   F)0r2   rg   typingr   r   r   r   activationsr   modeling_outputsr   r   modeling_utilsr	   pytorch_utilsr
   r   utilsr   r   configuration_glpnr   
get_loggerr/   loggerr4   r3   r   r"   Moduler#   r7   rL   rz   r   r   r   r   r   r   r   r   r  r  r  r)  r2  __all__r    r    r    r!   <module>   sH   
 Z'+X5,^