o
    i8                     @   s  d Z ddlmZmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ G dd dejZG dd dejZG dd dejZG dd dejZG dd dejZeG dd deZeddG dd deZddgZdS )zrPyTorch UperNet model. Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.    )OptionalUnionN)nn)CrossEntropyLoss   )SemanticSegmenterOutput)PreTrainedModel)auto_docstring)load_backbone   )UperNetConfigc                       s   e Zd ZdZ			ddededeeeeef f deeeeef ef d	ed
eeeeef f ddf fddZ	de
jde
jfddZ  ZS )UperNetConvModulez
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
    r   Fr   in_channelsout_channelskernel_sizepaddingbiasdilationreturnNc                    s<   t    tj||||||d| _t|| _t | _d S )N)r   r   r   r   r   r   )	super__init__r   Conv2dconvBatchNorm2d
batch_normReLU
activation)selfr   r   r   r   r   r   	__class__ i/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/upernet/modeling_upernet.pyr   $   s   
	zUperNetConvModule.__init__inputc                 C   s"   |  |}| |}| |}|S N)r   r   r   )r   r"   outputr    r    r!   forward9   s   


zUperNetConvModule.forward)r   Fr   )__name__
__module____qualname____doc__intr   tuplestrboolr   torchTensorr%   __classcell__r    r    r   r!   r      s*    
r   c                       sD   e Zd Zdedededdf fddZdejdejfd	d
Z  ZS )UperNetPyramidPoolingBlock
pool_scaler   channelsr   Nc                    sL   t    t|t||ddg| _t| jD ]\}}| t|| qd S )Nr   r   )	r   r   r   AdaptiveAvgPool2dr   layers	enumerate
add_moduler,   )r   r2   r   r3   ilayerr   r    r!   r   B   s   
z#UperNetPyramidPoolingBlock.__init__r"   c                 C   s   |}| j D ]}||}q|S r#   )r6   )r   r"   hidden_stater:   r    r    r!   r%   K   s   

z"UperNetPyramidPoolingBlock.forward)	r&   r'   r(   r*   r   r.   r/   r%   r0   r    r    r   r!   r1   A   s    	r1   c                
       sX   e Zd ZdZdeedf dedededdf
 fd	d
Zdej	de
ej	 fddZ  ZS )UperNetPyramidPoolingModulea}  
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (`tuple[int]`):
            Pooling scales used in Pooling Pyramid Module.
        in_channels (`int`):
            Input channels.
        channels (`int`):
            Channels after modules, before conv_seg.
        align_corners (`bool`):
            align_corners argument of F.interpolate.
    pool_scales.r   r3   align_cornersr   Nc                    sh   t    || _|| _|| _|| _g | _t|D ]\}}t|||d}| j	| | 
t|| qd S )N)r2   r   r3   )r   r   r=   r>   r   r3   blocksr7   r1   appendr8   r,   )r   r=   r   r3   r>   r9   r2   blockr   r    r!   r   a   s   
z$UperNetPyramidPoolingModule.__init__xc                 C   sH   g }| j D ]}||}tjj|| dd  d| jd}|| q|S )N   bilinearsizemoder>   )r?   r   
functionalinterpolaterF   r>   r@   )r   rB   ppm_outsppmppm_outupsampled_ppm_outr    r    r!   r%   m   s   
z#UperNetPyramidPoolingModule.forward)r&   r'   r(   r)   r+   r*   r-   r   r.   r/   listr%   r0   r    r    r   r!   r<   R   s    *"r<   c                       s>   e Zd ZdZ fddZdd Zdejdejfdd	Z  Z	S )
UperNetHeadz
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://huggingface.co/papers/1807.10221).
    c                    s  t    || _|j| _|| _|j| _d| _tj	| j|j
dd| _t| j| jd | j| jd| _t| jd t| j| j  | jddd| _t | _t | _| jd d D ] }t|| jdd}t| j| jddd}| j| | j| qVtt| j| j | jddd| _d S )NFr   r4   )r>   r   r   r   )r   r   configr=   r   hidden_sizer3   r>   r   r   
num_labels
classifierr<   psp_modulesr   len
bottleneck
ModuleListlateral_convs	fpn_convsr@   fpn_bottleneck)r   rR   r   l_convfpn_convr   r    r!   r   ~   s@   


zUperNetHead.__init__c                 C   s:   |d }|g}| | | tj|dd}| |}|S )NrP   r   dim)extendrV   r.   catrX   )r   inputsrB   psp_outsr$   r    r    r!   psp_forward   s   
zUperNetHead.psp_forwardencoder_hidden_statesr   c                    s   fddt jD   t}t|d ddD ]$}|d  jdd  }|d  tjj	| |dj
d |d < q fd	dt|d D }|d  t|d ddD ]}tjj	|| |d jdd  dj
d||< qbtj|dd
}|}|}|S )Nc                    s   g | ]
\}}| | qS r    r    ).0r9   lateral_conv)rf   r    r!   
<listcomp>   s    z'UperNetHead.forward.<locals>.<listcomp>r   r   rP   rC   rD   rE   c                    s   g | ]}j |  | qS r    )r[   )rg   r9   )lateralsr   r    r!   ri      s    r_   )r7   rZ   r@   re   rW   rangeshaper   rH   rI   r>   r.   rb   r\   rU   )r   rf   used_backbone_levelsr9   
prev_shapefpn_outsr$   r    )rf   rj   r   r!   r%      s$   

zUperNetHead.forward)
r&   r'   r(   r)   r   re   r.   r/   r%   r0   r    r    r   r!   rO   x   s
    '	rO   c                       s\   e Zd ZdZ	ddededeeeeef f dd	f fd
dZdej	dej	fddZ
  ZS )UperNetFCNHeada  
    Fully Convolution Networks for Semantic Segmentation. This head is the implementation of
    [FCNNet](https://huggingface.co/papers/1411.4038>).

    Args:
        config:
            Configuration.
        in_channels (int):
            Number of input channels.
        kernel_size (int):
            The kernel size for convs in the head. Default: 3.
        dilation (int):
            The dilation rate for convs in the head. Default: 1.
    rC   r   r   in_indexr   r   r   Nc           	   
      s  t    || _|jd u r|| n|j| _|j| _|j| _|j	| _
|| _|d | }g }|t| j| j|||d t| jd D ]}|t| j| j|||d qA| jdkr]t | _ntj| | _| j
rvt| j| j | j||d d| _tj| j|jdd| _d S )NrC   )r   r   r   r   r   rQ   r4   )r   r   rR   auxiliary_in_channelsr   auxiliary_channelsr3   auxiliary_num_convs	num_convsauxiliary_concat_inputconcat_inputrq   r@   r   rk   r   Identityconvs
Sequentialconv_catr   rT   rU   )	r   rR   r   rq   r   r   conv_paddingry   r9   r   r    r!   r      s:   

zUperNetFCNHead.__init__rf   c                 C   s@   || j  }| |}| jr| tj||gdd}| |}|S )Nr   r_   )rq   ry   rw   r{   r.   rb   rU   )r   rf   hidden_statesr$   r    r    r!   r%     s   


zUperNetFCNHead.forward)rC   r   r   )r&   r'   r(   r)   r*   r   r+   r   r.   r/   r%   r0   r    r    r   r!   rp      s    &rp   c                   @   s&   e Zd ZU eed< dZg Zdd ZdS )UperNetPreTrainedModelrR   pixel_valuesc                 C   sn   t |tjr |jjjd| jjd |jd ur|jj	  d S d S t |tj
r5|jjd |jj	  d S d S )Ng        )meanstdg      ?)
isinstancer   r   weightdatanormal_rR   initializer_ranger   zero_r   fill_)r   moduler    r    r!   _init_weights  s   
z$UperNetPreTrainedModel._init_weightsN)r&   r'   r(   r   __annotations__main_input_name_no_split_modulesr   r    r    r    r!   r~     s
   
 r~   zW
    UperNet framework leveraging any vision backbone e.g. for ADE20k, CityScapes.
    )custom_introc                       sn   e Zd Z fddZe					ddeej dee dee deej dee d	e	e
ef fd
dZ  ZS )UperNetForSemanticSegmentationc                    sP   t  | t|| _t|| jjd| _|jrt|| jjdnd | _	| 
  d S )N)r   )r   r   r
   backbonerO   r3   decode_headuse_auxiliary_headrp   auxiliary_head	post_init)r   rR   r   r    r!   r   "  s   
z'UperNetForSemanticSegmentation.__init__Nr   output_attentionsoutput_hidden_stateslabelsreturn_dictr   c                 C   sl  |dur| j jdkrtd|dur|n| j j}|dur|n| j j}|dur(|n| j j}| jj|||d}|j}| 	|}t
jj||jdd ddd}d}	| jdurg| |}	t
jj|	|jdd ddd}	d}
|durt| j jd	}|||}
|	dur||	|}|
| j j| 7 }
|s|r|f|dd  }n	|f|dd  }|
dur|
f| S |S t|
||j|jd
S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, UperNetForSemanticSegmentation
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-tiny")
        >>> model = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-tiny")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/fixtures_ade20k", filename="ADE_val_00000001.jpg", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits  # shape (batch_size, num_labels, height, width)
        >>> list(logits.shape)
        [1, 150, 512, 512]
        ```Nr   z/The number of labels should be greater than one)r   r   rC   rD   FrE   )ignore_index)losslogitsr}   
attentions)rR   rT   
ValueErroruse_return_dictr   r   r   forward_with_filtered_kwargsfeature_mapsr   r   rH   rI   rl   r   r   loss_ignore_indexauxiliary_loss_weightr   r}   r   )r   r   r   r   r   r   outputsfeaturesr   auxiliary_logitsr   loss_fctauxiliary_lossr$   r    r    r!   r%   0  sH   $




z&UperNetForSemanticSegmentation.forward)NNNNN)r&   r'   r(   r   r	   r   r.   r/   r-   r   r+   r   r%   r0   r    r    r   r!   r     s*    
r   )r)   typingr   r   r.   r   torch.nnr   modeling_outputsr   modeling_utilsr   utilsr	   utils.backbone_utilsr
   configuration_upernetr   Moduler   r1   r<   rO   rp   r~   r   __all__r    r    r    r!   <module>   s,   #&T@c