o
    iz)                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
mZ dd	lmZ d
dlmZ eeddG dd de
ZeG dd deZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZeddG dd deZddgZdS )zPyTorch ViTMatte model.    )	dataclass)OptionalN)nn   )PreTrainedModel)ModelOutputauto_docstring)load_backbone   )VitMatteConfigz4
    Class for outputs of image matting models.
    )custom_introc                   @   sb   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dS )ImageMattingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Loss.
    alphas (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Estimated alpha values.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
        (also called feature maps) of the model at the output of each stage.
    Nlossalphashidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   tupler    r   r   k/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/vitmatte/modeling_vitmatte.pyr      s   
 r   c                   @   s2   e Zd ZU eed< dZdZg Zdej	fddZ
dS )VitMattePreTrainedModelconfigpixel_valuesTmodulec                 C   sJ   t |tjtjfr!|jjjd| jjd |j	d ur#|j	j
  d S d S d S )Ng        )meanstd)
isinstancer   Conv2dBatchNorm2dweightdatanormal_r   initializer_rangebiaszero_)selfr   r   r   r   _init_weights<   s   
z%VitMattePreTrainedModel._init_weightsN)r   r   r   r   r   main_input_namesupports_gradient_checkpointing_no_split_modulesr   Moduler,   r   r   r   r   r   5   s   
 r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	VitMatteBasicConv3x3zP
    Basic convolution layers including: Conv3x3, BatchNorm2d, ReLU layers.
       r
   c                    sB   t    tj||d||dd| _tj||jd| _t | _	d S )Nr   F)in_channelsout_channelskernel_sizestridepaddingr)   )eps)
super__init__r   r#   convr$   batch_norm_eps
batch_normReLUrelu)r+   r   r3   r4   r6   r7   	__class__r   r   r:   H   s   
zVitMatteBasicConv3x3.__init__c                 C   s"   |  |}| |}| |}|S N)r;   r=   r?   r+   hidden_stater   r   r   forwardU   s   


zVitMatteBasicConv3x3.forward)r2   r
   r   r   r   r   r:   rE   __classcell__r   r   r@   r   r1   C   s    r1   c                       (   e Zd ZdZ fddZdd Z  ZS )VitMatteConvStreamzc
    Simple ConvStream containing a series of basic conv3x3 layers to extract detail features.
    c                    s   t    d}|jd ur|jj}|j}t | _|g| | _t	t
| jd D ]}| j| }| j|d  }| jt||| q'd S )N   r
   )r9   r:   backbone_confignum_channelsconvstream_hidden_sizesr   
ModuleListconvs
conv_chansrangelenappendr1   )r+   r   r3   r4   iin_chan_	out_chan_r@   r   r   r:   b   s   



zVitMatteConvStream.__init__c                 C   sJ   d|i}|}t t| jD ]}| j| |}dt|d  }|||< q|S )Ndetailed_feature_map_0detailed_feature_map_r
   )rQ   rR   rO   str)r+   r   out_dict
embeddingsrT   name_r   r   r   rE   u   s   
zVitMatteConvStream.forwardrF   r   r   r@   r   rI   ]   s    rI   c                       rH   )VitMatteFusionBlockz\
    Simple fusion block to fuse features from ConvStream and Plain Vision Transformer.
    c                    s"   t    t|||ddd| _d S )Nr
   )r6   r7   )r9   r:   r1   r;   )r+   r   r3   r4   r@   r   r   r:      s   
zVitMatteFusionBlock.__init__c                 C   s4   t jj|dddd}tj||gdd}| |}|S )Nr2   bilinearF)scale_factormodealign_cornersr
   )dim)r   
functionalinterpolater   catr;   )r+   featuresdetailed_feature_mapupscaled_featuresoutr   r   r   rE      s   
zVitMatteFusionBlock.forwardrF   r   r   r@   r   r]      s    r]   c                       rH   )VitMatteHeadzJ
    Simple Matting Head, containing only conv3x3 and conv1x1 layers.
    c                    sZ   t    |jd }d}ttj||ddddt|tdtj|ddddd| _d S )N   r   r
   )r5   r6   r7   Tr   )	r9   r:   fusion_hidden_sizesr   
Sequentialr#   r$   r>   matting_convs)r+   r   r3   mid_channelsr@   r   r   r:      s   


zVitMatteHead.__init__c                 C   s   |  |}|S rB   )ro   rC   r   r   r   rE      s   
zVitMatteHead.forwardrF   r   r   r@   r   rj      s    rj   c                       rH   )VitMatteDetailCaptureModulezG
    Simple and lightweight Detail Capture Module for ViT Matting.
    c              	      s   t    t|jt|jd krtd|| _t|| _| jj	| _	t
 | _|jg|j | _tt| jd D ]}| jt|| j| | j	|d    | j|d  d q8t|| _d S )Nr
   z_The length of fusion_hidden_sizes should be equal to the length of convstream_hidden_sizes + 1.)r   r3   r4   )r9   r:   rR   rm   rM   
ValueErrorr   rI   
convstreamrP   r   rN   fusion_blockshidden_sizefusion_channelsrQ   rS   r]   rj   matting_head)r+   r   rT   r@   r   r   r:      s&   



z$VitMatteDetailCaptureModule.__init__c                 C   s`   |  |}tt| jD ]}dtt| j| d  }| j| ||| }qt| |}|S )NrX   r
   )rs   rQ   rR   rt   rY   r   sigmoidrw   )r+   rf   r   detail_featuresrT   detailed_feature_map_namer   r   r   r   rE      s   
z#VitMatteDetailCaptureModule.forwardrF   r   r   r@   r   rq      s    rq   zX
    ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes.
    c                       sb   e Zd Z fddZe					ddeej dee dee deej dee f
d	d
Z	  Z
S )VitMatteForImageMattingc                    s2   t  | || _t|| _t|| _|   d S rB   )r9   r:   r   r	   backbonerq   decoder	post_init)r+   r   r@   r   r   r:      s
   

z VitMatteForImageMatting.__init__Nr   output_attentionsoutput_hidden_stateslabelsreturn_dictc                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}d}|dur(td| jj|||d}|jd }| ||}	|sR|	f|dd  }
|durP|f|
 S |
S t	||	|j
|jdS )a8  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth image matting for computing the loss.

        Examples:

        ```python
        >>> from transformers import VitMatteImageProcessor, VitMatteForImageMatting
        >>> import torch
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
        >>> model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")
        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
        ... )
        >>> trimap = Image.open(filepath).convert("L")

        >>> # prepare image + trimap for the model
        >>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")

        >>> with torch.no_grad():
        ...     alphas = model(**inputs).alphas
        >>> print(alphas.shape)
        torch.Size([1, 1, 640, 960])
        ```NzTraining is not yet supported)r   r   rk   r
   )r   r   r   r   )r   use_return_dictr   r   NotImplementedErrorr|   forward_with_filtered_kwargsfeature_mapsr}   r   r   r   )r+   r   r   r   r   r   r   outputsrf   r   outputr   r   r   rE      s*   )
zVitMatteForImageMatting.forward)NNNNN)r   r   r   r:   r   r   r   TensorboolrE   rG   r   r   r@   r   r{      s&    
r{   )r   dataclassesr   typingr   r   r   modeling_utilsr   utilsr   r   utils.backbone_utilsr	   configuration_vitmatter   r   r   r0   r1   rI   r]   rj   rq   r{   __all__r   r   r   r   <module>   s4   #)Q