o
    eiy*                     @   s  d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZmZ d
dlmZ eeddG dd deZeG dd deZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZeddG dd deZddgZdS )zPyTorch ViTMatte model.    )	dataclassN)nn   )initialization)load_backbone)PreTrainedModel)ModelOutputauto_docstring   )VitMatteConfigz4
    Class for outputs of image matting models.
    )custom_introc                   @   sb   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ej dB ed< dZe
ej dB ed< dS )ImageMattingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Loss.
    alphas (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Estimated alpha values.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
        (also called feature maps) of the model at the output of each stage.
    Nlossalphashidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   tupler    r   r   l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/vitmatte/modeling_vitmatte.pyr      s   
 r   c                   @   s>   e Zd ZU eed< dZdZdZg Ze	
 dejfddZdS )	VitMattePreTrainedModelconfigpixel_values)imageTmodulec                 C   s~   t |tjtjfr;tj|jd| jjd |j	d urt
|j	 t|dd d ur=t
|j t|j t
|j d S d S d S )Ng        )meanstdrunning_mean)
isinstancer   Conv2dBatchNorm2dinitnormal_weightr   initializer_rangebiaszeros_getattrr#   ones_running_varnum_batches_tracked)selfr    r   r   r   _init_weights<   s   
z%VitMattePreTrainedModel._init_weightsN)r   r   r   r   r   main_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr   no_gradr   Moduler2   r   r   r   r   r   4   s   
 r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	VitMatteBasicConv3x3zP
    Basic convolution layers including: Conv3x3, BatchNorm2d, ReLU layers.
       r
   c                    sB   t    tj||d||dd| _tj||jd| _t | _	d S )Nr   F)in_channelsout_channelskernel_sizestridepaddingr+   )eps)
super__init__r   r%   convr&   batch_norm_eps
batch_normReLUrelu)r1   r   r;   r<   r>   r?   	__class__r   r   rB   M   s   
zVitMatteBasicConv3x3.__init__c                 C   s"   |  |}| |}| |}|S N)rC   rE   rG   r1   hidden_stater   r   r   forwardZ   s   


zVitMatteBasicConv3x3.forward)r:   r
   r   r   r   r   rB   rM   __classcell__r   r   rH   r   r9   H   s    r9   c                       (   e Zd ZdZ fddZdd Z  ZS )VitMatteConvStreamzc
    Simple ConvStream containing a series of basic conv3x3 layers to extract detail features.
    c                    s   t    d}|jd ur|jj}|j}t | _|g| | _t	t
| jd D ]}| j| }| j|d  }| jt||| q'd S )N   r
   )rA   rB   backbone_confignum_channelsconvstream_hidden_sizesr   
ModuleListconvs
conv_chansrangelenappendr9   )r1   r   r;   r<   iin_chan_	out_chan_rH   r   r   rB   g   s   



zVitMatteConvStream.__init__c                 C   sJ   d|i}|}t t| jD ]}| j| |}dt|d  }|||< q|S )Ndetailed_feature_map_0detailed_feature_map_r
   )rY   rZ   rW   str)r1   r   out_dict
embeddingsr\   name_r   r   r   rM   z   s   
zVitMatteConvStream.forwardrN   r   r   rH   r   rQ   b   s    rQ   c                       rP   )VitMatteFusionBlockz\
    Simple fusion block to fuse features from ConvStream and Plain Vision Transformer.
    c                    s"   t    t|||ddd| _d S )Nr
   )r>   r?   )rA   rB   r9   rC   )r1   r   r;   r<   rH   r   r   rB      s   
zVitMatteFusionBlock.__init__c                 C   s4   t jj|dddd}tj||gdd}| |}|S )Nr:   bilinearF)scale_factormodealign_cornersr
   )dim)r   
functionalinterpolater   catrC   )r1   featuresdetailed_feature_mapupscaled_featuresoutr   r   r   rM      s   
zVitMatteFusionBlock.forwardrN   r   r   rH   r   re      s    re   c                       rP   )VitMatteHeadzJ
    Simple Matting Head, containing only conv3x3 and conv1x1 layers.
    c                    sZ   t    |jd }d}ttj||ddddt|tdtj|ddddd| _d S )N   r   r
   )r=   r>   r?   Tr   )	rA   rB   fusion_hidden_sizesr   
Sequentialr%   r&   rF   matting_convs)r1   r   r;   mid_channelsrH   r   r   rB      s   


zVitMatteHead.__init__c                 C   s   |  |}|S rJ   )rw   rK   r   r   r   rM      s   
zVitMatteHead.forwardrN   r   r   rH   r   rr      s    rr   c                       rP   )VitMatteDetailCaptureModulezG
    Simple and lightweight Detail Capture Module for ViT Matting.
    c              	      s   t    t|jt|jd krtd|| _t|| _| jj	| _	t
 | _|jg|j | _tt| jd D ]}| jt|| j| | j	|d    | j|d  d q8t|| _d S )Nr
   z_The length of fusion_hidden_sizes should be equal to the length of convstream_hidden_sizes + 1.)r   r;   r<   )rA   rB   rZ   ru   rU   
ValueErrorr   rQ   
convstreamrX   r   rV   fusion_blockshidden_sizefusion_channelsrY   r[   re   rr   matting_head)r1   r   r\   rH   r   r   rB      s&   



z$VitMatteDetailCaptureModule.__init__c                 C   s`   |  |}tt| jD ]}dtt| j| d  }| j| ||| }qt| |}|S )Nr`   r
   )r{   rY   rZ   r|   ra   r   sigmoidr   )r1   rn   r   detail_featuresr\   detailed_feature_map_namer   r   r   r   rM      s   
z#VitMatteDetailCaptureModule.forwardrN   r   r   rH   r   ry      s    ry   zX
    ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes.
    c                       sb   e Zd Z fddZe					ddejdB dedB dedB dejdB dedB f
d	d
Z  Z	S )VitMatteForImageMattingc                    s2   t  | || _t|| _t|| _|   d S rJ   )rA   rB   r   r   backbonery   decoder	post_init)r1   r   rH   r   r   rB      s
   

z VitMatteForImageMatting.__init__Nr   output_attentionsoutput_hidden_stateslabelsreturn_dictc                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}d}|dur(td| jj|||d}|jd }	| |	|}
|sR|
f|dd  }|durP|f| S |S t	||
|j
|jdS )a8  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth image matting for computing the loss.

        Examples:

        ```python
        >>> from transformers import VitMatteImageProcessor, VitMatteForImageMatting
        >>> import torch
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
        >>> model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")
        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
        ... )
        >>> trimap = Image.open(filepath).convert("L")

        >>> # prepare image + trimap for the model
        >>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")

        >>> with torch.no_grad():
        ...     alphas = model(**inputs).alphas
        >>> print(alphas.shape)
        torch.Size([1, 1, 640, 960])
        ```NzTraining is not yet supported)r   r   rs   r
   )r   r   r   r   )r   use_return_dictr   r   NotImplementedErrorr   forward_with_filtered_kwargsfeature_mapsr   r   r   r   )r1   r   r   r   r   r   kwargsr   outputsrn   r   outputr   r   r   rM      s*   *
zVitMatteForImageMatting.forward)NNNNN)
r   r   r   rB   r	   r   TensorboolrM   rO   r   r   rH   r   r      s&    
r   )r   dataclassesr   r   r    r   r'   backbone_utilsr   modeling_utilsr   utilsr   r	   configuration_vitmatter   r   r   r8   r9   rQ   re   rr   ry   r   __all__r   r   r   r   <module>   s4   #)R