o
    oiT8                     @  s  d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlZddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZm Z  G dd deZ!eG dd dZ"G dd dee" Z#d ddZ$dS )!aP  Based from the original code from Meta Platforms, Inc. and affiliates.

https://github.com/facebookresearch/segment-
anything/blob/3518c86b78b3bc9cf4fbe3d18e682fad1c79dc51/segment_anything/build_sam.py

https://github.com/facebookresearch/segment-
anything/blob/3518c86b78b3bc9cf4fbe3d18e682fad1c79dc51/segment_anything/modeling/sam.py
    )annotationsN)	dataclass)Enum)AnyOptional)SegmentationResults)	ModelBase)	LayerNorm)ImageEncoderViT)MaskDecoder)PromptEncoder)TwoWayTransformer)TinyViT)Tensor)KORNIA_CHECKKORNIA_CHECK_SHAPEc                   @  s    e Zd ZdZdZdZdZdZdS )SamModelTypezMap the SAM model types.r            N)__name__
__module____qualname____doc__vit_hvit_lvit_b
mobile_sam r   r   S/home/ubuntu/.local/lib/python3.10/site-packages/kornia/contrib/models/sam/model.pyr   0   s    r   c                   @  sf   e Zd ZU dZdZded< dZded< dZded	< dZd
ed< dZ	d
ed< dZ
d
ed< dZded< dS )	SamConfiga  Encapsulate the Config to build a SAM model.

    Args:
        model_type: the available models are:

            - 0, 'vit_h' or :func:`kornia.contrib.sam.SamModelType.vit_h`
            - 1, 'vit_l' or :func:`kornia.contrib.sam.SamModelType.vit_l`
            - 2, 'vit_b' or :func:`kornia.contrib.sam.SamModelType.vit_b`
            - 3, 'mobile_sam', or :func:`kornia.contrib.sam.SamModelType.mobile_sam`

        checkpoint: URL or a path for a file with the weights of the model
        encoder_embed_dim: Patch embedding dimension.
        encoder_depth: Depth of ViT.
        encoder_num_heads: Number of attention heads in each ViT block.
        encoder_global_attn_indexes: Encoder indexes for blocks using global attention.

    Nz"Optional[str | int | SamModelType]
model_typezOptional[str]
checkpointFbool
pretrainedzOptional[int]encoder_embed_dimencoder_depthencoder_num_headszOptional[tuple[int, ...]]encoder_global_attn_indexes)r   r   r   r   r!   __annotations__r"   r$   r%   r&   r'   r(   r   r   r   r   r    9   s   
 r    c                      sZ   e Zd ZU dZded< d fddZed ddZed!ddZe	
 d"ddZ  ZS )#Samg        floatmask_thresholdimage_encoderImageEncoderViT | TinyViTprompt_encoderr   mask_decoderr   returnNonec                   s    t    || _|| _|| _dS )a  SAM predicts object masks from an image and input prompts.

        Args:
            image_encoder: The backbone used to encode the image into image embeddings that allow for efficient mask
                           prediction.
            prompt_encoder: Encodes various types of input prompts.
            mask_decoder: Predicts masks from the image embeddings and encoded prompts.

        N)super__init__r-   r/   r0   )selfr-   r/   r0   	__class__r   r   r4   Z   s   

zSam.__init__namestrc                 C  s$   | dv rt t| S td|  )a'  Build/load the SAM model based on it's name.

        Args:
            name: The name of the SAM model. Valid names are:
                - 'vit_b'
                - 'vit_l'
                - 'vit_h'
                - 'mobile_sam'

        Returns:
            The respective SAM model

        )r   r   r   r   zInvalid SAM model name: )r*   from_configr    
ValueError)r8   r   r   r   	from_namek   s   zSam.from_nameconfigr    c           	      C  s  | j }t|trt|}nt|tr!tjtjtjtjd}|| }|tjkr/t	ddddd}ny|tjkr=t	dddd	d}nk|tjkrKt	d
dddd}n]|tjkrd}d}d}|| }t
tjd|ddt|||f||fddtdtd|ddd|dddd}n)t| jtrt| jtrt| jtrt| jtrt	| j| j| j| jd}ntd| j}| jr|du rtjdtjdtjdtjdi| }ntjdd d! |r|| |S )"a  Build/load the SAM model based on it's config.

        Args:
            config: The SamConfig data structure. If the model_type is available, build from it, otherwise will use
                    the parameters set.

        Returns:
            The respective SAM model

        Example:
            >>> from kornia.contrib.models.sam import SamConfig
            >>> sam_model = Sam.from_config(SamConfig('vit_b'))

        )r   r   r   r   i      )r            )r%   r&   r'   r(            )r?   rA         i       )      rF         5mT)img_sizer   	embed_dimimage_embedding_sizeinput_image_sizemask_in_chansr   r      r@   depthembedding_dimmlp_dim	num_headsnum_multimask_outputstransformertransformer_dimiou_head_depthiou_head_hidden_dimr-   r/   r0   zKUnexpected config. The model_type should be provide or the encoder configs.NzDhttps://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pthzDhttps://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pthzDhttps://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pthzmhttps://github.com/ChaoningZhang/MobileSAM/raw/a509aac54fdd7af59f843135f2f7cee307283c88/weights/mobile_sam.ptz2checkpoint is not None. pretrained=True is ignoredr   )
stacklevel)r!   
isinstanceintr   r9   r   r   r   r   
_build_samr*   r   r:   r   r   r   r%   r&   r'   r(   rX   NotImplementedErrorr"   r$   warningswarnload_checkpoint)	r=   r!   _map_sam_typemodelprompt_embed_dim
image_sizevit_patch_sizerP   r"   r   r   r   r:      s   











zSam.from_configimagesr   batched_promptslist[dict[str, Any]]multimask_outputr#   list[SegmentationResults]c                 C  s   t |g d t|jd t|kd | |}g }t||D ]5\}}| j|dd|dd|ddd\}}	| j|d	 | j	 ||	|d
\}
}|
t|
|| j q|S )a
  Predicts masks end-to-end from provided images and prompts.

        This method expects that the images have already been pre-processed, at least been normalized, resized and
        padded to be compatible with the `self.image_encoder`.

        .. note:: For each image :math:`(3, H, W)`, it is possible to input a batch (:math:`K`) of :math:`N` prompts,
                 the results are batched by the number of prompts batch. So given a prompt with :math:`K=5`, and
                 :math:`N=10`, the results will look like :math:`5xCxHxW` where :math:`C` is determined by
                 multimask_output. And within each of these masks :math:`(5xC)`, it should be possible to find
                 :math:`N` instances if the model succeed.

        Args:
            images: The image as a torch tensor in :math:`(B, 3, H, W)` format, already transformed for input to the
                    model.
            batched_prompts: A list over the batch of images (list length should be :math:`B`), each a dictionary with
                             the following keys. If it does not have the respective prompt, it should not be included
                             in this dictionary. The options are:

                - "points": tuple of (Tensor, Tensor) within the coordinate keypoints and their respective labels.
                            the tuple should look like (keypoints, labels), where:

                            - The keypoints (a tensor) are a batched point prompts for this image, with shape
                              :math:`(K, N, 2)`. Already transformed to the input frame of the model.
                            - The labels (a tensor) are a batched labels for point prompts, with shape :math:`(K, N)`.
                              Where 1 indicates a foreground point and 0 indicates a background point.

                - "boxes": (Tensor) Batched box inputs, with shape :math:`(K, 4)`. Already transformed to the input
                           frame of the model.
                - "mask_inputs": (Tensor) Batched mask inputs to the model, in the form :math:`(K, 1, H, W)`.

            multimask_output: Whether the model should predict multiple disambiguating masks, or return a single mask.

        Returns:
            A list over input images, where each element is as SegmentationResults the following.
                - logits: Low resolution logits with shape :math:`(K, C, H, W)`. Can be passed as mask input to
                          subsequent iterations of prediction. Where :math:`K` is the number of input prompts,
                          :math:`C` is determined by multimask_output, and :math:`H=W=256` are the model output size.
                - scores: The model's predictions of mask quality (iou prediction), in shape BxC.

        )B3HWr   zCThe number of images (`B`) should match with the length of prompts!pointsNboxesmask_inputs)rv   rw   masks)N.)image_embeddingsimage_pesparse_prompt_embeddingsdense_prompt_embeddingsrp   )r   r   shapelenr-   zipr/   getr0   get_dense_peappendr   r,   )r5   rm   rn   rp   rz   outputsprompt_recordcurr_embeddingsparse_embeddingsdense_embeddingslow_res_logitsiou_predictionsr   r   r   forward   s,   ,





	zSam.forward)r-   r.   r/   r   r0   r   r1   r2   )r8   r9   r1   r*   )r=   r    r1   r*   )rm   r   rn   ro   rp   r#   r1   rq   )r   r   r   r,   r)   r4   staticmethodr<   r:   torchno_gradr   __classcell__r   r   r6   r   r*   W   s   
 lr*   r%   rb   r&   r'   r(   tuple[int, ...]r1   c                 C  sl   d}d}d}|| }t t|| |dt||dd|d|dt|||f||fddtd	td
|ddd|d	dddS )NrK   rB   rD      T   )rU   rO   rM   	mlp_ratio
norm_layerrX   
patch_sizeqkv_biasuse_rel_posglobal_attn_indexeswindow_size	out_chansrN   r   r   rS   r@   rT   rY   r_   )r*   r
   r	   r   r   r   )r%   r&   r'   r(   rj   rk   rl   rP   r   r   r   rc   8  sB   rc   )
r%   rb   r&   rb   r'   rb   r(   r   r1   r*   )%r   
__future__r   re   dataclassesr   enumr   typingr   r   r   kornia.contrib.modelsr   kornia.contrib.models.baser   -kornia.contrib.models.sam.architecture.commonr	   4kornia.contrib.models.sam.architecture.image_encoderr
   3kornia.contrib.models.sam.architecture.mask_decoderr   5kornia.contrib.models.sam.architecture.prompt_encoderr   2kornia.contrib.models.sam.architecture.transformerr   kornia.contrib.models.tiny_vitr   kornia.corer   kornia.core.checkr   r   r   r    r*   rc   r   r   r   r   <module>   s.   		 b