o
    .wi.                     @   sJ  d dl mZ d dlmZmZmZmZmZmZ d dl	Z	d dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZmZ d dlmZ d d	lmZmZmZ d d
lmZmZ esZddgZes_dgZer}er}d dl m!Z" d dl m#Z$ dddZ%ee%s|ddgZnddgZG dd deZ&ere&Z'e'dd e'dd e'dd e'dd dS dS )    )Sequence)TYPE_CHECKINGAnyListLiteralOptionalUnionN)Tensor)_clip_iqa_compute_clip_iqa_format_prompts_clip_iqa_get_anchor_vectors_clip_iqa_update!_get_clip_iqa_model_and_processor)Metric)_SKIP_SLOW_DOCTEST_try_proceed_with_timeout)dim_zero_cat)_MATPLOTLIB_AVAILABLE_PIQ_GREATER_EQUAL_0_8 _TRANSFORMERS_GREATER_EQUAL_4_10)_AX_TYPE_PLOT_OUT_TYPECLIPImageQualityAssessmentCLIPImageQualityAssessment.plot)	CLIPModel)CLIPProcessorreturnc                   C   s    t jddd tjddd d S )Nopenai/clip-vit-large-patch14T)resume_download)
_CLIPModelfrom_pretrained_CLIPProcessor r"   r"   ]/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/torchmetrics/multimodal/clip_iqa.py_download_clip_iqa_metric/   s   r$   c                       s  e Zd ZU dZdZeed< dZeed< dZeed< dZ	dZ
eed	< ee ed
< dZeed< 			d#ded dedeeeeeef f df deddf
 fddZdeddfddZdeeeeef f fddZd$deeee df d ee defd!d"Z  ZS )%r   a>  Calculates `CLIP-IQA`_, that can be used to measure the visual content of images.

    The metric is based on the `CLIP`_ model, which is a neural network trained on a variety of (image, text) pairs to
    be able to generate a vector representation of the image and the text that is similar if the image and text are
    semantically similar.

    The metric works by calculating the cosine similarity between user provided images and pre-defined prompts. The
    prompts always comes in pairs of "positive" and "negative" such as "Good photo." and "Bad photo.". By calculating
    the similartity between image embeddings and both the "positive" and "negative" prompt, the metric can determine
    which prompt the image is more similar to. The metric then returns the probability that the image is more similar
    to the first prompt than the second prompt.

    Build in prompts are:
        * quality: "Good photo." vs "Bad photo."
        * brightness: "Bright photo." vs "Dark photo."
        * noisiness: "Clean photo." vs "Noisy photo."
        * colorfullness: "Colorful photo." vs "Dull photo."
        * sharpness: "Sharp photo." vs "Blurry photo."
        * contrast: "High contrast photo." vs "Low contrast photo."
        * complexity: "Complex photo." vs "Simple photo."
        * natural: "Natural photo." vs "Synthetic photo."
        * happy: "Happy photo." vs "Sad photo."
        * scary: "Scary photo." vs "Peaceful photo."
        * new: "New photo." vs "Old photo."
        * warm: "Warm photo." vs "Cold photo."
        * real: "Real photo." vs "Abstract photo."
        * beautiful: "Beautiful photo." vs "Ugly photo."
        * lonely: "Lonely photo." vs "Sociable photo."
        * relaxing: "Relaxing photo." vs "Stressful photo."

    As input to ``forward`` and ``update`` the metric accepts the following input

    - ``images`` (:class:`~torch.Tensor`): tensor with images feed to the feature extractor with shape ``(N,C,H,W)``

    As output of `forward` and `compute` the metric returns the following output

    - ``clip_iqa`` (:class:`~torch.Tensor` or dict of tensors): tensor with the CLIP-IQA score. If a single prompt is
      provided, a single tensor with shape ``(N,)`` is returned. If a list of prompts is provided, a dict of tensors
      is returned with the prompt as key and the tensor with shape ``(N,)`` as value.

    Args:
        model_name_or_path: string indicating the version of the CLIP model to use. Available models are:

            - `"clip_iqa"`, model corresponding to the CLIP-IQA paper.
            - `"openai/clip-vit-base-patch16"`
            - `"openai/clip-vit-base-patch32"`
            - `"openai/clip-vit-large-patch14-336"`
            - `"openai/clip-vit-large-patch14"`

        data_range: The maximum value of the input tensor. For example, if the input images are in range [0, 255],
            data_range should be 255. The images are normalized by this value.
        prompts: A string, tuple of strings or nested tuple of strings. If a single string is provided, it must be one
            of the available prompts (see above). Else the input is expected to be a tuple, where each element can
            be one of two things: either a string or a tuple of strings. If a string is provided, it must be one of the
            available prompts (see above). If tuple is provided, it must be of length 2 and the first string must be a
            positive prompt and the second string must be a negative prompt.
        kwargs: Additional keyword arguments, see :ref:`Metric kwargs` for more info.

    .. hint::
        If using the default `clip_iqa` model, the package `piq` must be installed. Either install with
        `pip install piq` or `pip install torchmetrics[image]`.

    Raises:
        ModuleNotFoundError:
            If transformers package is not installed or version is lower than 4.10.0
        ValueError:
            If `prompts` is a tuple and it is not of length 2
        ValueError:
            If `prompts` is a string and it is not one of the available prompts
        ValueError:
            If `prompts` is a list of strings and not all strings are one of the available prompts

    Example::
        Single prompt:

        >>> from torch import randint
        >>> from torchmetrics.multimodal import CLIPImageQualityAssessment
        >>> imgs = randint(255, (2, 3, 224, 224)).float()
        >>> metric = CLIPImageQualityAssessment()
        >>> metric(imgs)
        tensor([0.8894, 0.8902])

    Example::
        Multiple prompts:

        >>> from torch import randint
        >>> from torchmetrics.multimodal import CLIPImageQualityAssessment
        >>> imgs = randint(255, (2, 3, 224, 224)).float()
        >>> metric = CLIPImageQualityAssessment(prompts=("quality", "brightness"))
        >>> metric(imgs)
        {'quality': tensor([0.8693, 0.8705]), 'brightness': tensor([0.5722, 0.4762])}

    Example::
        Custom prompts. Must always be a tuple of length 2, with a positive and negative prompt.

        >>> from torch import randint
        >>> from torchmetrics.multimodal import CLIPImageQualityAssessment
        >>> imgs = randint(255, (2, 3, 224, 224)).float()
        >>> metric = CLIPImageQualityAssessment(prompts=(("Super good photo.", "Super bad photo."), "brightness"))
        >>> metric(imgs)
        {'user_defined_0': tensor([0.9578, 0.9654]), 'brightness': tensor([0.5495, 0.5764])}

    Fis_differentiableThigher_is_betterfull_state_updateg        g      Y@anchors
probs_listmodelfeature_networkclip_iqa      ?qualitymodel_name_or_path)r,   zopenai/clip-vit-base-patch16zopenai/clip-vit-base-patch32z!openai/clip-vit-large-patch14-336r   
data_rangeprompts.kwargsr   Nc                    s   t  jdi | t|ttfr|dkstd|| _t|\}}|| _|| _	t
|\| _| _|| _t  t|| j| j| j| j}W d    n1 sMw   Y  | d| | jdg dd d S )Nr   z2Argument `data_range` should be a positive number.r(   r)   cat)dist_reduce_fxr"   )super__init__
isinstanceintfloat
ValueErrorr1   r   prompts_listprompts_namer   r*   	processorr0   torchinference_moder   deviceregister_buffer	add_state)selfr0   r1   r2   r3   r<   r=   r(   	__class__r"   r#   r7      s    
z#CLIPImageQualityAssessment.__init__imagesc                 C   s|   t  0 t| j|| j| j| j| j}t|| j	| j
dd}t|ts&td| j| W d   dS 1 s7w   Y  dS )z"Update metric state with new data.F)format_as_dictzOutput probs should be a tensorN)r?   r@   r   r0   r*   r>   r1   rA   r
   r(   r=   r8   r	   r;   r)   append)rD   rG   img_featuresprobsr"   r"   r#   update   s   

"z!CLIPImageQualityAssessment.updatec                    s8   t | j t| jdkr  S  fddt| jD S )zCompute metric.   c                    s"   i | ]\}}| d d |f qS )Nr"   ).0iprK   r"   r#   
<dictcomp>   s   " z6CLIPImageQualityAssessment.compute.<locals>.<dictcomp>)r   r)   lenr=   squeeze	enumerate)rD   r"   rQ   r#   compute   s   
z"CLIPImageQualityAssessment.computevalaxc                 C   s   |  ||S )a>  Plot a single or multiple values from the metric.

        Args:
            val: Either a single result from calling `metric.forward` or `metric.compute` or a list of these results.
                If no value is provided, will automatically call `metric.compute` and plot that result.
            ax: An matplotlib axis object. If provided will add plot to that axis

        Returns:
            Figure and Axes object

        Raises:
            ModuleNotFoundError:
                If `matplotlib` is not installed

        .. plot::
            :scale: 75

            >>> # Example plotting a single value
            >>> import torch
            >>> from torchmetrics.multimodal.clip_iqa import CLIPImageQualityAssessment
            >>> metric = CLIPImageQualityAssessment()
            >>> metric.update(torch.rand(1, 3, 224, 224))
            >>> fig_, ax_ = metric.plot()

        .. plot::
            :scale: 75

            >>> # Example plotting multiple values
            >>> import torch
            >>> from torchmetrics.multimodal.clip_iqa import CLIPImageQualityAssessment
            >>> metric = CLIPImageQualityAssessment()
            >>> values = [ ]
            >>> for _ in range(10):
            ...     values.append(metric(torch.rand(1, 3, 224, 224)))
            >>> fig_, ax_ = metric.plot(values)

        )_plot)rD   rW   rX   r"   r"   r#   plot   s   &r   )r,   r-   r.   )NN)__name__
__module____qualname____doc__r%   bool__annotations__r&   r'   plot_lower_boundplot_upper_boundr	   r   r+   strr   r:   tupler   r   r7   rL   dictrV   r   r   r   r   rZ   __classcell__r"   r"   rE   r#   r   9   s:   
 h
	
!2)colorfullness)r2   )r/   
brightness	noisiness)r/   rh   ri   rg   ))zPhoto of a catzPhoto of a dogr/   )zColorful photozBlack and white photo)r   N)(collections.abcr   typingr   r   r   r   r   r   r?   r	   +torchmetrics.functional.multimodal.clip_iqar
   r   r   r   r   torchmetrics.metricr   torchmetrics.utilities.checksr   r   torchmetrics.utilities.datar   torchmetrics.utilities.importsr   r   r   torchmetrics.utilities.plotr   r   __doctest_skip__transformersr   r   r   r!   r$   r   fr"   r"   r"   r#   <module>   sF    
 P
