o
    .wiF>                     @   s  U d dl mZmZmZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZmZ er9d dlmZ d dlmZ e	rVerVd dlmZ d dlmZ dVd
dZe
esUdgZndgZes^dgZi dddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,Zeeeeef f ed-< d.ed/ d	ed0 fd1d2Z	3dWd4eeeeeef f d5f d	eee ee f fd6d7Zd.ed8d9d:d;d<ee d=eeejf d	efd>d?Zd.ed@ed8d9d:d;dAed=eeejf d	efdBdCZ 	DdXdEedFedGee dHe!d	eeeeef f f
dIdJZ"	K	L	3dYd@ed.ed/ dAed4eeeeeef f d5f d	eeeeef f f
dMdZ#ernd dNl$m%Z% d dOl m&Z&m'Z' e'e&dZ(e%e#e(dPZ)e)dQdR e)dSdR e)dTdR e)dUdR dS dS )Z    )TYPE_CHECKINGLiteralUnionN)Tensor)_get_clip_model_and_processor)_SKIP_SLOW_DOCTEST_try_proceed_with_timeout)_PIQ_GREATER_EQUAL_0_8 _TRANSFORMERS_GREATER_EQUAL_4_10)	CLIPModelCLIPProcessorreturnc                   C   s    t jddd tjddd d S )Nopenai/clip-vit-base-patch16T)resume_download)
_CLIPModelfrom_pretrained_CLIPProcessor r   r   h/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/torchmetrics/functional/multimodal/clip_iqa.py_download_clip_for_iqa_metric   s   r   clip_image_quality_assessmentquality)zGood photo.z
Bad photo.
brightness)zBright photo.zDark photo.	noisiness)zClean photo.zNoisy photo.colorfullness)zColorful photo.zDull photo.	sharpness)zSharp photo.zBlurry photo.contrast)zHigh contrast photo.zLow contrast photo.
complexity)zComplex photo.zSimple photo.natural)zNatural photo.zSynthetic photo.happy)zHappy photo.z
Sad photo.scary)zScary photo.zPeaceful photo.new)z
New photo.z
Old photo.warm)zWarm photo.zCold photo.real)zReal photo.zAbstract photo.	beautiful)zBeautiful photo.zUgly photo.lonely)zLonely photo.zSociable photo.relaxing)zRelaxing photo.zStressful photo._PROMPTSmodel_name_or_path)clip_iqar   zopenai/clip-vit-base-patch32z!openai/clip-vit-large-patch14-336zopenai/clip-vit-large-patch14)r   r   c                 C   sR   ddl m} | dkr%tstdddl}|jj  }|	d}||fS t
| S )zAExtract the CLIP model and processor from the model name or path.r   r   r*   zFor metric `clip_iqa` to work with argument `model_name_or_path` set to default value `'clip_iqa'`, package `piq` version v0.8.0 or later must be installed. Either install with `pip install piq` or`pip install torchmetrics[multimodal]`Nr   )transformersr   r	   
ValueErrorpiqr*   cliploadevalr   r   )r)   r   r-   model	processorr   r   r   !_get_clip_iqa_model_and_processor?   s   

r3   r   prompts.c                 C   s   t | ts	tdg }g }d}| D ]T}t |ttfstdt |tr@|tvr4tdt  d| d|| |t|  t |trOt|dkrOtdt |tre|d|  || |d	7 }q||fS )
a}  Converts the provided keywords into a list of prompts for the model to calculate the anchor vectors.

    Args:
        prompts: A string, tuple of strings or nested tuple of strings. If a single string is provided, it must be one
            of the available prompts (see above). Else the input is expected to be a tuple, where each element can
            be one of two things: either a string or a tuple of strings. If a string is provided, it must be one of the
            available prompts (see above). If tuple is provided, it must be of length 2 and the first string must be a
            positive prompt and the second string must be a negative prompt.

    Returns:
        Tuple containing a list of prompts and a list of the names of the prompts. The first list is double the length
        of the second list.

    Examples::

        >>> # single prompt
        >>> _clip_iqa_format_prompts(("quality",))
        (['Good photo.', 'Bad photo.'], ['quality'])
        >>> # multiple prompts
        >>> _clip_iqa_format_prompts(("quality", "brightness"))
        (['Good photo.', 'Bad photo.', 'Bright photo.', 'Dark photo.'], ['quality', 'brightness'])
        >>> # Custom prompts
        >>> _clip_iqa_format_prompts(("quality", ("Super good photo.", "Super bad photo.")))
        (['Good photo.', 'Bad photo.', 'Super good photo.', 'Super bad photo.'], ['quality', 'user_defined_0'])

    zJArgument `prompts` must be a tuple containing strings or tuples of stringsr   z)All elements of `prompts` must be one of z" if not custom tuple prompts, got .   zDIf a tuple is provided in argument `prompts`, it must be of length 2user_defined_   )	
isinstancetupler,   strr(   keysappendextendlen)r5   prompts_namesprompts_listcountpr   r   r   _clip_iqa_format_prompts\   s.   




rE   r1   r   r2   r   rB   devicec           
      C   s   | dkr;||d}t jt||jjt j|d}t|d D ]\}}t j|t j|d||dt|f< q||	 }	n||ddd}|
|d ||d	 |}	|	|	jd
ddd S )a=  Calculates the anchor vectors for the CLIP IQA metric.

    Args:
        model_name_or_path: string indicating the version of the CLIP model to use.
        model: The CLIP model
        processor: The CLIP processor
        prompts_list: A list of prompts
        device: The device to use for the calculation

    r*   )text)dtyperF   	input_idsNptT)rG   return_tensorspaddingattention_maskr7   rD   dimkeepdim)torchzerosr@   	tokenizermodel_max_lengthlong	enumeratetensorencode_textfloatget_text_featurestonorm)
r)   r1   r2   rB   rF   text_processedanchors_textitpanchorsr   r   r   _clip_iqa_get_anchor_vectors   s   
$rc   images
data_rangec           
      C   s   |t | }	 | dkr;tjg d|ddddd}tjg d|ddddd}|| | }|j|  dd  }n|d	d
 |D ddd}	||	d |}||jdddd S )Nr*   )g3<4'?gwgM?gy{ ?)rF   r9      )gB91?gwt.?g	U?F)pos_embeddingc                 S   s   g | ]}|  qS r   )cpu).0r`   r   r   r   
<listcomp>   s    z$_clip_iqa_update.<locals>.<listcomp>rJ   T)rd   rK   rL   pixel_valuesr7   rN   rO   )rZ   rR   rX   viewencode_imageget_image_featuresr\   r]   )
r)   rd   r1   r2   re   rF   default_meandefault_stdimg_featuresprocessed_inputr   r   r   _clip_iqa_update   s   rs   Trq   rb   rA   format_as_dictc                    sn   d|  |   }||jd ddddddddf  t|dkr(  S |r5 fddt|D S  S )	zFinal computation of CLIP IQA.d   r   rN   r7   Nr9   c                    s"   i | ]\}}| d d |f qS )Nr   )ri   r`   rD   probsr   r   
<dictcomp>   s   " z%_clip_iqa_compute.<locals>.<dictcomp>)treshapeshapesoftmaxr@   squeezerW   )rq   rb   rA   rt   logits_per_imager   rv   r   _clip_iqa_compute   s   ,r   r*         ?c                 C   s   t |\}}t|\}}| j}||}t  t|||||}	t|| ||||}
t|
|	|W  d   S 1 s:w   Y  dS )a  Calculates `CLIP-IQA`_, that can be used to measure the visual content of images.

    The metric is based on the `CLIP`_ model, which is a neural network trained on a variety of (image, text) pairs to
    be able to generate a vector representation of the image and the text that is similar if the image and text are
    semantically similar.

    The metric works by calculating the cosine similarity between user provided images and pre-defined prompts. The
    prompts always come in pairs of "positive" and "negative" such as "Good photo." and "Bad photo.". By calculating
    the similartity between image embeddings and both the "positive" and "negative" prompt, the metric can determine
    which prompt the image is more similar to. The metric then returns the probability that the image is more similar
    to the first prompt than the second prompt.

    Build in prompts are:
        * quality: "Good photo." vs "Bad photo."
        * brightness: "Bright photo." vs "Dark photo."
        * noisiness: "Clean photo." vs "Noisy photo."
        * colorfullness: "Colorful photo." vs "Dull photo."
        * sharpness: "Sharp photo." vs "Blurry photo."
        * contrast: "High contrast photo." vs "Low contrast photo."
        * complexity: "Complex photo." vs "Simple photo."
        * natural: "Natural photo." vs "Synthetic photo."
        * happy: "Happy photo." vs "Sad photo."
        * scary: "Scary photo." vs "Peaceful photo."
        * new: "New photo." vs "Old photo."
        * warm: "Warm photo." vs "Cold photo."
        * real: "Real photo." vs "Abstract photo."
        * beautiful: "Beautiful photo." vs "Ugly photo."
        * lonely: "Lonely photo." vs "Sociable photo."
        * relaxing: "Relaxing photo." vs "Stressful photo."

    Args:
        images: Either a single ``[N, C, H, W]`` tensor or a list of ``[C, H, W]`` tensors
        model_name_or_path: string indicating the version of the CLIP model to use. By default this argument is set to
            ``clip_iqa`` which corresponds to the model used in the original paper. Other available models are
            `"openai/clip-vit-base-patch16"`, `"openai/clip-vit-base-patch32"`, `"openai/clip-vit-large-patch14-336"`
            and `"openai/clip-vit-large-patch14"`
        data_range: The maximum value of the input tensor. For example, if the input images are in range [0, 255],
            data_range should be 255. The images are normalized by this value.
        prompts: A string, tuple of strings or nested tuple of strings. If a single string is provided, it must be one
            of the available prompts (see above). Else the input is expected to be a tuple, where each element can
            be one of two things: either a string or a tuple of strings. If a string is provided, it must be one of the
            available prompts (see above). If tuple is provided, it must be of length 2 and the first string must be a
            positive prompt and the second string must be a negative prompt.

    .. hint::
        If using the default `clip_iqa` model, the package `piq` must be installed. Either install with
        `pip install piq` or `pip install torchmetrics[multimodal]`.

    Returns:
        A tensor of shape ``(N,)`` if a single prompts is provided. If a list of prompts is provided, a dictionary of
        with the prompts as keys and tensors of shape ``(N,)`` as values.

    Raises:
        ModuleNotFoundError:
            If transformers package is not installed or version is lower than 4.10.0
        ValueError:
            If not all images have format [C, H, W]
        ValueError:
            If prompts is a tuple and it is not of length 2
        ValueError:
            If prompts is a string and it is not one of the available prompts
        ValueError:
            If prompts is a list of strings and not all strings are one of the available prompts

    Example::
        Single prompt:

        >>> from torch import randint
        >>> from torchmetrics.functional.multimodal import clip_image_quality_assessment
        >>> imgs = randint(255, (2, 3, 224, 224)).float()
        >>> clip_image_quality_assessment(imgs, prompts=("quality",))
        tensor([0.8894, 0.8902])

    Example::
        Multiple prompts:

        >>> from torch import randint
        >>> from torchmetrics.functional.multimodal import clip_image_quality_assessment
        >>> imgs = randint(255, (2, 3, 224, 224)).float()
        >>> clip_image_quality_assessment(imgs, prompts=("quality", "brightness"))
        {'quality': tensor([0.8693, 0.8705]), 'brightness': tensor([0.5722, 0.4762])}

    Example::
        Custom prompts. Must always be a tuple of length 2, with a positive and negative prompt.

        >>> from torch import rand
        >>> from torchmetrics.functional.multimodal import clip_image_quality_assessment
        >>> imgs = randint(255, (2, 3, 224, 224)).float()
        >>> clip_image_quality_assessment(imgs, prompts=(("Super good photo.", "Super bad photo."), "brightness"))
        {'user_defined_0': tensor([0.9578, 0.9654]), 'brightness': tensor([0.5495, 0.5764])}

    N)	rE   r3   rF   r\   rR   inference_moderc   rs   r   )rd   r)   re   r5   rB   rA   r1   r2   rF   rb   rq   r   r   r   r      s   h


$)partial)Anycast)rd   )r   )r5   )r   r   r   )r   r   r   r   ))zPhoto of a catzPhoto of a dogr   )zColorful photozBlack and white photo)r   N)r4   )T)r*   r   r4   )*typingr   r   r   rR   r   -torchmetrics.functional.multimodal.clip_scorer   torchmetrics.utilities.checksr   r   torchmetrics.utilities.importsr	   r
   r+   r   r   r   r   r   __doctest_skip__r(   dictr<   r;   __annotations__r3   listrE   rF   rc   rZ   rs   boolr   r   	functoolsr   r   r   rd   fr   r   r   r   <module>   s  
	


7
"

	

t

