o
    5tiaY                     @   s(  d dl mZmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlmZmZ d dlmZ z
d dlmZ dZW n eyA   dZY nw zd d	lZd d
lmZmZ dZW n ey]   dZY nw d dlmZ d dlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ e"e%Z&dZ'dZ(G dd deZ)d	S )    )ABCabstractmethod)Number)AnyCallableDictListOptionalUnion)Datasetload_dataset)choose_split)	bootstrapTFN)Pipelinepipeline)perf_counter)Literal   )load)EvaluationModule)
get_logger   )DatasetColumna  
    Compute the metric for a given pipeline and dataset combination.
    Args:
        model_or_pipeline (`str` or `Pipeline` or `Callable` or `PreTrainedModel` or `TFPreTrainedModel`, defaults to `None`):
            If the argument in not specified, we initialize the default pipeline for the task (in this case
            `text-classification` or its alias - `sentiment-analysis`). If the argument is of the type `str` or
            is a model instance, we use it to initialize a new `Pipeline` with the given model. Otherwise we assume the
            argument specifies a pre-initialized pipeline.
        data (`str` or `Dataset`, defaults to `None`):
            Specifies the dataset we will run evaluation on. If it is of type `str`, we treat it as the dataset
            name, and load it. Otherwise we assume it represents a pre-loaded dataset.
        subset (`str`, defaults to `None`):
            Defines which dataset subset to load. If `None` is passed the default subset is loaded.
        split (`str`, defaults to `None`):
            Defines which dataset split to load. If `None` is passed, infers based on the `choose_split` function.
        metric (`str` or `EvaluationModule`, defaults to `None`):
            Specifies the metric we use in evaluator. If it is of type `str`, we treat it as the metric name, and
            load it. Otherwise we assume it represents a pre-loaded metric.
        tokenizer (`str` or `PreTrainedTokenizer`, *optional*, defaults to `None`):
            Argument can be used to overwrite a default tokenizer if `model_or_pipeline` represents a model for
            which we build a pipeline. If `model_or_pipeline` is `None` or a pre-initialized pipeline, we ignore
            this argument.
        strategy (`Literal["simple", "bootstrap"]`, defaults to "simple"):
            specifies the evaluation strategy. Possible values are:
            - `"simple"` - we evaluate the metric and return the scores.
            - `"bootstrap"` - on top of computing the metric scores, we calculate the confidence interval for each
            of the returned metric keys, using `scipy`'s `bootstrap` method
            https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html.
        confidence_level (`float`, defaults to `0.95`):
            The `confidence_level` value passed to `bootstrap` if `"bootstrap"` strategy is chosen.
        n_resamples (`int`, defaults to `9999`):
            The `n_resamples` value passed to `bootstrap` if `"bootstrap"` strategy is chosen.
        device (`int`, defaults to `None`):
            Device ordinal for CPU/GPU support of the pipeline. Setting this to -1 will leverage CPU, a positive
            integer will run the model on the associated CUDA device ID. If `None` is provided it will be inferred and
            CUDA:0 used if available, CPU otherwise.
        random_state (`int`, *optional*, defaults to `None`):
            The `random_state` value passed to `bootstrap` if `"bootstrap"` strategy is chosen. Useful for
            debugging.
a`  
    Return:
        A `Dict`. The keys represent metric keys calculated for the `metric` spefied in function arguments. For the
        `"simple"` strategy, the value is the metric score. For the `"bootstrap"` strategy, the value is a `Dict`
        containing the score, the confidence interval and the standard error calculated for each metric key.
c                #   @   sj  e Zd ZdZi Zi ZdEdedefddZe			dFd	e	e d
e
dedee deeef f
ddZede
de
dedeeef fddZedefddZedd Z															dGdeededdf d eeef d!ee d"ee d#eeef d$eeed%f  d&eeed'f  d(ed) d
e
ded*edee d+ed,ed-eeeef  deee
f f d.d/Zed0d1 Zd eeef d2eeef fd3d4ZedHd5d6ZdHd eeef d!ed"efd7d8Zd ed+ed,efd9d:Z			dIdeededdf d$ed; d&ed; d*efd<d=Z d#eeef fd>d?Z!d@dA Z"				dJd#edBed(ed) d
e
dedee fdCdDZ#dS )K	Evaluatorz
    The [`Evaluator`] class is the class from which all evaluators inherit. Refer to this class for methods shared across
    different evaluators.
    Base class implementing evaluator operations.
    Ntaskdefault_metric_namec                 C   s(   t stdtstd|| _|| _d S )NzbIf you want to use the `Evaluator` you need `transformers`. Run `pip install evaluate[evaluator]`.zbIf you want to use the `Evaluator` you need `scipy>=1.7.1`. Run `pip install evaluate[evaluator]`.)TRANSFORMERS_AVAILABLEImportErrorSCIPY_AVAILABLEr   r   )selfr   r    r    K/home/ubuntu/.local/lib/python3.10/site-packages/evaluate/evaluator/base.py__init__p   s   
zEvaluator.__init__ffffff?'  metric_keysconfidence_leveln_resamplesrandom_statereturnc           
   
   C   sb   dd }i }|D ]&}t t| || |fi |dd|||d}	|	jj|	jjf|	jd||< q|S )z
        A utility function enabling the confidence interval calculation for metrics computed
        by the evaluator based on `scipy`'s `bootstrap` method.
        c                    s    fdd}|S )Nc                     s(   j di dd t | D   S )Nc                 S   s   i | ]\}}||qS r    r    ).0kvr    r    r!   
<dictcomp>   s    zjEvaluator._compute_confidence_interval.<locals>.build_args_metric.<locals>.args_metric.<locals>.<dictcomp>r    )computezipkeys)argskeykwargsmetricr    r!   args_metric   s   (zVEvaluator._compute_confidence_interval.<locals>.build_args_metric.<locals>.args_metricr    )r5   r3   r4   r6   r    r2   r!   build_args_metric   s   zAEvaluator._compute_confidence_interval.<locals>.build_args_metricTF)data	statisticpaired
vectorizedr&   r'   r(   )confidence_intervalstandard_error)r   listvaluesr<   lowhighr=   )
r5   metric_inputsr%   r&   r'   r(   r7   bootstrap_dictr3   bsr    r    r!   _compute_confidence_interval|   s    

z&Evaluator._compute_confidence_interval
start_timeend_timenum_samplesc                 C   s$   ||  }|| }d| }|||dS )a}  
        A utility function computing time performance metrics:
            - `total_time_in_seconds` - pipeline inference runtime for the evaluation data in seconds,
            - `samples_per_second` - pipeline throughput in the number of samples per second.
            - `latency_in_seconds` - pipeline inference runtime for the evaluation data in seconds per sample,

        g      ?)total_time_in_secondssamples_per_secondlatency_in_secondsr    )rF   rG   rH   latency
throughputlatency_sampler    r    r!   _compute_time_perf   s   	zEvaluator._compute_time_perfc                  C   s   zddl } | j rd}nd}W n* ty:   zddl}t|jddkr)d}nd}W n ty7   d}Y nw Y nw |dkrFt	d |S t	d |S )zBHelper function to check if GPU or CPU is available for inference.r   NGPUzFNo GPU found. The default device for pipeline inference is set to CPU.zLGPU found. The default device for pipeline inference is set to GPU (CUDA:0).)
torchcudais_availabler   
tensorflowlenconfiglist_physical_devicesloggerinfo)rR   devicetfr    r    r!   _infer_device   s.   


zEvaluator._infer_devicec                 O   s   t  )z
        A core method of the `Evaluator` class, which processes the pipeline outputs for compatibility with the metric.
        )NotImplementedError)r   r1   r4   r    r    r!   predictions_processor   s   zEvaluator.predictions_processorsimpletextlabelmodel_or_pipeliner   PreTrainedModelTFPreTrainedModelr8   subsetsplitr5   	tokenizerPreTrainedTokenizerfeature_extractorFeatureExtractionMixinstrategy)r`   r   r[   input_columnlabel_columnlabel_mappingc                 C   s   i }|  || | j|||d}| j|||d\}}| j||||d}| |}| ||\}}| ||}|| | j||||	|
|d}t	|t
u rQ|j|i}|| || |S )Nr8   rf   rg   )r8   rm   rn   )rc   rh   rj   r[   )r5   rB   rl   r&   r'   r(   )"check_for_mismatch_in_device_setup	load_dataprepare_dataprepare_pipelineprepare_metriccall_pipeliner_   updatecompute_metrictypefloatname)r   rc   r8   rf   rg   r5   rh   rj   rl   r&   r'   r[   r(   rm   rn   ro   resultrB   pipe_inputspipepredictionsperf_resultsmetric_resultsr    r    r!   r.      s6   




zEvaluator.computec                 C   sd   | d ur*| dkr,t |tr.|jjdkrtd| |jjkr0td|jj d|  dd S d S d S d S )NrP   cpua  The value of the `device` kwarg passed to `compute` suggests that this pipe should be run on an accelerator, but the pipe was instantiated on CPU. Pass `device` to the pipeline during initialization to use an accelerator, or pass `device=None` to `compute`. z)This pipeline was instantiated on device z but device=z was passed to `compute`.)
isinstancer   r[   ry   
ValueErrorindex)r[   rc   r    r    r!   rq     s   z,Evaluator.check_for_mismatch_in_device_setupcolumns_namesc              	   C   s>   |  D ]\}}||jvrtd| d| d|j dqdS )a  
        Ensure the columns required for the evaluation are present in the dataset.

        Args:
            data (`str` or [`Dataset`]):
                Specifies the dataset we will run evaluation on.
            columns_names (`List[str]`):
                List of column names to check in the dataset. The keys are the arguments to the [`evaluate.EvaluationModule.compute`] method,
                while the values are the column names to check.

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> from evaluate import evaluator
        >>> data = load_dataset("rotten_tomatoes', split="train")
        >>> evaluator.check_required_columns(data, {"input_column": "text", "label_column": "label"})
        ```
        z	Invalid `z` z8 specified. The dataset contains the following columns: .N)itemscolumn_namesr   )r   r8   r   
input_namecolumn_namer    r    r!   check_required_columns'  s   
z Evaluator.check_required_columnsc                 C   s*   |du rt | |}td|   |S )a  
        Infers which split to use if `None` is given.

        Args:
             data (`str`):
                Name of dataset.
             subset (`str`):
                Name of config for datasets with multiple configurations (e.g. 'glue/cola').
             split (`str`, defaults to `None`):
                Split to use.
        Returns:
            `split`: `str` containing which split to use

        Example:

        ```py
        >>> from evaluate import evaluator
        >>> evaluator("text-classification").get_dataset_split(data="rotten_tomatoes")
        WARNING:evaluate.evaluator.base:Dataset split not defined! Automatically evaluating with split: TEST
        'test'
        ```
        Nz@Dataset split not defined! Automatically evaluating with split: )r   rY   warningupperrp   r    r    r!   get_dataset_splitA  s   
zEvaluator.get_dataset_splitc                 C   sZ   t |tr| |||}t|||d}|S t |tr)|dus"|dur'td |S td)a  
        Load dataset with given subset and split.
        Args:
            data ([`Dataset`] or `str`, defaults to `None`):
                Specifies the dataset we will run evaluation on. If it is of
                type `str`, we treat it as the dataset name, and load it. Otherwise we assume it represents a pre-loaded dataset.
            subset (`str`, defaults to `None`):
                Specifies dataset subset to be passed to `name` in `load_dataset`. To be
                used with datasets with several configurations (e.g. glue/sst2).
            split (`str`, defaults to `None`):
                User-defined dataset split by name (e.g. train, validation, test). Supports slice-split (`test[:n]`).
                If not defined and data is a `str` type, will automatically select the best one via `choose_split()`.
        Returns:
            data ([`Dataset`]): Loaded dataset which will be used for evaluation.

        Example:

        ```py
        >>> from evaluate import evaluator
        >>> evaluator("text-classification").load_data(data="rotten_tomatoes", split="train")
        Dataset({
            features: ['text', 'label'],
            num_rows: 8530
        })
        ```
        )r{   rg   Nz=`data` is a preloaded Dataset! Ignoring `subset` and `split`.zXPlease specify a valid `data` object - either a `str` with a name or a `Dataset` object.)r   strr   r   r   rY   r   r   )r   r8   rf   rg   r    r    r!   rr   ^  s   


zEvaluator.load_datac                 O   s(   |  |||d d|| it||fS )a  
        Prepare data.

        Args:
            data ([`Dataset`]):
                Specifies the dataset we will run evaluation on.
            input_column (`str`, defaults to `"text"`):
                The name of the column containing the text feature in the dataset specified by `data`.
            second_input_column(`str`, *optional*):
                The name of the column containing the second text feature if there is one. Otherwise, set to `None`.
            label_column (`str`, defaults to `"label"`):
                The name of the column containing the labels in the dataset specified by `data`.
        Returns:
            `dict`:  metric inputs.
            `list`:  pipeline inputs.

        Example:

        ```py
        >>> from evaluate import evaluator
        >>> from datasets import load_dataset

        >>> ds = load_dataset("rotten_tomatoes", split="train")
        >>> evaluator("text-classification").prepare_data(ds, input_column="text", second_input_column=None, label_column="label")
        ```
        )rm   rn   
references)r   r   )r   r8   rm   rn   r1   r4   r    r    r!   rs     s   zEvaluator.prepare_data)PreTrainedTokenizerBaserk   c                 C   s   |du r|   }t|tst|tjst|tjr$t| j||||d}n|du r0t| j|d}n|}|dur?|dur?t	d |j| jkrY| jdkrP|j
dsYtd| j d|S )a  
        Prepare pipeline.

        Args:
            model_or_pipeline (`str` or [`~transformers.Pipeline`] or `Callable` or [`~transformers.PreTrainedModel`] or [`~transformers.TFPreTrainedModel`], defaults to `None`):
                If the argument in not specified, we initialize the default pipeline for the task. If the argument is of the type `str` or
                is a model instance, we use it to initialize a new [`~transformers.Pipeline`] with the given model. Otherwise we assume the
                argument specifies a pre-initialized pipeline.
            preprocessor ([`~transformers.PreTrainedTokenizerBase`] or [`~transformers.FeatureExtractionMixin`], *optional*, defaults to `None`):
                Argument can be used to overwrite a default preprocessor if `model_or_pipeline` represents a model for
                which we build a pipeline. If `model_or_pipeline` is `None` or a pre-initialized pipeline, we ignore
                this argument.
        Returns:
            The initialized pipeline.

        Example:

        ```py
        >>> from evaluate import evaluator
        >>> evaluator("text-classification").prepare_pipeline(model_or_pipeline="distilbert-base-uncased")
        ```
        N)modelrh   rj   r[   )r[   zUIgnoring the value of the preprocessor argument (`tokenizer` or `feature_extractor`).translationzZIncompatible `model_or_pipeline`. Please specify `model_or_pipeline` compatible with the `z` task.)r]   r   r   transformersrd   re   r   r   rY   r   
startswithr   )r   rc   rh   rj   r[   r~   r    r    r!   rt     s2   


"zEvaluator.prepare_pipelinec                 C   s>   |du r| j du rtdt| j }|S t|trt|}|S )a  
        Prepare metric.

        Args:
            metric (`str` or [`EvaluationModule`], defaults to `None`):
                Specifies the metric we use in evaluator. If it is of type `str`, we treat it as the metric name, and
                load it. Otherwise we assume it represents a pre-loaded metric.

        Returns:
            The loaded metric.

        Example:

        ```py
        >>> from evaluate import evaluator
        >>> evaluator("text-classification").prepare_metric("accuracy")
        ```
        NzW`Evaluator` doesn't specify a default metric. Please specify a valid `metric` argument.)r   r   r   r   r   )r   r5   r    r    r!   ru     s   


zEvaluator.prepare_metricc                 O   s6   t  }||i || j}t  }|| ||t|fS N)r   PIPELINE_KWARGSrO   rV   )r   r~   r1   r4   rF   pipe_outputrG   r    r    r!   rv     s   zEvaluator.call_pipelinerB   c                 C   s\   |j di || j}|dkr,| }| ||||||}	|D ]
}
||
 |	|
 d< q|	S |S )zCompute and return metrics.r   scoreNr    )r.   METRIC_KWARGSr0   rE   )r   r5   rB   rl   r&   r'   r(   r|   r%   rC   r3   r    r    r!   rx     s   
zEvaluator.compute_metricr   )r#   r$   N)NNNNNNNr`   r#   r$   NNra   rb   N)NN)NNN)r`   r#   r$   N)$__name__
__module____qualname____doc__r   r   r   r"   staticmethodr   rz   intr	   r   r   rE   rO   r]   r   r_   r
   r   r   r   r   r   r.   rq   r   r   rr   rs   rt   ru   rv   rx   r    r    r    r!   r   f   s    
%$



	


?
" (#
:
r   )*abcr   r   numbersr   typingr   r   r   r   r	   r
   datasetsr   r   evaluate.evaluator.utilsr   scipy.statsr   r   r   r   r   r   r   timer   typing_extensionsr   loadingr   moduler   utils.loggingr   utilsr   r   rY    EVALUTOR_COMPUTE_START_DOCSTRING"EVALUATOR_COMPUTE_RETURN_DOCSTRINGr   r    r    r    r!   <module>   s8    )