o
    5tiG                     @   s  d dl Z d dlZd dlZd dlmZmZmZmZm	Z	m
Z
mZmZ d dlZddlmZmZmZ e jdkZedZG dd dZd	ejd
eeef fddZ	d%de	e	e
eef   deded
eee	e f fddZdejded
efddZ			d&deeeejef f dedee deeee   deeef dededee d
eeeeef f fdd Z			d'deeeejef f dedee deeee   deeef dededee d
eeeeef f fd!d"Z G d#d$ d$Z!dS )(    N)SequenceDictOptionalTupleListUnionAnyMapping   )MetricScore	Signaturent	sacrebleuc                
   @   sD   e Zd ZdZ		ddedee dee dee fddZd	d
 ZdS )Resulta=  A container to represent results from a particular statistical
    significance test.
    :param score: The floating point score for the system at hand.
    :param p_value: If exists, represents the p-value when the system at
    hand is compared to a baseline using a paired test.
    :param mean: When paired bootstrap test is applied, this represents
    the true mean score estimated from bootstrap resamples of the system.
    :param ci: When paired bootstrap test is applied, this represents
    the 95% confidence interval around the true mean score `sys_mean`.
    Nscorep_valuemeancic                 C   s   || _ || _|| _|| _d S )N)r   r   r   r   )selfr   r   r   r    r   J/home/ubuntu/.local/lib/python3.10/site-packages/sacrebleu/significance.py__init__   s   
zResult.__init__c                 C   s   d dd | j D S )N,c                 S   s"   g | ]\}}| d t | qS )=)str).0kvr   r   r   
<listcomp>#   s   " z#Result.__repr__.<locals>.<listcomp>)join__dict__itemsr   r   r   r   __repr__"   s   zResult.__repr__)NNN)__name__
__module____qualname____doc__floatr   r   r$   r   r   r   r   r      s    

r   scoresreturnc                 C   sP   t | } t| }|d }|| d }| | | | }}d||  }|  |fS )zTakes a list of scores and returns mean and 95% confidence
    interval around the mean.

    :param scores: A list of floating point scores.
    :return: A tuple of mean and the 95% CI.
    (   r
   g      ?)npsortlenr   )r*   n	lower_idx	upper_idxlowerupperr   r   r   r   estimate_ci&   s   
r5     statsmetric	n_samplesc           	         s   t jdd}| dkrdnt|}tj|}|jt	| |t	| fdd}tj
| dd} fd	d
|| D }t| |fS )a  Performs bootstrap resampling for a single system to estimate
    a confidence interval around the true mean.
    :param stats: A list of statistics extracted from the system's hypotheses.
    :param metric: The `Metric` instance to be used for score computation.
    :n_samples: Number of bootstrap resamples to use.

    :return: A tuple of the seed choice as string and the list of `Score`
    instances for all bootstrap resamples.
    SACREBLEU_SEED12345noneNTsizereplacefloat32dtypec                    s   g | ]
}  |d qS r   )_compute_score_from_statssumr   _sr8   r   r   r   S   s    z'_bootstrap_resample.<locals>.<listcomp>)osenvirongetr3   intr-   randomdefault_rngchoicer/   arrayr   )	r7   r8   r9   seed_seedrngidxsstats_npr*   r   rH   r   _bootstrap_resample9   s   
rV   real_differencec                 C   s*   t | |k }|d t| d  }|S )zComputes the p-value given the sample statistics and the real statistic.
    :param stats: A numpy array with the sample statistics.
    :real_difference: The real statistic.
    :return: The p-value.
    r
   )r-   rE   itemr/   )r7   rW   cpr   r   r   _compute_p_valueY   s   r[   '  baseline_infosys_name
hypotheses
referencesmetricsn_ar_confidencerQ   c                    s  t j|}|jd|t|ftd}	|	 }
|dkr'|jt||t|fdd}i }| D ]\} | | \}}t	d| d|d  
||} |}t|j|j }t	d	| d
 |	| |
|  }|
| |	|  }t  fdd|dddf D }t  fdd|dddf D }tt t |t | |}t|j|}|dkrt	d| d
 t j|dd}t  fdd|| D }t|\|_|_|||< q-||fS )a>  Paired two-sided approximate randomization (AR) test for MT evaluation.

    :param baseline_info: A dictionary with `Metric` instances as the keys,
    that contains sufficient statistics and a `Result` instance for the baseline system.
    :param sys_name: The name of the system to be evaluated.
    :param hypotheses: A sequence of string hypotheses for the system.
    :param references: A sequence of reference documents with document being
    defined as a sequence of reference strings. If `None`, references
    will be used through each metric's internal cache.
    :param metrics: A dictionary of `Metric` instances that will be computed
    for each system.
    :param n_samples: The number of AR trials.
    :param n_ar_confidence: The number of bootstrap resamples to use for
    confidence estimation. A value of -1 disables confidence estimation.
    :param seed: The seed value for the RNG. If `None`, the RNG will not be
    fixed to a particular seed.

    :return: A tuple with first element being the system name and the second
    being a `Result` namedtuple.
       )r>   rB   r   Tr=   
Computing  for % and extracting sufficient statisticsz8 > Performing approximate randomization test (# trials: )c                       g | ]}  |jqS r   _aggregate_and_computer   r   xrH   r   r   r          z#_paired_ar_test.<locals>.<listcomp>Nc                    ri   r   rj   rl   rH   r   r   r      rn   zI > Performing bootstrap resampling for confidence interval (# resamples: r@   rA   c                       g | ]}  |d jqS rC   rD   rE   r   rF   rH   r   r   r      s    )r-   rM   rN   integersr/   boolrO   r"   sacreloggerinfo_extract_corpus_statisticsrk   absr   rP   r[   r   r5   r   r   )r^   r_   r`   ra   rb   r9   rc   rQ   rS   pos_selneg_selbs_idxsresultsnamebl_stats	bl_result	sys_stats	sys_scorediffshuf_ashuf_bscores_ascores_brZ   res
sys_scoresr   rH   r   _paired_ar_testp   sH   

r   c                    s:  t j|}i }	|jt||t|fdd}
| D ]~\} | | \}}td| d|d  ||} 	|}t j
|dd}t j
|dd}t|j|j }td| d	 t 
 fd
d||
 D }t 
 fdd||
 D }t|\}}t || }||  }t||}t|j||||	|< q||	fS )a  Paired bootstrap resampling test for MT evaluation. This function
    replicates the behavior of the Moses script called
    `bootstrap-hypothesis-difference-significance.pl`.

    :param baseline_info: A dictionary with `Metric` instances as the keys,
    that contains sufficient statistics and a `Result` instance for the baseline system.
    :param sys_name: The name of the system to be evaluated.
    :param hypotheses: A sequence of string hypotheses for the system.
    :param references: A sequence of reference documents with document being
    defined as a sequence of reference strings. If `None`, references
    will be used through each metric's internal cache.
    :param metrics: A dictionary of `Metric` instances that will be computed
    for each system.
    :param n_samples: The number of bootstrap resamples.
    :param n_ar_confidence: This parameter is not used for this function but
    is there for signature compatibility in the API.
    :param seed: The seed value for the RNG. If `None`, the RNG will not be
    fixed to a particular seed.

    :return: A tuple with first element being the system name and the second
    being a `Result` namedtuple.
    Tr=   re   rf   rg   r@   rA   z= > Performing paired bootstrap resampling test (# resamples: rh   c                    ro   rC   rp   rF   rH   r   r   r         z#_paired_bs_test.<locals>.<listcomp>c                    ro   rC   rp   rF   rH   r   r   r     r   )r-   rM   rN   rO   r/   r"   rs   rt   ru   rk   rP   rv   r   r5   r   r[   r   )r^   r_   r`   ra   rb   r9   rc   rQ   rS   rz   rT   r{   r|   r}   r~   r   r   	scores_bl
scores_syssys_meansys_cisample_diffsr7   rZ   r   rH   r   _paired_bs_test   s4   

r   c                   @   s   e Zd ZdZdddZ				dd	eeeee f  d
e	ee
f deeee   dedededefddZdeeeef eeeeeef  f f fddZdS )
PairedTesta
  This is the manager class that will call the actual standalone implementation
    for approximate randomization or paired bootstrap resampling, based on the
    `test_type` argument.

    :param named_systems: A lisf of (system_name, system_hypotheses) tuples on
    which the test will be applied.
    :param metrics: A dictionary of `Metric` instances that will be computed
    for each system.
    :param references: A sequence of reference documents with document being
    defined as a sequence of reference strings. If `None`, already cached references
    will be used through each metric's internal cache.
    :param test_type: `ar` for approximate randomization, `bs` for paired bootstrap.
    :param n_samples: The number of AR trials (for `ar`) or bootstrap resamples (for `bs`).
    The defaults (10000 or 1000 respectively) will be used if 0 is passed.
    :param n_ar_confidence: If `approximate randomization` is selected, the number
    of bootstrap resamples to use for confidence estimation. A value of -1 disables
    confidence estimation. 0 will use the default of 1000.
    :param n_jobs: If 0, a worker process will be spawned for each system variant.
    If > 0, the number of workers will be set accordingly. The default of 1
    does not use multi-processing.
    r\   r6   arbsr   r   r]   r
   named_systemsrb   ra   	test_typer9   rc   n_jobsc                 C   s8  |dv sJ d||| _ | j dkrt| _n| j dkrt| _tjdd}| dkr.d nt|| _	|| _
|| _|| _|dkrB|n| jd | _|dkrN|n| j| j  | _t|d	 | _trgtd
 d	| _
n| j
dkrt d }	|	dkrzd	| _
nt|	| j| _
i | _i | _i | _| jd \}
}| D ]\}}td| d|
 ||| j}||}| j dkr| jn| j}d\}}|dkrt|||\}}t t!"dd |D \}}t#|j$||d}||f| j|j%< || j|j%< |& }|'dt(| j	  |'| j | j | jdkr|'d| j || j|j%< qd S )Nr   zUnknown test type r   r   r:   r;   r<   r   r
   z,Parallel tests are not supported on Windows.rd   zPre-computing z statistics for )NNc                 S   s   g | ]}|j qS r   )r   rl   r   r   r   r   v  s    z'PairedTest.__init__.<locals>.<listcomp>)r   r   rQ   ))r   r   _fnr   rI   rJ   rK   r3   rL   rR   r   ra   r   _DEFAULT_SAMPLESrc   r9   r/   	n_systems
IS_WINDOWSrs   warningmp	cpu_countmin_signatures_baseline_inforb   r"   rt   ru   rk   rV   r5   r-   rP   r   r   r{   get_signatureupdater   )r   r   rb   ra   r   r9   rc   r   rQ   
n_max_jobsbl_namebl_hypsr{   r8   r|   bl_scoreconfidence_nbl_meanbl_ci_	bl_scoresresultsigr   r   r   r   /  sf   





zPairedTest.__init__r+   c                    sP  g }i }dd j D |d< j D ]\}\}}|g||< qtj dd D ] \}\}}jr4jnd}	|j||jjjj	|	f q(j
dkrXfdd|D }
n4tdj
 d td	j
  fd
d|D }dd |D }
W d   n1 sw   Y  |
D ]\}}| D ]\}}|| | qqj|fS )zCRuns the paired test either on single or multiple worker processes.c                 S   s   g | ]}|d  qS rC   r   )r   nsr   r   r   r         z'PairedTest.__call__.<locals>.<listcomp>Systemr
   Nc                    s   g | ]} j | qS r   )r   r   argsr#   r   r   r     s    z
Launching z parallel workers.forkc                    s   g | ]	}  j|qS r   )apply_asyncr   r   poolr   r   r   r     s    c                 S   s   g | ]}|  qS r   )rK   )r   jr   r   r   r     r   )r   r   r"   	enumeraterR   appendra   rb   r9   rc   r   rs   rt   r   get_contextPoolr   )r   tasksr*   r8   r   r   idxr{   hypsrQ   rz   jobsr_   sys_results_resultr   r   r   __call__  s0   

zPairedTest.__call__N)r   r   r]   r
   )r%   r&   r'   r(   r   r   r   r   r   r	   r   r   rL   r   r   r   r   r   r   r   r   r   r   r     s.    

6Yr   )r6   )r\   r]   N)r6   r]   N)"rI   loggingmultiprocessingr   typingr   r   r   r   r   r   r   r	   numpyr-   metrics.baser   r   r   r{   r   	getLoggerrs   r   ndarrayr)   r5   rL   r   rV   r[   r   r   r   r   r   r   r   <module>   sx    (


 

]

L