o
    7ti9                     @   sP  d dl Z d dlmZ d dlmZ d dlZd dlm  m	  m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d
Zdd Zdd Zdd Zdd ZG dd deZG dd deZG dd deZG dd deZG dd deZ G dd deZ!G dd  d eZ"G d!d" d"eZ#G d#d$ d$eZ$G d%d& d&eZ%dS )'    N)abstractmethod)reduce)Dataset)load)AutoTokenizerInstancemean)ConfigurableTaska  
@inproceedings{shaham-etal-2022-scrolls,
    title = "{SCROLLS}: Standardized {C}ompa{R}ison Over Long Language Sequences",
    author = "Shaham, Uri  and
      Segal, Elad  and
      Ivgi, Maor  and
      Efrat, Avia  and
      Yoran, Ori  and
      Haviv, Adi  and
      Gupta, Ankit  and
      Xiong, Wenhan  and
      Geva, Mor  and
      Berant, Jonathan  and
      Levy, Omer",
    booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, United Arab Emirates",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.emnlp-main.823",
    pages = "12007--12021"
}
c                  C   s`   dd l } dd l}ddlm} |ddddd}| j|| j|dd	 d
 }||| |S )Nr   )hf_hub_downloadtau/scrollsdatasetzmetrics/scrolls.pyz	refs/pr/5)repo_id	repo_typefilenamerevision._z.py)	osshutilhuggingface_hubr   pathdirnamebasenamereplacecopy)r   r   r   scrolls_metric_pathupdated_scrolls_metric_path r   N/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/tasks/scrolls/task.py_download_metric-   s"   
r!   c                 C   sD   | d }| d}| d | d || d |d| ||d d  dS )	Ninput

idpidoutputsr      )r$   r%   r"   r&   questiontext)find)docr"   splitr   r   r    _process_doc_prepended_questionB   s   

r-   c                 C   s   g }i }g }t t| d | d D ]%\}\}}||v r%|||  | q|| t|||< ||g q| | } | d} | d|} | S )Nr$   outputr&   )	enumeratezipappendlenselectflatten_indicesremove_columns
add_column)untokenized_datasetindices_to_keep	id_to_idxr&   iid_r.   r   r   r    _drop_duplicates_in_inputR   s    

r<   c                  C   s@   zdd l } | jddW S  ty   dd l}t|d Y S w )Nr   F)logical)psutil	cpu_countImportErrorr   r2   sched_getaffinity)r>   r   r   r   r    _num_cpu_coresg   s   rB   c                       s   e Zd ZdZdZdZdZdZdZd& fdd	Z	dd Z
dd	 Zd
d Zdd Zdd Zdd Zdd Z fddZdd Zdd Zdd Zdd Zdd Zed d! Zd"d# Zd$d% Z  ZS )'_SCROLLSTaskr'   r   Nc                    s<   t  jdd| jiid | jd urtt | jd| _d S d S )Nmetadataversion)config)config_name)super__init__VERSIONDATASET_NAMEr   r!   metric)selfrF   	__class__r   r    rI   {   s   
z_SCROLLSTask.__init__c                 C      dS NTr   rM   r   r   r    has_training_docs      z_SCROLLSTask.has_training_docsc                 C   rP   rQ   r   rR   r   r   r    has_validation_docs   rT   z _SCROLLSTask.has_validation_docsc                 C   rP   NFr   rR   r   r   r    has_test_docs   rT   z_SCROLLSTask.has_test_docsc                    D   t t| j| jd  dd  D   fdd d D }t|S )Ntrainc                 S      g | ]	}|D ]}|qqS r   r   .0sublistitemr   r   r    
<listcomp>       z._SCROLLSTask.training_docs.<locals>.<listcomp>c                        i | ]   fd dD qS )c                       g | ]}|  qS r   r   r\   dkeyr   r    r_          z9_SCROLLSTask.training_docs.<locals>.<dictcomp>.<listcomp>r   r\   processed_docsre   r    
<dictcomp>       z._SCROLLSTask.training_docs.<locals>.<dictcomp>r   listmap_process_docr   r   	from_dictrM   processed_dictr   ri   r    training_docs      

z_SCROLLSTask.training_docsc                    rX   )N
validationc                 S   rZ   r   r   r[   r   r   r    r_      r`   z0_SCROLLSTask.validation_docs.<locals>.<listcomp>c                    ra   )c                    rb   r   r   rc   re   r   r    r_      rg   z;_SCROLLSTask.validation_docs.<locals>.<dictcomp>.<listcomp>r   rh   ri   re   r    rk      rl   z0_SCROLLSTask.validation_docs.<locals>.<dictcomp>r   rm   rr   r   ri   r    validation_docs   ru   z_SCROLLSTask.validation_docsc                 C   rP   rQ   r   rR   r   r   r    should_decontaminate   rT   z!_SCROLLSTask.should_decontaminatec                 C   s   |d S )Nr"   r   rM   r+   r   r   r    doc_to_decontamination_query      z)_SCROLLSTask.doc_to_decontamination_queryc                    sT   t  j|i | | jd= | jD ]}t| j| | j|< q| jd ur(|   d S d S )Ntest)rH   downloadr   r<   PRUNE_TOKENIZERSprune)rM   argskwargsr,   rN   r   r    r}      s   

z_SCROLLSTask.downloadc                 C   s   |  | |d S )Nr   )doc_to_textrp   )rM   sampler   r   r    _get_prune_text      z_SCROLLSTask._get_prune_textc                    s<   dd j D i   fdd}jj|jd_dS )zCreate a pruned version of a SCROLLS task dataset containing only inputs
        that are less than `max_tokens` when tokenized by each tokenizer
        c                 S      g | ]}t |qS r   )r   from_pretrained)r\   	tokenizerr   r   r    r_          z&_SCROLLSTask.prune.<locals>.<listcomp>c                    sZ    | } |d }|d u r+D ]}t||jjkr$d |<  dS qd |< dS |S )NFT)r   getr2   	input_idsPRUNE_MAX_TOKENS)r   r)   cachedr   cacherM   
tokenizersr   r    _filter   s   
z#_SCROLLSTask.prune.<locals>._filter)num_procN)r~   r   filterPRUNE_NUM_PROC)rM   r   r   r   r    r      s   z_SCROLLSTask.prunec                 C   s   dd |d  S )N z, r&   )joinry   r   r   r    doc_to_target   s   z_SCROLLSTask.doc_to_targetc                 C      |d  d|d  dS Nr)   z

Question: r(   z
Answer:r   ry   r   r   r    r         z_SCROLLSTask.doc_to_textc                 C   s   dd |    D S )Nc                 S   s   i | ]}|d qS )Tr   )r\   xr   r   r    rk      s    z1_SCROLLSTask.higher_is_better.<locals>.<dictcomp>)_scrolls_metricskeysrR   r   r   r    higher_is_better   s   z_SCROLLSTask.higher_is_betterc                 C      d S Nr   rR   r   r   r    r      s   z_SCROLLSTask._scrolls_metricsc                    s    fdd}|S )Nc                    s$   t |  \}} jj||d}| S )N)predictions
references)r0   rL   compute)samplesr   r   computedrM   valuer   r    compute_metrics   s
   z;_SCROLLSTask._make_compute_metrics.<locals>.compute_metricsr   )rM   r   r   r   r   r    _make_compute_metrics   s   z"_SCROLLSTask._make_compute_metricsc                    s    fdd    D S )Nc                    s   i | ]
\}}|  |qS r   )r   )r\   rf   r   rR   r   r    rk      s    
z,_SCROLLSTask.aggregation.<locals>.<dictcomp>)r   itemsrR   r   rR   r    aggregation   s   

z_SCROLLSTask.aggregationr   )__name__
__module____qualname__rJ   DATASET_PATHrK   r~   r   r   rI   rS   rU   rW   rt   rw   rx   rz   r}   r   r   r   r   r   r   r   r   r   __classcell__r   r   rN   r    rC   s   s2    

rC   c                   @   s@   e Zd Zdd Zdd Zdd Zdd Zd	d
 Z	dddZdS )_SCROLLSMultipleChoiceTaskc                 C   s
   d | _ d S r   )rL   rR   r   r   r    __post_init__      
z(_SCROLLSMultipleChoiceTask.__post_init__c                 C   r   r   r   rR   r   r   r    r      rT   z+_SCROLLSMultipleChoiceTask._scrolls_metricsc                 C   s   t t t dS )Nemaccacc_normr	   rR   r   r   r    r         z&_SCROLLSMultipleChoiceTask.aggregationc                 C   s   ddddS )NTr   r   rR   r   r   r    r      r   z+_SCROLLSMultipleChoiceTask.higher_is_betterc           	      C   sl   |d }t | \}}t||krdnd}tdd |d D }t|| |kr,dnd}|||d dS )	Ngoldg      ?g        c                 S   s   g | ]}t t|qS r   )floatr2   )r\   r:   r   r   r    r_      s    z>_SCROLLSMultipleChoiceTask.process_results.<locals>.<listcomp>choicesg      Y@)r   r   r   )r0   npargmaxarray)	rM   r+   resultsr   llsr   r   completion_lenr   r   r   r    process_results   s   z*_SCROLLSMultipleChoiceTask.process_resultsNFc                    s$    fddt d D }|S )Nc              
      sD   g | ]\}}t dd  sd|fnd|f|dqS )loglikelihoodz {}z{}request_typer+   	argumentsidxr   )r   format)r\   r:   choiceapply_chat_templatectxr+   r   r   r    r_     s    
zA_SCROLLSMultipleChoiceTask.construct_requests.<locals>.<listcomp>r   )r/   )rM   r+   r   chat_templater   r   request_listr   r   r    construct_requests  s   

z-_SCROLLSMultipleChoiceTask.construct_requestsrV   )	r   r   r   r   r   r   r   r   r   r   r   r   r    r      s    r   c                   @   s8   e Zd Zdd Zdd Zdd Z	dd	d
Zdd ZdS )_SCROLLSSummaryTaskc                 C   s   |gS r   r   ry   r   r   r    rp     s   z _SCROLLSSummaryTask._process_docc                 C   s   ddddS )Nzrouge/rouge1zrouge/rouge2zrouge/rougeLrouge1rouge2rougeLr   rR   r   r   r    r     s   z$_SCROLLSSummaryTask._scrolls_metricsc                 C   s0   |d |d f|d |d f|d |d fdS )Nr   r&   r   r   rM   r+   r   r   r   r    r      s   z#_SCROLLSSummaryTask.process_resultsNFc                 K   "   t dd||ddgifdd|S Ngenerate_untiluntil
r   r   r   r   rM   r+   r   r   r   r   r   r   r    r   '     z&_SCROLLSSummaryTask.construct_requestsc                 C   s   |d  dS )Nr"   z<

Question: What is a summary of the preceding text?
Answer:r   ry   r   r   r    r   2  s   z_SCROLLSSummaryTask.doc_to_textrV   )r   r   r   rp   r   r   r   r   r   r   r   r    r     s    
r   c                   @   s8   e Zd ZdZdZdd Zdd Zdd Z	
dddZd	S )Qasperz|A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
    https://arxiv.org/abs/2105.03011
    qasperc                 C   s&   t |}tdd |d d|d< |gS )Nc                 S   s   | ot |dv S )N)yesno)squad_metricsnormalize_answer)prevcurr   r   r    <lambda>@  s    z%Qasper._process_doc.<locals>.<lambda>r&   T	is_yes_no)r-   r   ry   r   r   r    rp   =  s   zQasper._process_docc                 C      ddiS Nf1r   rR   r   r   r    r   G  r{   zQasper._scrolls_metricsc                 C   sT   |d r|d |d krdnd}nt |d  dkrd}n|d }d||d fiS )	Nr   r       yes noUnanswerabler   r&   )r2   strip)rM   r+   r   
predictionr   r   r    r   J  s   zQasper.process_resultsNFc                 K   sz   |d r,t dd||s|dfn|dfdd|t dd||s!|dfn|dfd	d|gS t dd
||ddgifdd|S )Nr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r   S  s8   
	zQasper.construct_requestsrV   )	r   r   r   __doc__rK   rp   r   r   r   r   r   r   r    r   6  s    

r   c                   @   s2   e Zd ZdZdZedZedd Z	dd Z
dS )	QuALITYzaQuALITY: Question Answering with Long Input Texts, Yes!
    https://arxiv.org/abs/2112.08608
    qualityz *\([A-D]\) *c                 C   s   d |   S )Nr   )r   r,   r   )r)   r   r   r    _normalize_answer{  s   zQuALITY._normalize_answerc                 C   s   t |}|d d|d d}|d d | }|d |d   |d< dd ttj|dd  D |d< |d t|d d	 |d
< |gS )Nr)   r#   z(D)c                 S   r   r   )r   r   )r\   r   r   r   r    r_     r   z(QuALITY._process_doc.<locals>.<listcomp>r   r   r&   r   r   )	r-   r*   r   rer,   r   _multiple_choice_patternindexr   )rM   r+   r,   choices_textr   r   r    rp     s   
 zQuALITY._process_docN)r   r   r   r   rK   r   compiler   staticmethodr   rp   r   r   r   r    r   s  s    

r   c                   @   s@   e Zd ZdZdZdd Zdd Zdd Zd	d
 Z	dddZ	dS )NarrativeQAzYThe NarrativeQA Reading Comprehension Challenge
    https://arxiv.org/abs/1712.07040
    narrative_qac                 C   
   t |gS r   r-   ry   r   r   r    rp     r   zNarrativeQA._process_docc                 C   r   r   r   rR   r   r   r    r     r{   zNarrativeQA._scrolls_metricsc                 C   s   |  |d d S )Nr   r)   )rp   ry   r   r   r    r     s   zNarrativeQA._get_prune_textc                 C   s   d|d |d fiS )Nr   r   r&   r   r   r   r   r    r     r   zNarrativeQA.process_resultsNFc                 K   r   r   r   r   r   r   r    r     r   zNarrativeQA.construct_requestsrV   )
r   r   r   r   rK   rp   r   r   r   r   r   r   r   r    r     s    r   c                   @   s,   e Zd ZdZdZg dZdd Zdd ZdS )	ContractNLIz|ContractNLI: A Dataset for Document-level Natural Language Inference for Contracts
    https://arxiv.org/abs/1712.07040
    contract_nli)zNot mentioned
EntailmentContradictionc                 C   s0   t |}tj|d< tj|d d |d< |gS )Nr   r&   r   r   )r-   r   CHOICESr   ry   r   r   r    rp     s   
zContractNLI._process_docc                 C   r   )Nr)   z

Hypothesis: r(   z
Conclusion:r   ry   r   r   r    r     r   zContractNLI.doc_to_textN)r   r   r   r   rK   r  rp   r   r   r   r   r    r     s    r   c                   @      e Zd ZdZdZdS )	GovReportae  Efficient Attentions for Long Document Summarization
    https://arxiv.org/abs/2104.02112

    Note: The average length of the reference summaries is ~3,000
    characters, or ~600 tokens as tokenized by GPT-NeoX. For causal models,
    it is recommended to set `max_gen_toks` sufficiently large (e.g. 1024)
    to allow a full summary to be generated.
    
gov_reportNr   r   r   r   rK   r   r   r   r    r    s    	r  c                   @   r  )SummScreenFDzhSummScreen: A Dataset for Abstractive Screenplay Summarization
    https://arxiv.org/abs/2104.07091
    summ_screen_fdNr  r   r   r   r    r    s    r  c                   @   s$   e Zd ZdZdZdd Zdd ZdS )QMSumzxQMSum: A New Benchmark for Query-based Multi-domain
    Meeting Summarization

    https://arxiv.org/abs/2104.05938
    qmsumc                 C   r   r   r   ry   r   r   r    rp     r   zQMSum._process_docc                 C   r   r   r   ry   r   r   r    r     r   zQMSum.doc_to_textN)r   r   r   r   rK   rp   r   r   r   r   r    r	    s
    r	  )&r   abcr   	functoolsr   numpyr   'transformers.data.metrics.squad_metricsdatametricsr   datasetsr   evaluater   transformersr   lm_eval.api.instancer   lm_eval.api.metricsr
   lm_eval.api.taskr   	_CITATIONr!   r-   r<   rB   rC   r   r   r   r   r   r   r  r  r	  r   r   r   r    <module>   s4    u-!=#