o
    Ni2                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlm  mZ	 ddl
mZ ddlm  mZ ddlmZ dZdZd	Zd
dddddddddddddddZdd ZG dd dejjZG dd dejjZdS ) z>QA4MRE (CLEF 2011/2012/2013): a reading comprehension dataset.    )absolute_import)division)print_functionN)loggingaN  
@InProceedings{10.1007/978-3-642-40802-1_29,
author="Pe{\~{n}}as, Anselmo
and Hovy, Eduard
and Forner, Pamela
and Rodrigo, {\'A}lvaro
and Sutcliffe, Richard
and Morante, Roser",
editor="Forner, Pamela
and M{\"u}ller, Henning
and Paredes, Roberto
and Rosso, Paolo
and Stein, Benno",
title="QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation",
booktitle="Information Access Evaluation. Multilinguality, Multimodality, and Visualization",
year="2013",
publisher="Springer Berlin Heidelberg",
address="Berlin, Heidelberg",
pages="303--320",
abstract="This paper describes the methodology for testing the performance of Machine Reading systems through Question Answering and Reading Comprehension Tests. This was the attempt of the QA4MRE challenge which was run as a Lab at CLEF 2011--2013. The traditional QA task was replaced by a new Machine Reading task, whose intention was to ask questions that required a deep knowledge of individual short texts and in which systems were required to choose one answer, by analysing the corresponding test document in conjunction with background text collections provided by the organization. Four different tasks have been organized during these years: Main Task, Processing Modality and Negation for Machine Reading, Machine Reading of Biomedical Texts about Alzheimer's disease, and Entrance Exams. This paper describes their motivation, their goals, their methodology for preparing the data sets, their background collections, their metrics used for the evaluation, and the lessons learned along these three years.",
isbn="978-3-642-40802-1"
}
a  
QA4MRE dataset was created for the CLEF 2011/2012/2013 shared tasks to promote research in 
question answering and reading comprehension. The dataset contains a supporting 
passage and a set of questions corresponding to the passage. Multiple options 
for answers are provided for each question, of which only one is correct. The 
training and test datasets are available for the main track.
Additional gold standard documents are available for two pilot studies: one on 
alzheimers data, and the other on entrance exams data.
z}http://nlp.uned.es/clef-qa/repository/js/scripts/downloadFile.php?file=/var/www/html/nlp/clef-qa/repository/resources/QA4MRE/mainz52011/Training_Data/Goldstandard/QA4MRE-2011-{}_GS.xml)DEENESITRO)_TRACKS_PATH_TMPL_MAIN_GS_LANGUAGES_MAIN)r   
alzheimerszR2012/Main_Task/Training_Data/Goldstandard/Used_in_Evaluation/QA4MRE-2012-{}_GS.xml)ARBGr   r   r	   r
   r   zd2012/Pilot_Tasks/Biomedical_About_Alzheimer/Training_Data/Goldstandard/QA4MRE-2012_BIOMEDICAL_GS.xml)r   r   r   _PATH_ALZHEIMER)r   r   entrance_examz?2013/Main_Task/Training_Data/Goldstandard/QA4MRE-2013-{}_GS.xml)r   r   r   r	   r   zU2013/Biomedical_About_Alzheimer/Training_Data/Goldstandard/QA4MRE-2013_BIO_GS-RUN.xmlzN2013/Entrance_Exams/Training_Data/Goldstandard/qa4mre-exam-test-withanswer.xml)r   r   r   r   _PATH_ENTRANCE_EXAM)201120122013c                 C   s   |j d }|dD ]}|j}q
t }	|dD ]}
|
j d }|
j}|	||d d|
j v r3|}|}qd| |||g}td| | |||||||	||d	
}||fS )
a/  Gets instance ID and features for every question.

  Args:
    topic_id: string
    topic_name: string
    test_id: string
    document_id: string
    document_str: string
    question: XML element for question

  Returns:
    id_: string. Unique ID for instance.
    feats: dict of instance features
  q_idq_stranswera_id	answer_id
answer_strcorrect_zID: %s
topic_id
topic_nametest_iddocument_iddocument_strquestion_idquestion_stranswer_optionscorrect_answer_idcorrect_answer_str)attribitertextlistappendjoinr   info)r"   r#   r$   r%   r&   questionr'   q_textr(   possible_answersr   r   r   r*   r+   id_feats r8   S/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/qa4mre.py_get_question_   s4   


r:   c                       s*   e Zd ZdZejjd fdd	Z  ZS )Qa4mreConfigzBuilderConfig for Qa4mre.r   r   c                    s   |  t| d vrtdt| d |  dkr&| dkr&td d}|  dkr?| t| d vr?tdt| d || _|  | _| | _| jd | j d | j }t	}|d	
| j| j| j7 }tt| jd||tjd
d| dS )a  BuilderConfig for Qa4Mre.

    Args:
      year: string, year of dataset
      track: string, the task track from PATHS[year]['_TRACKS'].
      language: string, Acronym for language in the main task.
      **kwargs: keyword arguments forwarded to super.
    r   z7Incorrect track. Track should be one of the following: r   r   zNOnly English documents available for pilot tracks. Setting English by default.r   z8Incorrect language for the main track. Correct options: .zDThis configuration includes the {} track for {} language in {} year.z0.1.0)namedescriptionversionNr8   )lowerPATHS
ValueErrorupperr   warnyeartracklang_DESCRIPTIONformatsuperr;   __init__tfdscoreVersion)selfrE   rF   languagekwargsr=   r>   	__class__r8   r9   rK      s<   








zQa4mreConfig.__init__)r   r   )	__name__
__module____qualname____doc__rL   rM   disallow_positional_argsrK   __classcell__r8   r8   rR   r9   r;      s    r;   c                   @   s  e Zd ZdZeddddeddddeddddeddddeddd	ded
ddded
ddded
ddded
ddded
ddded
ddded
dd	ded
dddeddddeddddeddddeddddeddd	deddddeddddgZdd Zdd Zdd ZdS )Qa4mrez7QA4MRE dataset from CLEF shared tasks 2011, 2012, 2013.r   r   r   )rE   rF   rP   r   r	   r
   r   r   r   r   r   r   r   c                 C   s   t jj| tt jt j t j t j t j t j t j t j t jt j t j dt j t j d
d dtdS )Nr   r!   z7http://nlp.uned.es/clef-qa/repository/pastCampaigns.php)builderr>   featuressupervised_keyshomepagecitation)	rL   rM   DatasetInforH   r\   FeaturesDictTextSequence	_CITATION)rO   r8   r8   r9   _info   s,   zQa4mre._infoc              	   C   s   | j }t }|jdkr#tjtt|j d 	|j
|d	|j|j
< |jdv r?|jdkr?tjtt|j d |d	|j< |jdkrW|jd	krWtjtt|j d
 |d< ||}tjjtjjd|d	|j|j|j
 idgS )zReturns SplitGenerators.r   r   z
{}.main.{})r   r   r   r   z{}.alzheimers.ENr   r   r   z2013.entrance_exam.ENfilepathz{}.{}.{})r=   
gen_kwargs)builder_configdictrF   ospathr1   	_BASE_URLrA   rE   rI   rG   download_and_extractrL   rM   SplitGeneratorSplitTRAIN)rO   
dl_managercfgdownload_urlsdownloaded_filesr8   r8   r9   _split_generators  s0   

zQa4mre._split_generatorsc                 c   s    t jj|dM}t|}| }|D ]8}|jd }|jd }|D ])}|jd }	|dD ]
}
|
jd }|
j	}q/|dD ]}t
|||	|||V  q?q#qW d   dS 1 sYw   Y  dS )	zYields examples.rbt_idt_namer_iddocd_idqN)tfiogfileGFileETparsegetrootr,   r-   r.   r:   )rO   rf   ftreeroottopicr"   r#   testr$   documentr%   r&   r3   r8   r8   r9   _generate_examples/  s(   





"zQa4mre._generate_examplesN)	rT   rU   rV   rW   r;   BUILDER_CONFIGSre   ru   r   r8   r8   r8   r9   rZ      s    +'rZ   )rW   
__future__r   r   r   rj   xml.etree.ElementTreeetreeElementTreer   abslr   tensorflow.compat.v2compatv2r}   tensorflow_datasets.public_api
public_apirL   rd   rH   rl   rA   r:   rM   BuilderConfigr;   GeneratorBasedBuilderrZ   r8   r8   r8   r9   <module>   s>   
	0.