o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlm	  m
Z ddlmZ ejr<ddlZe jZnddlZejZdZdZdZd	Zd
d edD dd edD dZG dd dejjZdS )?Natural Questions: A Benchmark for Question Answering Research.    )absolute_import)division)print_functionNa  
@article{47761,
title	= {Natural Questions: a Benchmark for Question Answering Research},
author	= {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
year	= {2019},
journal	= {Transactions of the Association of Computational Linguistics}
}
a~  
The NQ corpus contains questions from real users, and it requires QA systems to
read and comprehend an entire Wikipedia article that may or may not contain the
answer to the question. The inclusion of real user questions, and the
requirement that solutions should read an entire page to find the answer, cause
NQ to be a more realistic and challenging task than prior QA datasets.
z7https://ai.google.com/research/NaturalQuestions/datasetz5https://storage.googleapis.com/natural_questions/v1.0c                 C      g | ]}d t |f qS )z%s/train/nq-train-%02d.jsonl.gz_BASE_DOWNLOAD_URL.0i r   l/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/question_answering/natural_questions.py
<listcomp>:       
r   2   c                 C   r   )z%s/dev/nq-dev-%02d.jsonl.gzr   r	   r   r   r   r   >   r      )train
validationc                   @   sB   e Zd ZdZejdZejdgZdd Z	dd Z
dd	 Zd
S )NaturalQuestionsr   z0.0.2z0.0.1c                 C   s   t jj| tt jtjt j t j t j t j	t j tj
ddt j t j	tjdt j	tjtjtjtjtjdt j	tjtjtjtjt j dt jjddgdd	d
d ttdS )Ntokenis_htmltitleurlhtmltokenstextr   start_token	end_token
start_byteend_byter    r!   r"   r#   r   NOYES)namesidlong_answershort_answersyes_no_answerr)   documentquestionannotations)builderdescriptionfeaturessupervised_keyshomepagecitation)tfdscoreDatasetInfo_DESCRIPTIONr3   FeaturesDicttfstringTextSequenceboolint64
ClassLabel_URL	_CITATION)selfr   r   r   _infoK   sN   
"zNaturalQuestions._infoc                 C   sB   | t}tjjtjjd|d idtjjtjjd|d idgS )zReturns SplitGenerators.	filepathsr   )name
gen_kwargsr   )download_DOWNLOAD_URLSr7   r8   SplitGeneratorSplitTRAIN
VALIDATION)rE   
dl_managerfilesr   r   r   _split_generatorsv   s   


z"NaturalQuestions._split_generatorsc                    s8   t jjj  fdd}| |B  j B  |B S )zBuild PCollection of examples.c                    s   t | }|d dfddfdd jjdd  t|d	 }|||d
 |d dd |d D d|d |d d fdd|d D dfS )z2Parse a single json line and emit an example dict.document_htmlutf-8c                    s\    | d | d  }| dd}|d}tddt|}| d | d	 | d | d |d
S )z"Extract text of short answer.r"   r#   s        rT   z	<([^>]*)> r    r!   r$   )replacedecoderesubhtml_unescape)	short_ans	ans_bytesr   )
html_bytesr   r   _parse_short_answer   s   
zXNaturalQuestions._build_pcollection.<locals>._parse_example.<locals>._parse_short_answerc                    sj   t | d | d d | d d | d d | d d d fdd	| d
 D | d dkr/ddS | d dS )Nannotation_idr*   r    r!   r"   r#   r   c                       g | ]} |qS r   r   )r
   ansr_   r   r   r          zjNaturalQuestions._build_pcollection.<locals>._parse_example.<locals>._parse_annotation.<locals>.<listcomp>r+   r,   NONEr(   )str)an_jsonrc   r   r   _parse_annotation   s   





zVNaturalQuestions._build_pcollection.<locals>._parse_example.<locals>._parse_annotationnqexamples
example_iddocument_titledocument_urlc                 S   s   g | ]}|d  |d dqS )r   
html_tokenr   r   )r
   tr   r   r   r      s    zONaturalQuestions._build_pcollection.<locals>._parse_example.<locals>.<listcomp>document_tokensr   question_textquestion_tokensr   c                    ra   r   r   )r
   rh   )ri   r   r   r      rd   r0   r-   )jsonloadsencodemetricsMetricscounterincrg   )lineex_jsonid_beam)ri   r_   r^   r   _parse_example   s,   


z;NaturalQuestions._build_pcollection.<locals>._parse_example)r7   r8   lazy_importsapache_beamCreateioReadAllFromTextMap)rE   pipelinerG   r   r   r~   r   _build_pcollection   s   
@z#NaturalQuestions._build_pcollectionN)__name__
__module____qualname____doc__r7   r8   VersionVERSIONSUPPORTED_VERSIONSrF   rR   r   r   r   r   r   r   E   s    +r   )r   
__future__r   r   r   rt   rY   sixtensorflow.compat.v2compatv2r<   tensorflow_datasets.public_api
public_apir7   PY2
HTMLParserhtml_parserunescaper[   r   rD   r:   rC   r   rangerK   r8   BeamBasedBuilderr   r   r   r   r   <module>   s4   	