o
    Niu/                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlmZ ddl	Z	ddl
m  mZ ddlmZ dZdZd	Zd
ZdZdZdZdZdZdZdZdd Zdd ZG dd dejjZ G dd dejj!Z"dS )z*TriviaQA: A Reading Comprehension Dataset.    )absolute_import)division)print_functionN)logginga  
@article{2017arXivtriviaqa,
       author = {{Joshi}, Mandar and {Choi}, Eunsol and {Weld},
                 Daniel and {Zettlemoyer}, Luke},
        title = "{triviaqa: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}",
      journal = {arXiv e-prints},
         year = 2017,
          eid = {arXiv:1705.03551},
        pages = {arXiv:1705.03551},
archivePrefix = {arXiv},
       eprint = {1705.03551},
}
z=http://nlp.cs.washington.edu/triviaqa/data/triviaqa-{}.tar.gzz*-train.jsonz
*-dev.jsonz*test-without-answers.jsonzevidence/webzevidence/wikipediaaG  TriviaqQA is a reading comprehension dataset containing over 650K
question-answer-evidence triples. TriviaqQA includes 95K question-answer
pairs authored by trivia enthusiasts and independently gathered evidence
documents, six per question on average, that provide high quality distant
supervision for answering the questions.
z]Question-answer pairs where all documents for a given question contain the
answer string(s).
z110k question-answer pairs for open domain QA where not all documents for a
given question contain the answer string(s). This makes the unfiltered dataset
more appropriate for IR-style QA.
z3Includes context from Wikipedia and search results.c                 C      t jjtj| tS N)tfiogfileglobospathjoin_WEB_EVIDENCE_DIRtmp_dir r   d/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/question_answering/trivia_qa.py_web_evidence_dirK      r   c                 C   r   r   )r   r	   r
   r   r   r   r   _WIKI_EVIDENCE_DIRr   r   r   r   _wiki_evidence_dirO   r   r   c                       s*   e Zd ZdZejjd fdd	Z  ZS )TriviaQAConfigzBuilderConfig for TriviaQA.Fc                    sf   |rdnd}|r|d7 }|rt nt}|s|t7 }tt| jd||tjdd| || _	|| _
dS )a;  BuilderConfig for TriviaQA.

    Args:
      unfiltered: bool, whether to use the unfiltered version of the dataset,
        intended for open-domain QA.
      exclude_context: bool, whether to exclude Wikipedia and search context for
        reduced size.
      **kwargs: keyword arguments forwarded to super.
    
unfilteredrcz
.nocontextz1.1.0)namedescriptionversionNr   )_UNFILTERED_DESCRIPTION_RC_DESCRIPTION_CONTEXT_ADDENDUMsuperr   __init__tfdscoreVersionr   exclude_context)selfr   r&   kwargsr   r   	__class__r   r   r"   V   s   

zTriviaQAConfig.__init__)FF)	__name__
__module____qualname____doc__r#   r$   disallow_positional_argsr"   __classcell__r   r   r)   r   r   S   s    r   c                   @   sT   e Zd ZdZedddedddedddedddgZdd Zdd Zd	d
 ZdS )TriviaQAzkTriviaQA is a reading comprehension dataset.

  It containss over 650K question-answer-evidence triples.
  F)r   r&   Tc                 C   s   t jj| tt jt j t j t j t jt j t j t j t j dt jt j t j tj	t j t j t j dt jt jt j t jt j t j t j t j t j t j ddd dt
dS )N)
doc_sourcefilenametitlewiki_context)r   r3   rankr4   urlsearch_contextaliasesnormalized_aliasesmatched_wiki_entity_name#normalized_matched_wiki_entity_namenormalized_valuetypevalue)questionquestion_idquestion_sourceentity_pagessearch_resultsanswerz&http://nlp.cs.washington.edu/triviaqa/)builderr   featuressupervised_keyshomepagecitation)r#   r$   DatasetInfo_DESCRIPTIONrH   FeaturesDictTextSequencer   int32	_CITATION)r'   r   r   r   _info~   sF   4zTriviaQA._infoc                 C   s<  | j }t }|jr|jstd|d< |jrtd|d< ||}|jr.tj	|d dntj	|d d}t
jjtj	|t}t
jjtj	|t}t
jjtj	|t}|jrcd}	d}
ntj	|d t}	tj	|d t}
tjjtjj||	|
ddtjjtjj||	|
ddtjjtjj||	|
ddgS )zReturns SplitGenerators.r   r   ztriviaqa-unfilteredqaN)filesweb_dirwiki_dir)r   
gen_kwargs)builder_configdictr   r&   _DOWNLOAD_URL_TMPLformatdownload_and_extractr   r   r   r   r	   r
   r   _TRAIN_FILE_FORMAT_VALIDATION_FILE_FORMAT_TEST_FILE_FORMATr   r   r#   r$   SplitGeneratorSplitTRAIN
VALIDATIONTEST)r'   
dl_managercfgdownload_urls
file_pathsqa_dirtrain_filesvalid_files
test_filesweb_evidence_dirwiki_evidence_dirr   r   r   _split_generators   sV   
zTriviaQA._split_generatorsc              	   #   s     fdd}|D ]U}t d| tj|}tjj|8}d}|D ]+}	|	dkr.|	}q%|		drLt
|d }
d}||
}d||d	 f |fV  q%||	7 }q%W d
   n1 s[w   Y  qd
S )z#This function returns the examples.c                    s$  dd }d| v r7| d }||d ||d | dd | dd |d	  |d
  |d  d}n
g g dddddd}jjrMg | d< g | d< dd }dd   fdd}|||  dg dg d}|||  dg dg d}| d  }| d }	| d  }
||||	|
|dS )z4Return a single example from an article JSON record.c                 S   s   dd | D S )Nc                 S   s   g | ]}|  qS r   )strip).0itemr   r   r   
<listcomp>   s    zVTriviaQA._generate_examples.<locals>.parse_example.<locals>._strip.<locals>.<listcomp>r   )
collectionr   r   r   _strip   s   zBTriviaQA._generate_examples.<locals>.parse_example.<locals>._stripAnswerAliasesNormalizedAliasesMatchedWikiEntryName NormalizedMatchedWikiEntryNameNormalizedValueTypeValuer9   z<unk>SearchResultsEntityPagesc              
   S   s   g }| D ]Q}d|vrt d q| }|d }z$tjjtj	||}|
 ||< W d   n1 s6w   Y  W n ttjjfyO   t d| Y qw || q|S )z8Adds context from file, or skips if file does not exist.Filenamez%Missing context 'Filename', skipping.Nz!File does not exist, skipping: %s)r   infocopyr   r	   r
   GFiler   r   r   readIOErrorerrorsNotFoundErrorappend)ru   context_fieldfile_dir	new_itemsrs   new_itemfnamefr   r   r   _add_context  s$   
zHTriviaQA._generate_examples.<locals>.parse_example.<locals>._add_contextc                 S   s   t | tjr
|  S | S r   )
isinstancesixstring_typesrq   )vr   r   r   _strip_if_str*  r   zITriviaQA._generate_examples.<locals>.parse_example.<locals>._strip_if_strc                    s    fdd|D S )Nc                    s,   i | ] t jj  fd dD qS )c                    s   g | ]} | qS r   r   )rr   d)r   kr   r   rt   /  s    zuTriviaQA._generate_examples.<locals>.parse_example.<locals>._transpose_and_strip_dicts.<locals>.<dictcomp>.<listcomp>)r#   r$   namingcamelcase_to_snakecase)rr   )r   dicts)r   r   
<dictcomp>-  s
    zjTriviaQA._generate_examples.<locals>.parse_example.<locals>._transpose_and_strip_dicts.<locals>.<dictcomp>r   )r   field_namesr   )r   r   _transpose_and_strip_dicts,  s   zVTriviaQA._generate_examples.<locals>.parse_example.<locals>._transpose_and_strip_dictsSearchContext)Descriptionr   RankTitleUrlr   WikiContext)	DocSourcer   r   r   Question
QuestionIdQuestionSource)rD   rE   rA   rB   rC   rF   )getrq   rY   r&   )articlerv   rF   answer_dictr   r   rE   rD   rA   rB   rC   r'   rV   rW   r   r   parse_example   s^   




z2TriviaQA._generate_examples.<locals>.parse_examplezgenerating examples from = %sr{   z
        {
z	        }}z%s_%srB   N)r   r   r   r   basenamer   r	   r
   r   
startswithjsonloads)r'   rU   rV   rW   r   filepathr   r   current_recordliner   exampler   r   r   _generate_examples   s(   a

zTriviaQA._generate_examplesN)	r+   r,   r-   r.   r   BUILDER_CONFIGSrS   rp   r   r   r   r   r   r1   p   s    



=,r1   )#r.   
__future__r   r   r   r   r   abslr   r   tensorflow.compat.v2compatv2r   tensorflow_datasets.public_api
public_apir#   rR   r[   r^   r_   r`   r   r   rM   r   r   r    r   r   r$   BuilderConfigr   GeneratorBasedBuilderr1   r   r   r   r   <module>   s4   