o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlm	Z	 ddl
m  mZ ddlmZ dZdZd	Zg d
ZG dd dejjZdZdZdZdZdd ZG dd dejjZdS )z.CFQ (Compositional Freebase Question) dataset.    )absolute_import)division)print_functionN)logginga  
@inproceedings{Keysers2020,
  title={Measuring Compositional Generalization: A Comprehensive Method on
         Realistic Data},
  author={Daniel Keysers and Nathanael Sch"{a}rli and Nathan Scales and
          Hylke Buisman and Daniel Furrer and Sergii Kashubin and
          Nikola Momchev and Danila Sinopalnikov and Lukasz Stafiniak and
          Tibor Tihon and Dmitry Tsarkov and Xiao Wang and Marc van Zee and
          Olivier Bousquet},
  booktitle={ICLR},
  year={2020},
  url={https://arxiv.org/abs/1912.09713.pdf},
}
a   
The CFQ dataset (and it's splits) for measuring compositional generalization.

See https://arxiv.org/abs/1912.09713.pdf for background.

A note about the validation set: Since it has the same distribution as the test
set and we are interested in measuring the compositional generalization of a
*model* with respect to an *unknown* test distribution we suggest that any
tuning should be done on a subset of the train set only (see section 5.1 of the
paper).

Example usage:

```
data = tfds.load('cfq/mcd1')
```
z5https://storage.googleapis.com/cfq_dataset/cfq.tar.gz)	4_04_42z4.5_0z4.5_455_50r   z5.5_0z5.5_556_0c                       s2   e Zd ZdZejj				d fdd	Z  ZS )	CFQConfigzBuilderConfig for CFQ splits.Nc                    s   |dur>|du rd}t d }n |tddv r|}t |d  }n|t v r)t |}ntd| d}d||f }d||f }nd	}|}tt| jd|tj	d
t
d| tj||d | _dS )aC  BuilderConfig for CFQ.

    Can be constucted in two ways:
    1. With directory and name in which case these determine the split file.
    2. With compound_divergence (and optionally random_seed).

    Args:
      name: Unique name of the split.
      directory: Which subdirectory to read the split from.
      compound_divergence: The desired compound divergence.
      random_seed: The random seed. Can be either the specific random-seeds used
        to generate the split as string or an index in the range [1, 9].
      **kwargs: keyword arguments forwarded to super.
    N   r   
   zInvalid random seed: %szsplits/all_divergence_splitszdivergence_split_s0.4_d%s_r%szcd%s_r%ssplitsz1.2.0)nameversiondescriptionz.json )_RANDOM_SEEDSrangeindex
ValueErrorsuperr
   __init__tfdscoreVersion_DESCRIPTIONospathjoin
split_file)selfr   	directorycompound_divergencerandom_seedkwargsrandom_seed_index
split_name	__class__r   P/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/cfq.pyr   I   s2   

zCFQConfig.__init__)NNNN)	__name__
__module____qualname____doc__r   r   disallow_positional_argsr   __classcell__r   r   r'   r)   r
   F   s    r
   questionqueryquestionPatternModEntitiessparqlPatternModEntitiesc                  C   s4   g } dD ]}t ddD ]}| t||d qq| S )zEGenerate configs for different compound divergences and random seeds.)r   g?g?g333333?g?g      ?g333333?r   r   r   )r"   r#   )r   appendr
   )configsr"   r#   r   r   r)   -_generate_compound_divergence_builder_configs~   s   r6   c                
   @   sz   e Zd ZdZeddeddeddeddeddedded	ded
dge  Zdd Zdd Zdd Z	dd Z
dS )CFQzCFQ task / splits.mcd1)r   mcd2mcd3question_complexity_splitquestion_pattern_splitquery_complexity_splitquery_pattern_splitrandom_splitc              
   C   s8   t jj| tt jtt j tt j ittfdt	dS )NzBhttps://github.com/google-research/google-research/tree/master/cfq)builderr   featuressupervised_keyshomepagecitation)
r   r   DatasetInfor   rA   FeaturesDict	_QUESTIONText_QUERY	_CITATION)r    r   r   r)   _info   s   

z	CFQ._infoc                 C   sv   | t}tj|d}tjjtjj	|| j
jdddtjjtjj|| j
jdddtjjtjj|| j
jdddgS )zReturns SplitGenerators.cfq	trainIdxs)base_directorysplits_filesplit_id)r   
gen_kwargsdevIdxstestIdxs)download_and_extract	_DATA_URLr   r   r   r   r   SplitGeneratorSplitTRAINbuilder_configr   
VALIDATIONTEST)r    
dl_managerdata_dirr   r   r)   _split_generators   s0   
zCFQ._split_generatorsc                 C   s8   t dttf t j}dddd ||D  d S )z9Reduce JSON by filtering out only the fields of interest.z%("%s":\s*"[^"]*").*?("%s":\s*"[^"]*")[,c                 S   s,   g | ]}d | d d | d d qS ){r   r`      })group).0mr   r   r)   
<listcomp>   s    z#CFQ._scrub_json.<locals>.<listcomp>])recompile_QUESTION_FIELD_QUERY_FIELDDOTALLr   finditer)r    contentregexr   r   r)   _scrub_json   s   zCFQ._scrub_jsonc              	   c   s   t j|d}t j||}tjj|c}tjj|C}td| t	
| | }tdt| td| t	|}|| D ]}	||	 }
|	t|
t t|
t ifV  qFW d   n1 sdw   Y  W d   dS W d   dS 1 s|w   Y  dS )zYields examples.zdataset.jsonz#Reading json from %s into memory...z%d samples loadedzLoaded json data from %s.N)r   r   r   tfiogfileGFiler   infojsonloadsrq   readlenloadrG   rk   rI   rl   )r    rN   rO   rP   samples_pathsplits_pathsamples_filesamplesr   idxsampler   r   r)   _generate_examples   s*   

"zCFQ._generate_examplesN)r*   r+   r,   r-   r
   r6   BUILDER_CONFIGSrK   r^   rq   r   r   r   r   r)   r7      s"    	r7   )r-   
__future__r   r   r   rw   r   ri   abslr   tensorflow.compat.v2compatv2rr   tensorflow_datasets.public_api
public_apir   rJ   r   rU   r   r   BuilderConfigr
   rG   rI   rk   rl   r6   GeneratorBasedBuilderr7   r   r   r   r)   <module>   s*   2