o
    NiO                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlm  m	Z
 ddlmZ dZdZdZd	ZG d
d dejjZdZdZG dd dejjZdS )z)SCAN tasks with various different splits.    )absolute_import)division)print_functionNai  
@inproceedings{Lake2018GeneralizationWS,
  title={Generalization without Systematicity: On the Compositional Skills of
         Sequence-to-Sequence Recurrent Networks},
  author={Brenden M. Lake and Marco Baroni},
  booktitle={ICML},
  year={2018},
  url={https://arxiv.org/pdf/1711.00350.pdf},
}
@inproceedings{Keysers2020,
  title={Measuring Compositional Generalization: A Comprehensive Method on
         Realistic Data},
  author={Daniel Keysers and Nathanael Sch"{a}rli and Nathan Scales and
          Hylke Buisman and Daniel Furrer and Sergii Kashubin and
          Nikola Momchev and Danila Sinopalnikov and Lukasz Stafiniak and
          Tibor Tihon and Dmitry Tsarkov and Xiao Wang and Marc van Zee and
          Olivier Bousquet},
  note={Additional citation for MCD splits},
  booktitle={ICLR},
  year={2020},
  url={https://arxiv.org/abs/1912.09713.pdf},
}
a!  SCAN tasks with various splits.

SCAN is a set of simple language-driven navigation tasks for studying
compositional learning and zero-shot generalization.

Most splits are described at https://github.com/brendenlake/SCAN. For the MCD
splits please see https://arxiv.org/abs/1912.09713.pdf.

Basic usage:

```
data = tfds.load('scan/length')
```

More advanced example:

```
data = tfds.load(
    'scan',
    builder_kwargs=dict(
        config=tfds.text.ScanConfig(
            name='simple_p8', directory='simple_split/size_variations')))
```
z6https://github.com/brendenlake/SCAN/archive/master.zipz=https://storage.googleapis.com/cfq_dataset/scan-splits.tar.gzc                       s*   e Zd ZdZejjd fdd	Z  ZS )
ScanConfiga  BuilderConfig for SCAN.

  Splits can be read in two formats:

  1) As a pair of train and test files where each file contains one example
     input and output per line.
  2) With a 'splitfile' which contains for each split the indices into the full
     (unsplit) dataset.
  Nc                    sv   t t| jd|tjdtd| || _d|v r|d | _| jr+|du r+d| _dS |du r6|d | _dS || _dS )	aQ  BuilderConfig for SCAN.

    Args:
      name: Unique name of the split.
      directory: Which subdirectory to read the data files from.
      splitfile: If set the samples are read from the original archive
        (tasks.txt) but the splits are created using this index file.
      **kwargs: keyword arguments forwarded to super.
    z1.1.1)nameversiondescriptionmcdz.jsonN _split )	superr   __init__tfdscoreVersion_DESCRIPTION	splitfile	directory)selfr   r   r   kwargs	__class__r   Q/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/scan.pyr   ]   s   



zScanConfig.__init__NN)	__name__
__module____qualname____doc__r   r   disallow_positional_argsr   __classcell__r   r   r   r   r   R   s    
r   commandsactionsc                   @   s   e Zd ZdZeddedddedddedd	ded
d	dedd	dedd	deddedddedddedddedddeddeddeddgZdd Zdd Zdd ZdddZ	dS ) ScanzCSCAN task / splits as proposed by Brenden M. Lake and Marco Baroni.simple)r   addprim_jumpadd_prim_split)r   r   addprim_turn_leftfiller_num0filler_splitfiller_num1filler_num2filler_num3lengthtemplate_around_righttemplate_splittemplate_jump_around_righttemplate_opposite_righttemplate_rightmcd1mcd2mcd3c              
   C   s8   t jj| tt jtt j tt j ittfdt	dS )Nz#https://github.com/brendenlake/SCAN)builderr   featuressupervised_keyshomepagecitation)
r   r   DatasetInfor   r7   FeaturesDict	_COMMANDSText_ACTIONS	_CITATION)r   r   r   r   _info   s   

z
Scan._infoc           	      C   s  | tjjttjjjd}tj	|d| j
j}| j
j}| j
j}d|v r7| t}tj	|d}tj	||}|rTtj	|d|d}| }d|d< | }d	|d< nd
tj	|d| d i}d
tj	|d| d i}tjjtjj|dtjjtjj|dgS )zReturns SplitGenerators.)urlextract_methodzSCAN-masterr	   zscan-splitsz	tasks.txt)datapath	splitpathtrain	splitnametestrD   tasks_train_z.txttasks_test_)r   
gen_kwargs)download_and_extractr   downloadResource	_DATA_URLExtractMethodZIPospathjoinbuilder_configr   r   r   _MCD_SPLITS_URLcopyr   SplitGeneratorSplitTRAINTEST)	r   
dl_managerdata_dirsplitr   	split_dirr   train_kwargstest_kwargsr   r   r   _split_generators   s@   


zScan._split_generatorsc                 c   s    t jj|1}t|D ]#\}}|dsq|tdd   dd\}}|t	|t
|ifV  qW d    d S 1 s<w   Y  d S )NzIN: z OUT:    )tfiogfileGFile	enumerate
startswithlenstripr^   r=   r?   )r   rD   infileiliner!   r"   r   r   r   _read_examples   s   
 "zScan._read_examplesNc           	      c   s    |r6t | |}tjj|}t|}W d   n1 s!w   Y  ||d  D ]}|| V  q,dS | |D ]}|V  q;dS )zYields examples.NIdxs)listro   rd   re   rf   rg   jsonload)	r   rD   rE   rG   all_samplesrl   r^   idxexampler   r   r   _generate_examples   s   zScan._generate_examplesr   )
r   r   r   r   r   BUILDER_CONFIGSrA   rb   ro   rw   r   r   r   r   r#   }   s,    









&	r#   )r   
__future__r   r   r   rr   rR   tensorflow.compat.v2compatv2rd   tensorflow_datasets.public_api
public_apir   r@   r   rO   rV   r   BuilderConfigr   r=   r?   GeneratorBasedBuilderr#   r   r   r   r   <module>   s"   '