o
    Ni                     @   sp   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddl	m
Z dZdZdZG d	d
 d
ejjZdS )zclinc_oos dataset.    )absolute_import)division)print_functionNz<https://github.com/jereliu/datasets/raw/master/clinc_oos.zipau  
@inproceedings{larson-etal-2019-evaluation,
    title = "An Evaluation Dataset for Intent Classification and Out-of-Scope Prediction",
    author = "Larson, Stefan  and
      Mahendran, Anish  and
      Peper, Joseph J.  and
      Clarke, Christopher  and
      Lee, Andrew  and
      Hill, Parker  and
      Kummerfeld, Jonathan K.  and
      Leach, Kevin  and
      Laurenzano, Michael A.  and
      Tang, Lingjia  and
      Mars, Jason",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
    month = nov,
    year = "2019",
    address = "Hong Kong, China",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/D19-1131",
    doi = "10.18653/v1/D19-1131",
    pages = "1311--1316",
}
a  
Task-oriented dialog systems need to know when a query falls outside their range of supported intents, but current text classification corpora only define label sets that cover every example. We introduce a new dataset that includes queries that are out-of-scope (OOS), i.e., queries that do not fall into any of the system's supported intents. This poses a new challenge because models cannot assume that every query at inference time belongs to a system-supported intent class. Our dataset also covers 150 intent classes over 10 domains, capturing the breadth that a production task-oriented agent must handle. It offers a way of more rigorously and realistically benchmarking text classification in task-driven dialog systems.
c                   @   s4   e Zd ZdZejdZdd Zdd Z	dd Z
d	S )
ClincOOSa  CLINC Dataset for Intent Classification and Out-of-Scope (OOS) Detection.

  This dataset is for evaluating the performance of intent classification
  systems in the presence of "out-of-scope" queries. By "out-of-scope",
  we mean queries that do not fall into any of the system-supported intent
  classes. Most datasets include only data that is "in-scope".
  Our dataset includes both in-scope and out-of-scope data.

  This version of the CLINC OOS dataset contains 150 "in-scope" intents from
  10 domains. Each intent has 100 train, 20 validation, and 30 test samples.
  There are 100 train and validation out-of-scope samples, and 1000 out-of-scope
  test samples. It also contains labels for intent domain which are not included
  in the original dataset.
  z0.1.0c                 C   sB   t jj| tt jt j tjtjt j t j dddt	dS )N)textintentdomainintent_namedomain_name)r   r   z"https://github.com/clinc/oos-eval/)builderdescriptionfeaturessupervised_keyshomepagecitation)
tfdscoreDatasetInfo_DESCRIPTIONr   FeaturesDictTexttfint32	_CITATION)self r   V/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/clinc_oos.py_infoN   s   zClincOOS._infoc                 C   s   | t}tjjddtj|didtjjddtj|didtjjddtj|didtjjd	dtj|d
idtjjddtj|didtjjddtj|didgS )zReturns SplitGenerators.trainfilenamez	train.csv)name
gen_kwargstestztest.csv
validationzval.csv	train_oosztrain_ood.csvtest_oosztest_ood.csvvalidation_ooszval_ood.csv)download_and_extract_DOWNLOAD_URLr   r   SplitGeneratorospathjoin)r   
dl_manager	data_pathr   r   r   _split_generators_   s4   
zClincOOS._split_generatorsc                 c   s    t jj|<}t|}t|D ])\}}i }|d |d< |d |d< |d |d< |d |d< |d |d< ||fV  qW d   dS 1 sGw   Y  dS )zYields examples.r   r   r   r	   r
   N)r   iogfileGFilecsv
DictReader	enumerate)r   r   freaderrow_idrowexampler   r   r   _generate_examples}   s   
"zClincOOS._generate_examplesN)__name__
__module____qualname____doc__r   r   VersionVERSIONr   r/   r;   r   r   r   r   r   <   s    r   )r?   
__future__r   r   r   r3   r*   
tensorflowr   tensorflow_datasets.public_api
public_apir   r(   r   r   r   GeneratorBasedBuilderr   r   r   r   r   <module>   s   