o
    Ni                     @   s|   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlmZ	 dZ
dZdZG d	d
 d
e	jjZG dd de	jjZdS )IMDB movie reviews dataset.    )absolute_import)division)print_functionNa&  Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.a]  @InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}
z>http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gzc                       s*   e Zd ZdZejjd fdd	Z  ZS )IMDBReviewsConfigzBuilderConfig for IMDBReviews.Nc                    s:   t t| jddtjddi| |ptjj | _	dS )a  BuilderConfig for IMDBReviews.

    Args:
      text_encoder_config: `tfds.features.text.TextEncoderConfig`, configuration
        for the `tfds.features.text.TextEncoder` used for the IMDB `"text"`
        feature.
      **kwargs: keyword arguments forwarded to super.
    versionz1.0.0z6New split API (https://tensorflow.org/datasets/splits)N )
superr   __init__tfdscoreVersionfeaturestextTextEncoderConfigtext_encoder_config)selfr   kwargs	__class__r   Q/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/imdb.pyr
   8   s   
zIMDBReviewsConfig.__init__N)	__name__
__module____qualname____doc__r   r   disallow_positional_argsr
   __classcell__r   r   r   r   r   5   s    r   c                
   @   s   e Zd ZdZedddeddejjjejj	 dded	d
ejjjejjj
dddeddejjjejjj
dddgZdd Zdd Zdd ZdddZdS )IMDBReviewsr   
plain_textz
Plain text)namedescriptionbyteszGUses byte-level text encoding with `tfds.features.text.ByteTextEncoder`)encoder)r    r!   r   
subwords8kz?Uses `tfds.features.text.SubwordTextEncoder` with 8k vocab sizei    )encoder_cls
vocab_sizesubwords32kz@Uses `tfds.features.text.SubwordTextEncoder` with 32k vocab sizei   c              	   C   sB   t jj| tt jt jj| jjdt jj	ddgddddt
dS )N)encoder_confignegpos)namesr   labelz-http://ai.stanford.edu/~amaas/data/sentiment/)builderr!   r   supervised_keyshomepagecitation)r   r   DatasetInfo_DESCRIPTIONr   FeaturesDictTextbuilder_configr   
ClassLabel	_CITATION)r   r   r   r   _infok   s   zIMDBReviews._infoc                 c   s0    |  |tjddD ]	\}}|d V  qd S )NaclImdbtrainr   )_generate_examplesospathjoin)r   archive_exr   r   r   _vocab_text_geny   s   zIMDBReviews._vocab_text_genc              	      s    t  fdd}| jjd | |  tjjtj	j
| tjddddtjjtj	j| tjddddtjjt	d	| tjddd
ddgS )Nc                      s
     S r   )iter_archiver   	arch_path
dl_managerr   r   <lambda>   s   
 z/IMDBReviews._split_generators.<locals>.<lambda>r   r:   r;   )r@   	directory)r    
gen_kwargstestunsupervisedF)r@   rI   labeled)download_DOWNLOAD_URLinfor   maybe_build_from_corpusrC   r   r   SplitGeneratorSplitTRAINr=   r>   r?   TEST)r   rG   r@   r   rE   r   _split_generators~   s2   

zIMDBReviews._split_generatorsTc                 c   s    |rdnd}t tjd| |ddd}|D ]$\}}||}|s&q|  }	|r4|	 d nd}
||	|
d	fV  qd
S )zGenerate IMDB examples.z(?P<label>neg|pos)unsupz^%s \z\\r-   r,   N)
recompiler=   r>   r?   replacematchreadstrip	groupdict)r   r@   rI   rM   reg_pathregr>   imdb_fresr   r-   r   r   r   r<      s    
zIMDBReviews._generate_examplesN)T)r   r   r   r   r   r   r   r   r   ByteTextEncoderSubwordTextEncoderBUILDER_CONFIGSr9   rC   rV   r<   r   r   r   r   r   K   sD    
r   )r   
__future__r   r   r   r=   r[   tensorflow_datasets.public_api
public_apir   r3   r8   rO   r   BuilderConfigr   GeneratorBasedBuilderr   r   r   r   r   <module>   s   