o
    Nif3                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlmZ ddl	m
  mZ ddlmZ ddlmZ dZd	Zejd
dZejddejdgZdZdZdZdZdZdZdZdZdZ dZ!G dd dejj"Z#G dd dejj$Z%dS )!C4 dataset based on Common Crawl.    )absolute_import)division)print_functionN)logging)c4_utilsa  A colossal, cleaned version of Common Crawl's web crawl corpus.

Based on Common Crawl dataset: https://commoncrawl.org

To generate this dataset, please follow
[the instructions from t5](https://github.com/google-research/text-to-text-transfer-transformer#c4).

Due to the overhead of cleaning the dataset, it is recommend you prepare it with
a distributed service like Cloud Dataflow. More info at
https://www.tensorflow.org/datasets/beam_datasets.
au  
@article{2019t5,
  author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
  title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
  journal = {arXiv e-prints},
  year = {2019},
  archivePrefix = {arXiv},
  eprint = {1910.10683},
}
z2.3.0z Deduplicate lines within a page.z2.2.1zUpdate dataset_info.jsonz2.2.0z$https://commoncrawl.s3.amazonaws.comzQhttps://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-{cc_version}/wet.paths.gzzhttps://raw.githubusercontent.com/rowanz/grover/38f7184bd87237ae2d3bc330b99f1e2e246f6d51/realnews/domain_to_allowed_subdomains.jsonzhttps://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/25e679f03d96baa721cde20db9944649e8d0a844/{lang}z@https://storage.googleapis.com/tfds-data/manual_checksums/c4.txtzOpenWebText.zipz2https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQz OpenWebText/Version 1/URLs/*.txt)2019-18)z2018-34z2018-39z2018-43z2018-47z2018-51z2019-04z2019-09z2019-13r   z2019-22z2019-26z2019-30c                       s2   e Zd ZdZejj				d fdd	Z  ZS )C4ConfigzBuilderConfig for C4 dataset.NTFc           	         s   |g}|r| d| |s| d |r| d |r"| d d|}tt| jd|ttd| || _|p@|r?tnt	| _
|| _|| _|| _dS )	a2  BuilderConfig for C4.

    Args:
      language: string, the language code, or "all" to disable language
        filtering.
      cc_versions: tuple(string), a collection of versions of Common Crawl to
        use as the raw source text. Set to None to use defaults.
      clean: bool, whether to clean the dataset for badwords, duplications, etc.
      realnewslike: bool, whether to limit to news domains as compiled by
        RealNews.
      webtextlike: bool, whether to limit to WebText-like URLs.
      **kwargs: keyword arguments forwarded to super.
    _nocleanrealnewslikewebtextlike.)nameversionsupported_versionsN )appendjoinsuperr	   __init___VERSION_SUPPORTED_VERSIONSlang _DEFAULT_WEBTEXTLIKE_CC_VERSIONS_DEFAULT_CC_VERSIONScc_versionscleanr   r   )	selflanguager   r   r   r   kwargs
name_partsr   	__class__r   O/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/c4.pyr   N   s2   




zC4Config.__init__)NTFF)	__name__
__module____qualname____doc__tfdscoredisallow_positional_argsr   __classcell__r   r   r"   r$   r	   K   s    r	   c                   @   sf   e Zd ZdZdZedddeddddedd	d
dedd	ddgZdd Zdd Zdd Z	dd Z
dS )C4r   a!    For the WebText-like config, you must manually download 'OpenWebText.zip'
  (from https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ) and the Common Crawl
  WET files from August 2018 to July 2019
  (https://commoncrawl.org/the-data/get-started/) and place them in the
  `manual_dir`.
  enzEnglish C4 dataset.)r   descriptionFzGDisables all cleaning (deduplication, removal based on bad words, etc.))r   r   r/   TzFilters from the default config to only include content from the domains used in the 'RealNews' dataset (Zellers et al., 2019).)r   r   r/   zFilters from the default config to only include content from the URLs in OpenWebText (https://github.com/jcpeterson/openwebtext).)r   r   r/   c                 C   sb   t j t j d}| jdkr"|t j t j t j d t jj| tt j|t	ddS )N)texturlz1.0.0)content-typecontent-length	timestampzMhttps://github.com/google-research/text-to-text-transfer-transformer#datasets)builderr/   featurescitationhomepage)
r)   r6   Textr   updater*   DatasetInfo_DESCRIPTIONFeaturesDict	_CITATION)r   r6   r   r   r$   _info   s    

zC4._infoc              	   C   s  | t t| jj}|tt@ }|tt }i }dd |D |d< | jjr0tj| jj	d|d< | jj
r8t|d< ||}| jjratj|jt}tjj|sZtdt|jt|||d< g }	|d D ]#}
tjj|
}|	d	d |D  W d    n1 sw   Y  qg|jr||d
|	i n|	|d< g |d
< |D ]5}tj|j|}tjjtj|d}tjj|std||t !dt"|| |d
 | q| #|||}t$j%j&t$j'j(t)d|dd ddt$j%j&t$j'j*t)d|dd ddgS )Nc                 S   s   g | ]}t j|d qS ))
cc_version)_WET_PATH_URLformat).0r@   r   r   r$   
<listcomp>   s    
z(C4._split_generators.<locals>.<listcomp>wet_path_urls)r   badwordsrealnews_domainszlFor the WebText-like config, you must manually download the following file from {0} and place it in {1}: {2}openwebtext_urls_zipc                 S   s   g | ]
}d t | f qS )z%s/%s)_DOWNLOAD_HOSTstriprC   lr   r   r$   rD      s    	wet_fileswet_urlsz*.warc.wet.gzzlFor the non-default Common Crawl version {0}, you must manually download the WET files to the directory {1}.z7Adding %d WET files for manually downloaded version %s.trainc                 S   s   | d dkS Ni  r   r   xr   r   r$   <lambda>       z&C4._split_generators.<locals>.<lambda>)splitpage_contenthashed_url_predicate)r   
gen_kwargs
validationc                 S   s   | d dkS rP   r   rQ   r   r   r$   rS      rT   )+download_checksums_CHECKSUMS_URLsetbuilder_configr   r   r   _BADWORDS_URLrB   r   r   _REALNEWS_DOMAINS_URLdownload_and_extractr   ospathr   
manual_dir_OPENWEBTEXT_URLS_ZIPtfiogfileexistsAssertionError_OPENWEBTEXT_URLS_URLextractGFileextendregister_checksumsr:   downloadglobr   infolen_get_page_contentr)   r*   SplitGeneratorSplitTRAINdict
VALIDATION)r   
dl_managerpipeliner   auto_cc_versionsmanual_cc_versionsfiles_to_download
file_pathsowt_pathrN   wet_path_urlfr@   cc_dirrM   page_content_pcollectionr   r   r$   _split_generators   s   



zC4._split_generatorsc                 C   s  t jjj}|d||d ? B }d|v r2dd }|d||d ? B |j||dB }||f| B }||tj	B |
tjB }| jjrktjj|d }	t|	}
W d	   n1 s]w   Y  ||
tj|
B }|d
|tj? B d| ? B |tjB }| jjr|d|jtj|d t? B d|dd ? B d|tj? B }||dd| ? B |tjB }| jj rtjj|d }	dd |	D }W d	   n1 sw   Y  |d|t!|? B }t"|}| jj#dkr||j
tj$| jj#dO }|S )z+Build PCollection of un-split page content.create_wet_filesrM   rN   c                 S   s   | | | i|  S )N)ro   )r1   
downloaderr   r   r$   download_url  s   z*C4._get_page_content.<locals>.download_urlcreate_wet_urls)r   rG   Nnormalize_url	group_urlread_webtextlike_urlsrH   add_dummy_pagec                 S   s   | dfS )N r   rQ   r   r   r$   rS   *  s    z&C4._get_page_content.<locals>.<lambda>normal_webtext_url)r0   webtextlike_urlsgroup_webtextlike_urlsrF   c                 S   s   g | ]}|  qS r   )rJ   rK   r   r   r$   rD   9  s    z(C4._get_page_content.<locals>.<listcomp>clean_pagesall)r   )%r)   r*   lazy_importsapache_beamCreateMapFlattenFlatMapr   split_wet_fileFilteris_valid_lengthr]   r   re   rf   rg   rl   jsonloadis_realnews_domainr   
GroupByKeydedupe_urlsr   ReadFromTextra   rb   r   _OPENWEBTEXT_URLS_FILE_PATTERNCoGroupByKeyfilter_by_webtextliker   get_clean_page_fnremove_duplicate_textr   is_language)r   rz   r~   ry   beamwet_file_pathsr   dl_wet_file_pathsrV   r   rG   r   rF   r   r   r$   rs      s   









zC4._get_page_contentc                    s4   t jjj} fdd}||t|B ||B S )Nc                    sB   t  d | \}}|d |d |d |d |d |d dfS )Nexamplesr1   r0   r2   r3   r4   )r1   r0   r2   r3   r4   )r   get_counter_inc_fn)elr
   r6   rU   r   r$   _emit_examplesK  s   z-C4._build_pcollection.<locals>._emit_examples)r)   r*   r   r   r   r   get_hashed_url_filter_fnr   )r   unused_pipelinerU   rV   rW   r   r   r   r   r$   _build_pcollectionG  s   

zC4._build_pcollectionN)r%   r&   r'   r(   MANUAL_DOWNLOAD_INSTRUCTIONSr	   BUILDER_CONFIGSr?   r   rs   r   r   r   r   r$   r-   {   s0    
	LNr-   )&r(   
__future__r   r   r   r   ra   abslr   tensorflow.compat.v2compatv2re   tensorflow_datasets.public_api
public_apir)   tensorflow_datasets.textr   r<   r>   r*   Versionr   r   rI   rA   r_   r^   r[   rd   rj   r   r   r   BuilderConfigr	   BeamBasedBuilderr-   r   r   r   r$   <module>   s8   

0