o
    Ni;                     @   sp  d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlZddl	Z	ddl
Z
ddlmZ ddlm  mZ ddlmZ dZdZd	Zd
ZdZdZdZdZdZdZdZg dZda dd Z!dd Z"dd Z#dd Z$d<ddZ%d=dd Z&d>d!d"Z'ddeeefd#d$Z(d%d& Z)d'd( Z*d)d* Z+efd+d,Z,d-d. Z-d>d/d0Z.d1d2 Z/d?d4d5Z0d6d7 Z1d8d9 Z2d:d; Z3dS )@z(Utilities for generating the C4 dataset.    )absolute_import)division)print_functionN)loggingzWARC/1.0zWARC-Target-URI:z
WARC-Date:zContent-Type:zContent-Length:)WARCzCONTENT-zContent-      i  ).?!"z...)zterms of usezprivacy policyzcookie policyzuses cookieszuse of cookieszuse cookiesc                    s   d fdd	}|S )N   c                    s    t jjjjj | | d S N)tfdscorelazy_importsapache_beammetricsMetricscounterinc)r   amt	namespace U/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/c4_utils.pycounter_inc_fn9   s   z*get_counter_inc_fn.<locals>.counter_inc_fn)r   r   )r   r   r   r   r   get_counter_inc_fn8   s   r   c                    s    fdd}|S )Nc                    s2   | \}}t ttj|d d} |S )Nutf-8   )inthashlibmd5tfcompatas_textencode	hexdigest)elurl_valpredicate_fnr   r   	filter_fn@   s
   z+get_hashed_url_filter_fn.<locals>.filter_fnr   )r-   r.   r   r,   r   get_hashed_url_filter_fn?   s   r/   c                  C   sL   t jjj} t  | d | jdW  d   S 1 sw   Y  dS )z)Returns a sentence tokenization function.punktz$nltk:tokenizers/punkt/english.pickleN)	r   r   r   nltk	threadingLockdownloaddataload)r1   r   r   r   _load_sentence_tokenizerH   s
   



$r7   c                 C   s    t st a tt tj| S r   )_SENTENCE_TOKENIZERr7   listtokenizer#   r$   r%   )textr   r   r   _get_sentencesS   s   r<   Fc                    s2   g }|   D ]}| fddt|D  q|S )Nc                    s   g | ]
} r
|  n|qS r   lower).0sr=   r   r   
<listcomp>]   s    z*_get_sentences_by_line.<locals>.<listcomp>)
splitlinesappendr<   )r;   r>   	sentencesliner   r=   r   _get_sentences_by_lineZ   s   
rF   Gz?c           
      C   s   | \}}|d }t d}tjjj}d|j_z||}W n |jj	y,   |d Y dS w |s5|d dS |d }	|	j
|k rD|d dS |	j|krV|d |d	|	j  dS |d
|	j  dS )zGReturns True iff text is in `language` with at least `min_probability`.r;   zdetected-langr   zlangdetect-exceptionFzpage-filtered-nolangpredictionszpage-filtered-lowlangdetectconfzpage-filtered-ignoredlangzpage-filtered-ignoredlang-%szpage-emited-%sT)r   r   r   r   
langdetectDetectorFactoryseeddetect_langslang_detect_exceptionLangDetectExceptionproblang)
pagelanguagemin_probability
unused_urlfeaturesr;   r   rH   predictionsbest_predictionr   r   r   is_languagec   s0   


rW   c                 C   s>   t d}| rt dd| pg }nd}tjt||dS )zDReturns `clean_page` with pre-compiled badword and citation regexes.z$\[\d*\]|\[edit\]|\[citation needed\]z[^a-z]({})[^a-z]|N)citation_regexbadwords_regex)recompileformatjoin	functoolspartial
clean_page)badwordsrY   rZ   r   r   r   get_clean_page_fn   s   
rc   c                 #   s   | \}}|d }	|st d}|	 }
g }d}fdd}|
D ]}| }||r.|d q|d|}|tr>|trC|d qt| |k rP|d	 q|	  d
 v r_|d  dS d v rh|d qd|v rs|d  dS t
 fddtD r|d q|r| }|dur|d  dS |tt|7 }|| |d q||k r|d dS |d d| |d< ||fV  dS )aB  Cleans a CommonCrawl page, yielding nothing if it should be skipped.

  Cleaning removes lines with no end marks or with too few words. After line
  filtering, pages are filtered out if they have too few sentences based on a
  simple count of end marks.

  Args:
    url_and_features: tuple(string, dict), the url and features of the page.
    citation_regex: Regex to use for finding Wikipedia-like citations to filter.
    badwords_regex: Regex to use for finding badwords. Default None, which means
      don't apply badwords filtering.
    counter_inc_fn: function, a function taking the name of a counter to be
      incremented and the (optional) amount. Defaults to a beam Metric counter.
    min_words_per_line: int, the minimum number of words a line needs to not be
      removed.
    min_num_sentences: int, the minimum number of sentences a page needs to not
      be skipped.
    max_word_length: int, the maximum number of characters allowed in a word.
      Lines containing a word with too many characters are removed.
  Yields:
    The url and cleaned text for the page.
  r;   z
clean-pager   c                    s$   |   D ]}t| kr dS qdS )NTF)splitlen)rE   word)max_word_lengthr   r   line_has_too_long_word   s
   z*clean_page.<locals>.line_has_too_long_wordzlines-with-too-long-word zlines-no-endmarkzlines-too-shortzlorem ipsumzfiltered-page-loremipsumN
javascriptzlines-javascript{zfiltered-page-squigglybracketc                 3   s    | ]}| v V  qd S r   r   )r?   p)
line_lowerr   r   	<genexpr>   s    zclean_page.<locals>.<genexpr>zlines-policyzfiltered-page-badwordzlines-validzfiltered-page-toofewsentenceszemitted-clean-pages
)r   rB   stripsubendswith
_END_MARKS	_ELLIPSISre   rd   r>   any_POLICY_SUBSTRINGSsearchr<   rC   r^   )url_and_featuresrY   rZ   r   min_words_per_linemin_num_sentencesrg   r)   rT   r;   linesvalid_linesnum_sentencesrh   rE   badwords_foundr   )rm   rg   r   ra      s`   


ra   c                 C   s0   t  }|tj| d   |	 S Nr   )
r!   r"   updater#   r$   r%   r&   rp   r>   r'   )rE   mr   r   r   
_hash_line   s    r   c                 c   s4    | \}}|d }| dD ]	}t||fV  qdS )z-Emits url to all (lower-cased, hashed) lines.r;   ro   N)rd   r   )rP   r)   rT   r;   rE   r   r   r   _emit_url_to_lines   s   r   c                 c   sV    | \}}t |}t|dd d}|D ]}||kr||fV  q|dt|d d dS )z'Emits (hashed) line to all but one url.c                 S   s   t tj| d S r   )r!   r"   r#   r$   r%   r&   r'   )xr   r   r   <lambda>  s    z$_emit_line_to_urls.<locals>.<lambda>)keyzemitted-line-duplicater   )r   N)r9   minre   )r(   r   rE   urlsskip_urlr)   r   r   r   _emit_line_to_urls   s   
r   c                 c   s    | \}}|d }t |dksJ dt ||f |d }|d }t|d }g }t }	|dD ]}
t|
}||v r@|d q1||	vrN||
 |	| q1d|}t t||k rb|d	 d
S | }||d< ||fV  d
S )a<  Removes matching lines from the page.

  Process the result of a join containing a single value for 'features' and zero
  or more values for 'lines'. Each value in 'lines' is a lower-cased, hashed
  line.

  If a line has fewer sentences than `max_window_size`, the full line is
  compared for a match.

  Args:
    el: `(string, {'features': features_dict, 'lines': [string]})`,
      element containing the result of a join on key with both the page text
      and lower-cased, hashed lines to remove.
    counter_inc_fn: function, a function taking the name of a counter to be
      incremented and the (optional) amount.
    min_num_sentences: int, the minimum number of sentences a page needs to not
      be skipped.

  Yields:
    url: The URL of the page.
    features: The page features with lines removed from text.
  rT   r   zInvalid page count (%d) for %sr   r;   r{   ro   zfiltered-lines-duplicatezfiltered-doc-toofewsentencesN)	re   setrd   r   rC   addr^   r<   copy)r(   r   rz   r)   join_valuesrT   r;   lines_to_remove	new_lineshashed_linesrE   hashed_linenew_textnew_featuresr   r   r   _remove_lines_from_text	  s4   



r   c                 C   sb   t jjj}td}| |tB d| ? B |jt|dB }| |dd|	 ? B |jt
|dB }|S )z8Utility to remove duplicate lines across text documents.zdedupe-linesgroup_sentences)r   )rT   r{   group_features_and_lines_by_url)r   r   r   r   r   FlatMapr   
GroupByKeyr   CoGroupByKeyr   )pagesbeamr   r   
final_docsr   r   r   remove_duplicate_text;  s*   


	r   c              	   #   s   t d|  stdd tjj| d}tj|d}dd ddd fdd}tj	|d	d
D ]f}|
 }|sFq=|tkr\| }|rR|V  dg  ddd|trk|ttd 
 |trz|ttd 
 |tr|ttd 
 |tr|ttd 
 |trq= | q=| }|r|V  W d   n1 sw   Y  W d   dS W d   dS 1 sw   Y  dS )z%Split a WET file into separate pages.zSplitting file: %szsplit-wet-filezwet-filerb)fileobjNc                      s   s
dur
d  s durd sdurd s(dur(d s2dur2d  rGrGd d  d	fS dS )
z"Generate a (url, {features}) page.Nzpage-filtered-nourlzpage-filtered-nocontentzpage-nocontenttypezpage-nocontentlenzpage-notimestampzpage-emittedro   )r;   zcontent-typezcontent-length	timestampr)   )r^   r   contentcontent_lencontent_typer   r   r)   r   r   _maybe_get_pageb  s(   z'split_wet_file.<locals>._maybe_get_pager   )encodingri   )r   infor   r#   iogfileGFilegzipGzipFileTextIOWrapperrp   _PAGE_DELIMITER
startswith_URL_KEYre   	_URL_DATE_CONTENT_TYPE_CONTENT_LEN_METADATA_PREFIXESrC   )wet_file_pathr   fgr   rE   rP   r   r   r   split_wet_fileS  sX   




Pr   c                 C   sH   t d}| \}}d}d}|D ]}|d7 }q|d|d  |d ||fS )z(Returns the first value for a given URL.zdedupe-urlsr   Nr   zfiltered-url-duplicatez
unique-url)r   )r(   r   r)   valscntvr   r   r   dedupe_urls  s   
r       1Ac                 C   s8   t d}| \}}t|d |kr|d dS |d dS )z*Returns False iff page's text is too long.zis-valid-lengthr;   zfiltered-page-contenttoolongFzvalid-lengthTr   re   )r(   
max_lengthr   r*   rP   r   r   r   is_valid_length  s   r   c                 C   sx   t d}| \}}tjjj|}|jd |j }||vr"|d dS || }t|t	r6|j
|vr6|d dS |d dS )z4Returns False iff page's (sub)domain is not allowed.zis-realnews-domainr	   zfiltered-url-invaliddomainFzfiltered-url-invalidsubdomainzrealnews-domainT)r   r   r   r   
tldextractextractdomainsuffix
isinstancer9   	subdomain)r(   realnews_domainsr   r)   r*   extmain_domainallowed_subdomainsr   r   r   is_realnews_domain  s   

r   c                 c   sl    t d}| \}}|d }|d }|s|d dS |s!|d dS t|dks)J |d ||d	 fV  dS )
z3Yields only pages with a matching WebText-like URL.zfilter-by-webtextliker;   webtextlike_urlszfiltered-url-notwebtextlikeNzmissing-webtextliker   zfound-webtextliker   r   )r(   r   r)   r   r;   webtextliker   r   r   filter_by_webtextlike  s   r   c                 C   sB   | \}}t j|}tdd|}tdd|}|d}||fS )Nzhttps?:\/\/(www\.)?ri   z\?(utm_|ref|feed).*/)r#   r$   r%   r[   rq   rstrip)r(   r)   r+   r   r   r   normalize_url  s   
r   )F)rG   r   )r   )4__doc__
__future__r   r   r   r_   r   r!   r   r[   r2   abslr   tensorflow.compat.v2r$   v2r#   tensorflow_datasets.public_api
public_apir   r   r   r   r   r   r   _MIN_WORDS_PER_LINE_MIN_NUM_SENTENCES_MAX_WORD_LENGTHrs   rt   rv   r8   r   r/   r7   r<   rF   rW   rc   ra   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sf   	

	

\
2
J
