o
    Ni+                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlm	Z	 ddl
mZ ddlmZ d	Zd
ZdZdZdZdd ZG dd de	jZedkrSe	  dS dS )zTests for c4_utils.    )absolute_import)division)print_functionN)testing)lazy_imports)c4_utilsuC  This line has enough words and ends in punctuation, Dr. Roberts!
Economic History | All Institutions | Open Access Articles | Digital Commons Network
"Open Access. Powered by Scholars. Published by Universities."
Digital Commons Network™/ Social and Behavioral Sciences...
Too few words.
You need to enable javascript in your browser in order to see this page.
You have JavaScript disabled and that means you can't see this page.
Adam Roberts has a cookie policy: always eat them.
Colin has a privacy policy: don't share people's secrets.
You'd better follow our terms of use!
zThis line has enough words and ends in punctuation, Dr. Roberts!
"Open Access. Powered by Scholars. Published by Universities."5793z
text/plainz2019-04-24T09:23:58Zc                     s    t t d fdd	}  | fS )N   c                    s    |   |7  < d S N )camtcountersr   Z/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/c4_utils_test.pycounter_inc_fn6   s   z%_get_counters.<locals>.counter_inc_fn)r	   )collectionsdefaultdictintr   r   r   r   _get_counters3   s   
r   c                   @   s^   e Zd ZdddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd ZdS )C4UtilsTestNc                 C   sP   t  \}}tt|d|f|d}| t|d |sd n|d d }||fS )Nurl)url_and_featuresr   r	   r   )r   listr   get_clean_page_fnassertLessEquallen)selffeaturesbadwordsr   r   resultsresultr   r   r   run_clean_page>   s   
zC4UtilsTest.run_clean_pagec              	   C   sJ   |  ttttd\}}| t|d  | dddddddt| d S )Ntextzcontent-typezcontent-length	timestampr%      r	      )lines-validzlines-too-shortlines-no-endmarkzlines-javascriptlines-policyemitted-clean-pages)r#   EN_TEXTFAKE_CONTENT_TYPEFAKE_CONTENT_LENGTHFAKE_TIMESTAMPassertEqualEXPECTED_CLEAN_ENdict)r   clean_enr   r   r   r   test_clean_pageG   s"   zC4UtilsTest.test_clean_pagec                 C   B   d}|  |tttd\}}| d | | dddt| d S )NziThis first line has one sentence.
This line looks like it has three sentences...but it's actually just 1.r$   r'   r	   )r)   zfiltered-page-toofewsentencesr#   r.   r/   r0   r1   r3   )r   text_with_toofewsentencesr4   r   r   r   r   test_clean_page_toofewsentencesY   s   z+C4UtilsTest.test_clean_page_toofewsentencesc                 C   r6   )NzThis page starts out with some text.
Everything looks good at first, since these are sentences.
But then, all of a sudden, there's a bunch of code like the next block.
fn foo(a) { bar = a + 10; }.r$   r	   r(   )zfiltered-page-squigglybracketr)   r7   )r   text_that_is_actually_coder4   r   r   r   r   test_clean_page_squigglybracketh   s   z+C4UtilsTest.test_clean_page_squigglybracketc                 C   s@   d}|  |tttd\}}| d | | ddit| d S )Na  Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.r$   zfiltered-page-loremipsumr	   r7   )r   lorem_ipsum_textr4   r   r   r   r   test_clean_page_loremipsumy   s   z&C4UtilsTest.test_clean_page_loremipsumc                 C   s   d}g d}g d}ddddddddddddg}t |||D ]1\}}}|| }| j|tttd	d
gd\}	}
|rC| d |	 n| ||	d  | |t|
 q"d S )NzThis page starts out with some text.
Everything looks good at first, since these are sentences.
But then, all of a sudden, there's a badword... or not?
)z6I asked my friend for assistance polishing my cutlass.zCHe took the saddle and put it on his ass in preparation for travel.zdAss is one of several species of small, horse-like animals.Donkey is one synonym for the word "ass".)FTTT   r	   )r)   r,   r(   )r)   zfiltered-page-badwordr$   ass)r    r%   )zipr#   r.   r/   r0   r1   r3   )r   padding_textfinal_sentencesoutputs_should_be_noneexpected_countersfinal_sentenceoutput_should_be_noneexpected_counterr%   outr   r   r   r   test_clean_page_badwords   sB   

z$C4UtilsTest.test_clean_page_badwordsc                 C   sP   d}d}dddd}|  |tttd\}}| ||d  | |t| d S )NzThis page has some text.
Some lines don't end with punctuation
And some of these lines end with citations.[3]
Or have requested citations[citation needed]. Or the option to edit.[edit]
zxThis page has some text.
And some of these lines end with citations.
Or have requested citations. Or the option to edit.r(   r	   )r)   r*   r,   r$   r%   r7   r   r%   expected_clean_textrD   rH   r   r   r   r   test_clean_page_citations   s   z%C4UtilsTest.test_clean_page_citationsc                 C   sP   d}d}dddd}|  |tttd\}}| ||d  | |t| d S )NzThis page has with some text. So, that's good!
But at the end it has some polciy lines.
This line mentions the Terms of Use.
This line should be okay.
The privacy policy is mentioned in this line.
Let's talk about the Cookie Policy now.
zqThis page has with some text. So, that's good!
But at the end it has some polciy lines.
This line should be okay.r(   r	   )r)   r+   r,   r$   r%   r7   rJ   r   r   r   test_clean_page_policy   s   z"C4UtilsTest.test_clean_page_policyc                 C   s   dd l m  m} tj}g d}ddg}| (}||dd |D B }t|}|	||
dd |D  W d    d S 1 sCw   Y  d S )Nr   ))url/1-0aF  This is a duplicated line.
This is a unique line.
This one comes first and so it stays.
This one is duplicate within the page so the others are removed.
Here is a sentence between the duplicates.
This one is duplicate within the page so the others are removed.
this One is Duplicate WITHIN the page so the others are removed. )zurl/2-1zThis is 2nd unique line.
This one comes second so it is removed even though the capitalizaiton is different.
this is a Duplicated line. )url/3-4zThis is a 3rd unique line.
This is a duplicated line.
This one comes third and so it is removed. But the page stays because there are still 3 sentences remaining.)zurl/4-4zThis is a 4th unique line.
This is a duplicated line.
This one comes third and so it is removed, and the page is too since there aren't enough sentences left.)rN   zThis is a duplicated line.
This is a unique line.
This one comes first and so it stays.
This one is duplicate within the page so the others are removed.
Here is a sentence between the duplicates.)rO   zThis is a 3rd unique line.
This one comes third and so it is removed. But the page stays because there are still 3 sentences remaining.c                 S      g | ]
\}}|d |ifqS r%   r   .0r   r%   r   r   r   
<listcomp>      z:C4UtilsTest.test_remove_duplicate_text.<locals>.<listcomp>c                 S   rP   rQ   r   rR   r   r   r   rT     rU   )apache_beam.testing.utilr   utilr   apache_beamPipelineCreater   remove_duplicate_textassert_thatequal_to)r   beam_testing_utilbeaminput_urls_and_textexpected_urls_and_textpipelinepagesdeduped_pagesr   r   r   test_remove_duplicate_text   s$   



"z&C4UtilsTest.test_remove_duplicate_textc                 C   sR   t jrd S t \}}ttjtjt	
 dd|d | ddddt| d S )Nc4zcc_0.warc.wet.gzr   r	   r'   )zwet-filezpage-emittedzpage-filtered-nourl)sixPY2r   r   r   split_wet_fileospathjoinr   fake_examples_dirr1   r3   )r   r   r   r   r   r   test_split_wet_file  s    
zC4UtilsTest.test_split_wet_filer
   )__name__
__module____qualname__r#   r5   r9   r;   r=   rI   rL   rM   re   rn   r   r   r   r   r   <   s    
	3.r   __main__)__doc__
__future__r   r   r   r   rj   rg   tensorflow_datasetsr   )tensorflow_datasets.core.lazy_imports_libr   tensorflow_datasets.textr   r-   r2   r/   r.   r0   r   TestCaser   ro   	test_mainr   r   r   r   <module>   s*   	 n