o
    Ni3#                  
   @   s$  d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlm	Z
 dZdZdZd	Zd
Zdddddddddd	ZG dd de
jjZG dd de
jjZedZedZedZedZedZedZdadd  Zd!d" Zd#d$ Z d%d& Z!d'd( Z"d)d* Z#d+d, Z$d-d. Z%dS )/zBIGPATENT Dataset.    )absolute_import)division)print_functionNa  
@misc{sharma2019bigpatent,
    title={BIGPATENT: A Large-Scale Dataset for Abstractive and Coherent Summarization},
    author={Eva Sharma and Chen Li and Lu Wang},
    year={2019},
    eprint={1906.03741},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
a  
BIGPATENT, consisting of 1.3 million records of U.S. patent documents
along with human written abstractive summaries.
Each US patent application is filed under a Cooperative Patent Classification
(CPC) code. There are nine such classification categories:
A (Human Necessities), B (Performing Operations; Transporting),
C (Chemistry; Metallurgy), D (Textiles; Paper), E (Fixed Constructions),
F (Mechanical Engineering; Lightning; Heating; Weapons; Blasting),
G (Physics), H (Electricity), and
Y (General tagging of new or cross-sectional technology)

There are two features:
  - description: detailed description of patent.
  - summary: Patent abastract.

zPhttps://drive.google.com/uc?export=download&id=1mwH7eSh1kNci31xduR4Da_XcmTE8B8C3descriptionabstractzHuman Necessitiesz#Performing Operations; TransportingzChemistry; MetallurgyzTextiles; PaperzFixed Constructionsz=Mechanical Engineering; Lightning; Heating; Weapons; BlastingPhysicsElectricityz4General tagging of new or cross-sectional technology)	abcdefghyc                       s*   e Zd ZdZejjd fdd	Z  ZS )BigPatentConfigzBuilderConfig for BigPatent.Nc                    s:   t t| jdtjddtjdgd| || _dS )zBuilderConfig for Wikihow.

    Args:
      cpc_codes: str, cpc_codes
      **kwargs: keyword arguments forwarded to super.
    z2.0.0zUpdated to cased raw strings.z1.0.0)versionsupported_versionsN )superr   __init__tfdscoreVersion	cpc_codes)selfr   kwargs	__class__r   `/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/summarization/big_patent.pyr   O   s   
zBigPatentConfig.__init__N)	__name__
__module____qualname____doc__r   r   disallow_positional_argsr   __classcell__r   r   r   r    r   L   s    r   c                   @   sP   e Zd ZdZeddddgdd ee D  Zdd	 Z	d
d Z
dddZdS )	BigPatentzBigPatent datasets.*allzPatents under all categories.r   namer   c              	   C   s&   g | ]\}}t ||d ||dqS )z=Patents under Cooperative Patent Classification (CPC){0}: {1}r+   )r   format).0kvr   r   r    
<listcomp>h   s    zBigPatent.<listcomp>c              
   C   s8   t jj| tt jtt j tt j ittfdt	dS )Nz&https://evasharma.github.io/bigpatent/)builderr   featuressupervised_keyshomepagecitation)
r   r   DatasetInfo_DESCRIPTIONr3   FeaturesDict	_DOCUMENTText_SUMMARY	_CITATION)r   r   r   r    _infoq   s   

zBigPatent._infoc                    s   | t g d}| fdd|D fdd|D tjjtjjdd idtjjtjjdd idtjjtjj	dd	 idgS )
zReturns SplitGenerators.)trainvaltestc              	      s"   i | ]}|t j d |d qS )bigPatentDataNonTokenizedz.tar.gzospathjoinr.   r/   )dl_pathr   r    
<dictcomp>   s    z/BigPatent._split_generators.<locals>.<dictcomp>c                    s    i | ]}|t j | |qS r   rC   rG   )extract_pathsr   r    rI      s     rE   r?   )r,   
gen_kwargsr@   rA   )
download_and_extract_URLextractr   r   SplitGeneratorSplitTRAIN
VALIDATIONTEST)r   
dl_managersplit_typesr   )rH   rJ   r    _split_generators~   s&   



zBigPatent._split_generatorsNc                 C   sF   t jjj}dd }tj|| jjd}|d|j	j
|? B ||B S )zBuild PCollection of examples.c                 s   s6    t | }|d tt|t tt|t ifV  d S )Npublication_number)jsonloadsr:   _bigpatent_clean_descriptionr<   _bigpatent_clean_abstract)rowjson_objr   r   r    _process_example   s   
z6BigPatent._build_pcollection.<locals>._process_exampler)   
ReadTextIO)r   r   lazy_importsapache_beamrD   rE   rF   builder_configr   iotextioReadFromTextFlatMap)r   pipelinerE   beamr^   file_patternr   r   r    _build_pcollection   s   
zBigPatent._build_pcollectionr!   )r"   r#   r$   r%   r   sorted_CPC_DESCRIPTIONitemsBUILDER_CONFIGSr>   rV   rj   r   r   r   r    r(   `   s    
r(   z(FIG.)\s+(\d)(,*)\s*(\d*)z(FIGS.)\s+(\d)(,*)\s*(\d*)z(FIGURE)\s+(\d)(,*)\s*(\d*)z	\[(\d+)\]z^\s*\[(\d+)\]z^(\s*)TABLE\s+\d+(\s+(.*))?$c                   C   s   t sttjjjjj a t S r!   )_ENGLISH_WORDS	frozensetr   r   r`   nltkcorpuswordsr   r   r   r    _get_english_words   s   rt   c                 C   s   d dd | dD S )N c                 S   s   g | ]}|r|qS r   r   )r.   wr   r   r    r1      s    z0_remove_excessive_whitespace.<locals>.<listcomp>)rF   splittextr   r   r    _remove_excessive_whitespace   s   rz   c                 C   s   t dd|  } t| } | S )zCleans the abstract text.z[\(\{\[].*?[\}\)\]] )resubstriprz   rx   r   r   r    r[      s   r[   c                 C   s(   t d| } td| } td| } | S )z(Remove references from description text.zFIG\2 )	_FIG_EXP1r}   	_FIG_EXP2	_FIG_EXP3rx   r   r   r    _bigpatent_remove_referenecs   s   r   c                 C   s   dd |   dD S )zRemove non-empty lines.c                 S   s$   g | ]}|  rtd |  qS )r{   )r~   _NON_EMPTY_LINESr}   )r.   sr   r   r    r1      s    z:_bigpatent_get_list_of_non_empty_lines.<locals>.<listcomp>T)r~   
splitlinesrx   r   r   r    &_bigpatent_get_list_of_non_empty_lines   s   r   c                 C   s   g }d}d}|t | k rZ| | }|dkr"t|rd}n4|| n.|dkrP|dd}d}|D ]}| s9q2|t v rO|d7 }|dkrOd}||  nq2|d7 }|t | k s|S )z$Remove Tables from description text.r      	ru      )len_TABLE_HEADERmatchappendr~   rw   isalphart   )	sentencesnew_sentencesitable_startsentencers   num_engrv   r   r   r    _bigpatent_remove_tables   s2   


r   c                 C   s6   g }| D ]}t |d}t|dkr|| q|S )z)Remove sentences with less than 10 words.ru   
   )setrw   r   r   )r   r   r   rs   r   r   r    '_bigpatent_remove_lines_with_less_words  s   
r   c                 C   s6   t | }t|}t|}d|} t| } t| } | S )zClean the description text.
)r   r   r   rF   r   rz   )ry   r   r   r   r    rZ     s   
rZ   )&r%   
__future__r   r   r   rX   rD   r|   tensorflow_datasets.public_api
public_apir   r=   r8   rM   r:   r<   rl   r   BuilderConfigr   BeamBasedBuilderr(   compiler   r   r   _LINE_NUM_EXPr   r   ro   rt   rz   r[   r   r   r   r   rZ   r   r   r   r    <module>   sP   
R




%
