o
    5ti2                     @   s  d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	 zd dl
Z
dZW n ey9   ed e  dZY nw e	dZdee ded	eeed
f  fddZdeded	ee fddZded	eeeeeef f  fddZdeded	eeeeeef f  fddZG dd dZdS )    N)IteratorListSequenceTupleTypeVarTzGWARNING: C++ module could not be loaded. Janitor running in python modeFTsequencenreturn.c                 c   sr    g }|dkr%zt | }W n
 ty   Y d S w || |d8 }|dks| D ]}|| t|V  |d= q'd S )N   r   )nextStopIterationappendtuple)r   r	   history	next_itemitem r   S/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/decontamination/janitor.pyform_ngrams   s    
	

r   sc                 C   s$   |   }tt||}dd |D S )z Splits a string into ngram wordsc                 s   s    | ]}d  |V  qdS ) Njoin).0ngramr   r   r   	<genexpr>.   s    zword_ngrams.<locals>.<genexpr>)splitr   iter)r   r	   tokens
ngram_seqsr   r   r   word_ngrams*   s   r!   c                 C   s   dd t d| D S )zSplits a string on whitespaces and records the indices of each in the original string.
    @:return generator((word, (start_idx, end_idx)), ...)
    c                 s   s.    | ]}| d | | d ffV  qdS )r   r   N)groupstartend)r   mr   r   r   r   N   s   , z split_indices.<locals>.<genexpr>z\S+)refinditer)r   r   r   r   split_indicesJ   s   r(   c                 C   s.   t | }t||}dd |D }dd |D S )zDSplits a string into pairs of (ngram words, their start/end indices)c                 s   s    | ]}t | V  qd S N)zip)r   ngram_with_indicesr   r   r   r   b   s    
z&word_ngrams_indices.<locals>.<genexpr>c                 s   s6    | ]\}}d  ||d d |d d ffV  qdS )r   r   r   Nr   )r   	ngram_seqindicesr   r   r   r   g   s
     
)r(   r   )r   r	   tokens_with_indicesngram_seqs_with_indicesngram_indices_pairsr   r   r   word_ngrams_indicesQ   s   
r2   c                   @   s   e Zd Zddddejfdededededed	d
fddZded	d
fddZded	d
fddZ	ded	d
fddZ
ded	ee fddZdedee d	ee fddZd&ddZded	ee fddZded	efd d!Zded	d
fd"d#Zded	ee fd$d%Zd
S )'Janitor      
   ngram_nwindow_to_removetoo_dirty_cutoffminimum_slice_lengthdelete_charsr
   Nc                 C   sJ   || _ || _|| _|| _|| _t | _tt	j
t	j t	j
d | j| _d S )N   )r7   r8   r9   r:   r;   setdirt_ngramsstr	maketransstringascii_lowercaseascii_uppercasetranslation_table)selfr7   r8   r9   r:   r;   r   r   r   __init__o   s   

zJanitor.__init__filenamec                 C   s<   t |d}t|| W d    d S 1 sw   Y  d S )Nwb)openpickledumprE   rG   fpr   r   r   save_contamination_ngrams      "z!Janitor.save_contamination_ngramsc                 C   s<   t |d}t|| _W d    d S 1 sw   Y  d S )Nrb)rI   rJ   loadr>   rL   r   r   r   load_contamination_ngrams   rO   z!Janitor.load_contamination_ngramsdirt_stringc                 C       t r| |S td | |S )zRegister a string as contamination to be removed, e.g. a test set
        This breaks the dirt_string into ngrams to store for future cleaning'WARNING: Janitor running in python mode)JANITOR_CPPregister_contaminant_cppprintregister_contaminant_pythonrE   rS   r   r   r   register_contaminant   s   

zJanitor.register_contaminantdirty_stringc                 C   rT   )zClean a string (e.g. a training set) by removing all ngrams previously
        registered as contaminants. Returns a list of clean chunks, or empty if
        the string was too dirtyrU   )rV   	clean_cpprX   clean_python)rE   r\   r   r   r   clean   s   

zJanitor.cleandirty_partsc           	      C   s   g }d}d}t |D ]4\}\}}}|| jkrg   S td|| j }tt||| j }|| | jkr<||||  |}q
|t|| j k rS|||d d   |S )Nr   r,   r   )	enumerater9   maxr8   minlenr:   r   )	rE   r\   r`   clean_chunks
splice_idxr$   ir   r#   r   r   r   _split_chunks   s   
zJanitor._split_chunksc                 C   s   | j t|| j| j d S r)   )r>   updatejanitor_utilclean_ngramr;   r7   rZ   r   r   r   rW         z Janitor.register_contaminant_cppc                 C   s   t || j| j}| ||S r)   )rj   clean_ngram_with_indicesr;   r7   rh   rE   r\   contamination_indicesr   r   r   r]      s   
zJanitor.clean_cppr   c                 C   s   | | jS r)   )	translaterD   )rE   r   r   r   r   normalize_string   s   zJanitor.normalize_stringc                 C   s   | j t| || j d S r)   )r>   ri   r!   rq   r7   rZ   r   r   r   rY      rl   z#Janitor.register_contaminant_pythonc                    s&    fddt | jD } ||S )Nc                 3   s0    | ]\}}  | jv rd g|R V  qd S r)   )rq   r>   )r   dirty_ngramidx_pairrE   r   r   r      s    
z'Janitor.clean_python.<locals>.<genexpr>)r2   r7   rh   rn   r   rt   r   r^      s   

zJanitor.clean_python)r
   N)__name__
__module____qualname__rA   punctuationintr?   rF   rN   rR   r[   r   r_   r   r   rh   rW   r]   rq   rY   r^   r   r   r   r   r3   m   sF    
	



r3   )rJ   r&   rA   	tracebacktypingr   r   r   r   r   rj   rV   	ExceptionrX   	print_excr   ry   r   r?   r!   r(   r2   r3   r   r   r   r   <module>   s&    && *