o
    5t¾i¥  ã                   @   sˆ   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZddlmZ ddl	m
Z
mZ dededefdd	„Zd
edededefdd„ZdS )é    Né   )ÚZStdTextReader)ÚJanitorÚword_ngramsÚdocsÚngrams_pathÚngrams_n_sizec                 C   s(   d}t t| ƒ| ƒ}t tt| ƒƒ|¡S )Ngš™™™™™¹?)ÚintÚlenÚrandomÚsampleÚrange)r   r   r   Úsimulated_overlapÚcontaminated© r   úY/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/decontamination/decontaminate.pyÚget_train_overlap_stub   s   r   Údocs_by_task_setÚlimitÚreturnc           %   
   C   s>  t j |d¡}t t|ddd¡}|d }tƒ }tdƒ t 	¡ }dt
fdd	„}i }	i }
t|  ¡ ƒ}|  ¡ D ]\\}}}t j d
|› ¡sLt  d
|› ¡ |||||ƒ}t j |¡rjt t|dƒ¡|
||f< |d8 }q5tƒ |
||f< d
|› d|› d|› d|› d	}t j |¡ršt|› dƒ t t|dƒ¡|	||f< q5t|› dƒ t t¡}t|ƒD ]\}}t| |¡|ƒ}|D ]	}||  |¡ q¸qªt |t|dƒ¡ ||	||f< q5t 	¡ | }td|d›dƒ g }|dkrtdƒ t 	¡ }t t¡}|	 ¡ D ]\\}}}| ¡ D ]\}}||  |||f¡ qqút 	¡ | }td|d›dƒ t|› d|› dƒ t t j |d¡¡}t|ƒ |D ]·}t 	¡ }td|› ƒ t|ƒ}d}d}d}d} d}!| ¡ D ]M}"|d7 }|" dd¡\}}#||!kr©|d7 }|}!||v r¥| |¡ |d7 }|| D ]\}}}|
||f }$|D ]}|$ |¡ q•qˆ||= q]| d7 } q]td |› ƒ td!|› ƒ td"|› ƒ td#| › ƒ td$ƒ |D ]}t|ƒ qÍt 	¡ | }td%|d›dƒ td&t j |¡d' | › d(ƒ q>t|
ƒ |
 ¡ D ]\\}}}|||||ƒ}t |t|dƒ¡ qþd)d*„ |
 ¡ D ƒS )+Nz	info.jsonÚrzutf-8)ÚencodingÚ
ngram_sizezBuilding Lookups...r   c              	   S   s   d| › d|› d|› d|› d	S )Núdata/ú/Ú_Úgrams_limitz	.overlapsr   )Ú	task_nameÚtask_setr   r   r   r   r   Úget_overlaps_dump_path2   s   z1get_train_overlap.<locals>.get_overlaps_dump_pathr   Úrbr   r   r   r   z.lookupz available, loading...z not available, building...ÚwbzBuilding lookups took z0.5fz	 seconds.r   zMerging lookups...zMerging lookups took z grams files found in ú:z*.sorted.zstz	Scanning Ú ú zTotal Ngrams: zUnique Ngrams: zUnique Matching: zUnique Non Matching: zMatched ngrams:z
Read took zSpeed: g    €„.Az	MB/secondc                 S   s   i | ]	\\}}}||“qS r   r   )Ú.0r   r   Údoc_idsr   r   r   Ú
<dictcomp>¦   s    z%get_train_overlap.<locals>.<dictcomp>) ÚosÚpathÚjoinÚjsonÚloadÚopenr   ÚprintÚtimeÚperf_counterÚstrr
   ÚkeysÚitemsÚexistsÚmkdirÚpickleÚsetÚcollectionsÚdefaultdictÚ	enumerater   Únormalize_stringÚaddÚdumpÚlistÚappendÚglobr   Ú	read_tqdmÚrsplitÚgetsize)%r   r   r   Úinfo_dict_pathÚ	info_dictr   ÚjanitorÚstartr   ÚlookupsÚ
duplicatesÚsets_to_decontaminater   r   r   Úoverlaps_dump_pathÚtask_set_lookup_pathÚlookupÚdoc_idÚdocumentÚngramsÚngramÚelapsedÚmatched_ngramsÚmerged_lookupr&   ÚfilesÚfileÚreaderÚtotal_ngramsÚunique_ngramsÚmatching_uniqueÚnon_matching_uniqueÚcurrent_ngramÚlineÚdocument_idÚtask_doc_setr   r   r   Úget_train_overlap%   sÄ   ÿÿÿÿ
ÿ

ÿ


ÿ
€$ÿr`   )r8   r@   r+   r(   r6   r   r/   Úarchiverr   rF   r   r   Údictr1   r   r	   r`   r   r   r   r   Ú<module>   s    