o
    7ti                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZ ddddZe	dZ
ed	 ed
  Ze Zeeedd dD Ze
e d#ddZ			d#dededededef
ddZ				d$dededefddZd%dd Zd!d" ZdS )&    N)tqdm)DEFAULT_SEQ_LENGTHSget_tokenizerx   zBelow is a numbered list of words. In these words, some appear more often than others. Memorize the ones that appear most often.
{context}
Question: What are the 10 most common words in the above list?zA Answer: The top 10 words that appear most often in the list are:)tokens_to_generatetemplateanswer_prefix*   r   r   c                 C   s    g | ]}t j| D ]}|q	qS  )r_categories).0xitemr
   r
   Q/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/tasks/ruler/cwe_utils.py
<listcomp>%   s     r   )noun	adjectiveverb      
   c           	      C   sh   t t| }|d | ||d  }}|t| |t|  }t| ddd t|D }||fS )N c                 S   "   g | ]\}}|d   d| qS    z. r
   r   iwordr
   r
   r   r   2      " zget_example.<locals>.<listcomp>)randomsampleWORDSintRNGshufflejoin	enumerate)		num_wordscommon_repeatsuncommon_repeatscommon_numsword_list_fullcommonuncommon	word_listcontextr
   r
   r   get_example+   s   
r1   r(   max_seq_lengthfreq_cwfreq_ucwnum_cwc                 C   s   |dk rt ddd|\}}t | dd|\}}nt ddd|\}}t | |||\}}t}	|	j|dd	d
dd t|D  }
|	j|dd	}|
||fS )Ni      r   r      (   r    )r0   queryr   c                 S   r   r   r
   r   r
   r
   r   r   J   r   z)generate_input_output.<locals>.<listcomp>)r1   TEMPLATEformatr&   r'   )r(   r2   r3   r4   r5   context_exampleanswer_exampler0   answerr   input_example
input_textr
   r
   r   generate_input_output7   s"   
rB   Fnum_samplesincrementalc              	   C   s  |d usJ dg }|}|}d}|| |k rWt ||\}	}
}t||	d |
 d ddd t|D  j}|| |krB||8 }n||7 }|ttkrQtt}n|| |k stt| d| dD ]w}|}	 zt ||\}	}
}t||
j| }||ksJ | d
W n   ||kr||8 }Y qg|rd|
dddd	 
 }
d|	dddd	 
 }	|
td }|
d | }
||
	 |	|||td 	 d}|| qb|S )NzTokenizer is not provided.r   
r   c                 S   r   r   r
   r   r
   r
   r   r   o   r   z(sys_word_pair_random.<locals>.<listcomp>zGenerating CWE Samples | )descTz exceeds max_seq_length.	r   )indexinputr@   outputslength
max_length
gen_prefix)rB   lenr&   r'   	input_idsr"   r   rangereplacestripsplitrfindCONFIGappend)rC   r2   	tokenizerrD   remove_newline_tabr   write_jsonsr(   total_tokensr@   rA   r?   rH   
used_wordsrK   gen_prefix_indexformatted_outputr
   r
   r   sys_word_pair_randomT   s   



	r^   c                 K   s   t | }td||d}|S )Ni  )rC   r2   rW   )r   r^   )
pretrainedseqkwargsrW   rY   r
   r
   r   get_dataset   s
   rb   c                     sR   |  d|  di   fdd| dtD }dtjjttj	|tj
jdiS )NrW   r_   c                 3   s    | ]	}t  |d V  qdS ))r`   N)rb   )r   r`   r_   r
   r   	<genexpr>   s
    

z!get_cw_dataset.<locals>.<genexpr>max_seq_lengthstest)rS   )getpopr   datasetsDataset	from_listlist	itertoolschainfrom_iterableSplitTEST)ra   dfr
   rc   r   get_cw_dataset   s   

rs   )r   r   r   )Nr   Fr   )N)rm   r    ri   wonderwordsr    lm_eval.tasks.ruler.common_utilsr   r   rU   Randomr$   r;   
RandomWordr   sortedrl   setr"   r%   r1   r#   rB   r^   rb   rs   r
   r
   r
   r   <module>   s^   



 

U