o
    7ti                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m	Z	 d dl
mZmZ ddddZd	Zed
 ed  Z					d&dedddeeee ef fddZ						d'dddedededededededee fd d!Zd(d"d#Zd$d% ZdS ))    N)zeta)tqdm)DEFAULT_SEQ_LENGTHSget_tokenizer2   a  Read the following coded text and track the frequency of each coded word. Find the three most frequently appeared coded words. {context}
Question: Do not provide any explanation. Please ignore the dots '....'. What are the three most frequently appeared words in the above coded text?zY Answer: According to the coded text above, the three most frequently appeared words are:)tokens_to_generatetemplateanswer_prefix*   r   r	        
          @max_len	tokenizerz$transformers.PreTrainedTokenizerFastreturnc           
         sJ  fddt |D tt|k r)dtjtjd tt|k st	t
ttt dd< t fdd}|dkrp|}||\}}	t||j| kro||8 }||\}}	t||j| ks\n*|  }||\}}	t||j| k r||7 }||\}}	t||j| k s||8 }||\}}	||	|fS )	Nc                    s"   g | ]}d  tjtj dqS ) k)joinrandomchoicesstringascii_lowercase).0_)coded_wordlen Q/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/tasks/ruler/fwe_utils.py
<listcomp>0   s    z)generate_input_output.<locals>.<listcomp>r   r   z...r   c                    s   t dtd }| |    t  }dd t|tD }dd |D }tt	
| jd|dddd fS )	N   c                 S   s   g | ]	\}}|g| qS r   r   )r   wzir   r   r   r    @       z;generate_input_output.<locals>.gen_text.<locals>.<listcomp>c                 S   s   g | ]	}|D ]}|qqS r   r   )r   wlstxr   r   r   r    A   r$    r   )contextquery   )nparangelenr   zipastypeintr   RandomSEEDshuffleformatr   )	num_wordsr   sampled_cntsampled_words)alphar   vocabr   r   gen_text=   s    z'generate_input_output.<locals>.gen_text)ranger-   setappendr   r   r   r   r   sortedlistr1   r2   r3   TEMPLATE	input_ids)
r   r   r5   r   
vocab_sizeincrementalr8   r:   textanswerr   )r8   r   r   r9   r   generate_input_output&   s8   


rF     Fmax_seq_lengthnum_samplesrB   r   r8   r   remove_newline_tabc              
   C   s   g }|}|dkr|d n|}|}	t |	| |||	d |d\}
}
}tt|d| dD ]M}|}	t |	| ||||	d |d\}}}
t| |j| }|rZd|d	dd
d  }||d |	t
d   |||t
d  d}|| q*|S )Nr   r       )r   rB   rC   r8   zGenerating FWE Samples | )desc)r5   r   rB   rC   r8   r'   
	r	   )indexinputoutputslength
max_length
gen_prefix)rF   r   r;   r-   rA   r   replacestripsplitrfindCONFIGr=   )r   rH   rI   rB   r   r8   r   rJ   write_jsonsinput_max_lenr   num_example_wordsrO   
input_textrE   rR   formatted_outputr   r   r   	sys_kwextV   sN   
	


r_   c                 K   s   t | }t||d}|S )N)r   rH   )r   r_   )
pretrainedrH   kwargsr   rZ   r   r   r   get_dataset   s   rb   c                     sR   |  d|  di   fdd| dtD }dtjjttj	|tj
jdiS )Nr   r`   c                 3   s    | ]	}t  |d V  qdS ))rH   N)rb   )r   seqr`   r   r   	<genexpr>   s
    

zfwe_download.<locals>.<genexpr>max_seq_lengthstest)rW   )getpopr   datasetsDataset	from_listr?   	itertoolschainfrom_iterableSplitTEST)ra   dfr   rd   r   fwe_download   s   

rs   )r   r   r   r   r   )rG   r   r   r   r   F)N)rm   r   r   rj   numpyr+   transformersscipy.specialr   r    lm_eval.tasks.ruler.common_utilsr   r   rY   r2   r@   r0   tuplestrr?   rF   floatbooldictr_   rb   rs   r   r   r   r   <module>   sl   
3	

=	