o
    7ti                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ edejdedefd	d
Zedejdedejdeddf
ddZedejdededdfddZedeeef fddZedeeef fddZdS )    N)cache)Dict)BeautifulSoup)tqdmclienturlreturnc                    s    |  |I d H }|  |jS )N)getraise_for_statustext)r   r   response r   N/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/tasks/ruler/essays.py	fetch_url   s   r   htemp_folderc              
      s   | dd dd}tjtj||rd S zDt| |I d H }t|d}|d}|r[|	t
|}ttj||ddd	}	|	| W d    W d S 1 sSw   Y  W d S W d S  ty| }
 ztd
| dt
|
  W Y d }
~
d S d }
~
ww )N/.htmlz.txtzhtml.parserfontwutf-8encodingFailed to download : )splitreplaceospathexistsjoinr   r   findhandlestropenwrite	Exceptionprint)r   r   r   r   filenamecontentsoupspecific_tagparsedfileer   r   r   process_html_essay!   s*   

&&r0   c              
      s   | dd }tjtj||rd S z.t| |I d H }ttj||ddd}|| W d    W d S 1 s<w   Y  W d S  tyb } zt	d| dt
|  W Y d }~d S d }~ww )Nr   r   r   r   r   r   r   )r   r   r   r    r!   r   r%   r&   r'   r(   r$   )r   r   r   r)   r*   r.   r/   r   r   r   process_text_essay7   s   &&r1   c               	      s  ddt jdd t jdd t d_d_d_d_d_d} t	j
ddd4 I d H P t | I d H }| }d	d
 |D }dd
 |D } fdd
|D }tj|ddiI d H   fdd
|D }tj|ddiI d H  W d   I d H  n1 I d H sw   Y  ttt jd}ttt jd}d}	|| D ]}
t|
ddd}|	| 7 }	W d    n1 sw   Y  qd|	iS )N
essay_repo
essay_htmlT)exist_okFzihttps://raw.githubusercontent.com/NVIDIA/RULER/main/scripts/data/synthetic/json/PaulGrahamEssays_URLs.txtg      >@)timeoutfollow_redirectsc                 S   s   g | ]}d |v r|qS r   r   .0r   r   r   r   
<listcomp>\       zget_essays.<locals>.<listcomp>c                 S   s   g | ]}d |vr|qS r7   r   r8   r   r   r   r:   ]   r;   c                    s   g | ]	}t  |qS r   )r0   r8   )r   r   temp_folder_htmlr   r   r:   `   s    desczDownloading HTML essaysc                    s   g | ]}t  |qS r   )r1   r8   )r   temp_folder_repor   r   r:   f   s    zDownloading text essaysz*.txt rr   r   r   )r   makedirs	html2text	HTML2Textignore_imagesignore_tables
escape_allreference_links	mark_codehttpxAsyncClientr   
splitlines
async_tqdmgathersortedglobr   r!   r%   read)url_listr*   urls	html_urls	text_urls
html_tasks
text_tasks
files_repo
files_htmlr   r.   fr   )r   r   r<   r>   r   
get_essaysF   sF   (rZ   c                   C   s   t t S )z$Synchronous wrapper for get_essays())asynciorunrZ   r   r   r   r   get_all_essaysx   s   r]   )r[   rO   r   	functoolsr   typingr   rB   rI   bs4r   tqdm.asyncior   rL   rJ   r$   r   rC   r0   r1   rZ   r]   r   r   r   r   <module>   sJ   1