o
    ϯi                     @   s  d dl Zd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZ dd Zdd Zdd	 Zed
kr	e ZejZeeejejZejdkrhejD ]Zee q`ejspee dev rwde_ ej rdd eD Z!ndd eD Z!e"e!gZ#e#$ej%ej&ej'ede(ee)ddde*dg ej+e# Z,ej-e,ej.dd dZ/d Z0dZ1zee2e/D ]\Z3Z4e5de3 e5de4 e3Z0e6e4Z1qW dS    e7ddZ8ej9e8d W d   n1 sw   Y  e5de0 e5de1 Y dS dS )    N)tqdm)tokenizec                  C   s~   t  } | jdtd dd | jdtddd | jdtdd	d | jd
dd dd | jdtdd | jddddd |  }|S )Nz
--tar-pathzPath to the tars)typedefaulthelpz--startr   zstart from tar-path + startz--endi zend with tar-path + endz	--exclude+zexclude tar-path + exclude)nargsr   r   z--batch-size   )r   r   z--orderF
store_truez$if keep the search order accendingly)r   actionr   )argparseArgumentParseradd_argumentstrint
parse_args)parserargs r   D/home/ubuntu/.local/lib/python3.10/site-packages/tests/check_tars.pyr      sL   r   c                 C   s   t dt|  d dS )zUCall in an exception handler to ignore any exception, isssue a warning, and continue.zHandling webdataset error (z). Ignoring.T)loggingwarningrepr)exnr   r   r   log_and_continue8   s   r   c                 C   s   d}d}t t| | \}}t| | d}|| d< |d }t|tr9t|d t	r9t
|dkr9t|}|| d< t|| d< | S )	z7
    Preprocess a single sample for wdsdataloader.
    flacjsonzutf-8waveformtextr   r	   raw_text)sfreadioBytesIOr   loadsdecode
isinstancelistr   lenrandomchoicer   )sample	audio_exttext_ext
audio_dataorig_srjson_dict_rawtextsr   r   r   
preprocess=   s   $
r2   __main__awsFc                 C   $   g | ]}t jtjt|d  qS )z.tarospathjoinr   tar_pathr   .0ir   r   r   
<listcomp>[      $ r>   c                 C   r5   )z.tar -r6   r;   r   r   r   r>   ]   r?   )handler__url____key__r   r	   )
batch_sizeshufflenum_workerszk:zbatch:zcheck_tar_log.txta)filezold_k:z
old_batch:):
webdatasetwds	soundfiler    r"   r7   r)   copyr   shutilr   	tracebackr   r   
laion_clapr   r   r   r2   __name__r   r:   r'   rangestartendidx_listexcludexremoveorderrD   localinput_shardsSimpleShardListpipelineextendsplit_by_nodesplit_by_workertarfile_to_samplesmapto_tuplebatchedDataPipelinedataset	WebLoaderrC   
dataloaderold_k	old_batch	enumeratekbatchprintdeepcopyopenrG   	print_excr   r   r   r   <module>   sv    (










