o
    .i>l                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZ ddl Z ddl!Z ddl"Z#ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl&m*Z* ddl&m%Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 zddlm3Z3 ddl4Z5ddl4m6Z6 W n e7y   d Z5 Z6Z3Y nw e*8e9Z:ej;Z<dd Z=dee>e?f de>fddZ@dd ZAde?de?deeBe?e?f  fd d!ZCd"d# ZDed$d% ZEedkd'e>fd(d)ZFd*d+ ZGd,d- ZHd.d/ ZId0d1 ZJG d2d3 d3eBZKG d4d5 d5eLZMd6d7 ZN	&	8	&	&			&	9		8	dld:eegef d;ed<eOd=eOd>eOd?eOd@ee> dAe>dBeOdCee> dDeeP dEeOdFee? defdGdHZQG dIdJ dJZRdmdLdMZSdNe?dePe?ee? f fdOdPZTdQe?de2fdRdSZUdTe?dePe?e?e?e?f fdUdVZVdWdX ZWedYZXdZejYd[ed\eeX f d]eBde>fd^d_ZZd`eej[j\e j[j\f de]e> fdadbZ^d`eej[j\e j[j\f d[ed\eeX f dceeB deeX fdddeZ_edfZ`dgee` dhe>deeae`  fdidjZbdS )nz'Some python utils function and classes.    N)Iterable)contextmanager)fieldsis_dataclass)Manager)Path)Empty)
disk_usage)AnyCallableOptionalTypeVarUnion)urlparse)tqdm   )config)parallel_map   )logging)Picklerdumpdumpspklregister)FileLock)Final)Literalc                 C   sX   | sdS g d}t | } |D ]\}}| | }|dkr$|dd|   S qt|  dS )a6  Returns a human readable size string.

    If size_in_bytes is None, then returns "Unknown size".

    For example `size_str(1.5 * datasets.units.GiB) == "1.50 GiB"`.

    Args:
        size_in_bytes: `int` or `None`, the size, in bytes, that we want to
            format as a human-readable size string.
    zUnknown size))PiB           )TiB        )GiB   @)MiB   )KiB   g      ?z.2f z bytes)floatint)size_in_bytes
_NAME_LISTname
size_bytesvalue r/   K/home/ubuntu/.local/lib/python3.10/site-packages/datasets/utils/py_utils.pysize_strK   s   r1   sizereturnc                 C   s  t | tr| S |  drt| dd d S |  dr)t| dd d S |  dr:t| dd d S |  d	rKt| dd d
 S |  dr\t| dd d S |  drxt| dd d }| drv|d S |S |  drt| dd d }| dr|d S |S |  drt| dd d }| dr|d S |S |  drt| dd d }| dr|d S |S |  drt| dd d }| dr|d S |S td|  d)a)  
    Converts a size expressed as a string with digits an unit (like `"50MB"`) to an integer (in bytes).

    Args:
        size (`int` or `str`): The size to convert. Will be directly returned if an `int`.

    Example:

    ```py
    >>> convert_file_size_to_int("1MiB")
    1048576
    ```
    PIBNr   TIBr    GIBr"   MIBr$   KIBr&   PBl     I5 b   TBl    J)GBi ʚ;MBi@B KB  z`size=zM` is not in a valid format. Use an integer followed by the unit, e.g., '5GB'.)
isinstancer)   upperendswith
ValueError)r2   int_sizer/   r/   r0   convert_file_size_to_intc   s8   
rH   c                 C   sb   |  dd dd dd dd d	d
 dd dd dd dd ddd
 ddS )N\z\\.z\.*z.*+z\+z///(z\()z\)|z\|^z\^$z\$?)replacerstrip)patternr/   r/   r0   glob_pattern_to_regex   s   
rW   stringrV   c                 C   s^   t dd|}t dd|}t || }|du rdS t| }t d|}tt||}|S )a  Un-format a string using a python f-string pattern.
    From https://stackoverflow.com/a/36838374

    Example::

        >>> p = 'hello, my name is {name} and I am a {age} year old {what}'
        >>> s = p.format(name='cody', age=18, what='quarterback')
        >>> s
        'hello, my name is cody and I am a 18 year old quarterback'
        >>> string_to_dict(s, p)
        {'age': '18', 'name': 'cody', 'what': 'quarterback'}

    Args:
        string (str): input string
        pattern (str): pattern formatted like a python f-string

    Returns:
        Optional[dict[str, str]]: dictionary of variable -> value, retrieved from the input using the pattern, or
        `None` if the string does not match the pattern.
    z{([^:}]+)(?::[^}]+)?}z{\1}z{(.+?)}z(?P<_\1>.+)N)resubsearchlistgroupsfindalldictzip)rX   rV   regexresultvalueskeys_dictr/   r/   r0   string_to_dict   s   rf   c                    s>   dd  fdd t | ts| st|  d | S )zbConvert an object to its dictionary representation recursively.

    <Added version="2.4.0"/>
    c                 S   s   t | o	t| t S N)r   rC   typeobjr/   r/   r0   _is_dataclass_instance   s   z&asdict.<locals>._is_dataclass_instancec                    s   | r+i }t | D ]} t| |j}|jr#||jks#|jddr(|||j< q
|S t| trBt	| drBt
|  fdd| D  S t| ttfrVt
|  fdd| D S t| trf fdd	|  D S t| S )
N$include_in_asdict_even_if_is_defaultF_fieldsc                    s   g | ]} |qS r/   r/   .0v_asdict_innerr/   r0   
<listcomp>   s    z1asdict.<locals>._asdict_inner.<locals>.<listcomp>c                 3   s    | ]} |V  qd S rg   r/   rn   rq   r/   r0   	<genexpr>       z0asdict.<locals>._asdict_inner.<locals>.<genexpr>c                    s   i | ]\}} | |qS r/   r/   ro   krp   rq   r/   r0   
<dictcomp>       z1asdict.<locals>._asdict_inner.<locals>.<dictcomp>)r   getattrr,   initdefaultmetadatagetrC   tuplehasattrrh   r\   r_   itemscopydeepcopy)rj   rb   fr.   rr   rk   r/   r0   rr      s   


zasdict.<locals>._asdict_innerz is not a dict or a dataclass)rC   r_   	TypeErrorri   r/   r   r0   asdict   s
   r   c              
   c   sB    t | |d}t| || zdV  W t| || dS t| || w )z%Temporarily assign obj.attr to value.N)rz   setattr)rj   attrr.   originalr/   r/   r0   temporary_assignment   s   r   Fseedc              	   c   s   t j }t j|  |r0tjr0ddl}|j }|j|  |j	
 r0|j	 }|j	|  |rotjroddl}ddlm} |j }	|jj| }
|j|
 | sYtd| }|j}t|d}|rj|j}||  z@dV  W t j| |rtjr|j| |j	
 r|j	| |rtjr|j|	 ||_|r||_dS t|d dS dS dS t j| |rtjr|j| |j	
 r|j	| |rtjr|j|	 ||_|r||_w t|d w w w )zUTemporarily set the random seed. This works for python numpy, pytorch and tensorflow.r   N)contextzBSetting random seed for TensorFlow is only available in eager mode_rng)nprandom	get_stater   r   TORCH_AVAILABLEtorchget_rng_statemanual_seedcudais_availableget_rng_state_allmanual_seed_allTF_AVAILABLE
tensorflowtensorflow.python.eagerr   get_global_generator	Generator	from_seedset_global_generatorexecuting_eagerlyrF   _seedr   r   _set_global_seed	set_stateset_rng_stateset_rng_state_alldelattr)r   set_pytorchset_tensorflownp_stater   torch_statetorch_cuda_statestftfpycontexttf_statetemp_gen
tf_contexttf_seedtf_rng_initializedtf_rngr/   r/   r0   	temp_seed   sd   















r   c                 c   s.    t  }| D ]}||vr|| |V  qdS )z=Iterate over iterable and return only unique values in order.N)setadd)rc   seenr.   r/   r/   r0   unique_values*  s   
r   c                    s    fdd}|S )z4If the value is None, return None, else call `func`.c                    s   | d ur | S d S rg   r/   )r.   funcr/   r0   wrapper6  s   z'no_op_if_value_is_null.<locals>.wrapperr/   )r   r   r/   r   r0   no_op_if_value_is_null3  s   r   c                 C   s*   t | D ]\}}|dur||f  S qdS )zwReturn the index and the value of the first non-null value in the iterable. If all values are None, return -1 as index.N)N)	enumerate)iterableir.   r/   r/   r0   first_non_null_value<  s
   r   c                  '   s6    t tj|  D ]  t fdd| D fV  qdS )z9Iterate over items of dictionaries grouped by their keys.c                 3   s    | ]}|  V  qd S rg   r/   )ro   dkeyr/   r0   rt   H  ru   zzip_dict.<locals>.<genexpr>N)r   	itertoolschainr   )dictsr/   r   r0   zip_dictD  s   r   c                       s8   e Zd ZdZ fddZ fddZ fddZ  ZS )NonMutableDictzDict where keys can only be added but not modified.

    Will raise an error if the user try to overwrite one key. The error message
    can be customized during construction. It will be formatted using {key} for
    the overwritten key.
    c                    s0   | dd| _|rtdt j|i | d S )N	error_msgz$Try to overwrite existing key: {key}z1NonMutableDict cannot be initialized with kwargs.)pop
_error_msgrF   super__init__)selfargskwargs	__class__r/   r0   r   S  s   zNonMutableDict.__init__c                    s(   || v rt | jj|dt ||S )Nr   )rF   r   formatr   __setitem__)r   r   r.   r   r/   r0   r   \  s   zNonMutableDict.__setitem__c                    s@   t  fdd|D rt jjt t|@ dt |S )Nc                 3   s    | ]}| v V  qd S rg   r/   )ro   rw   r   r/   r0   rt   b  ru   z(NonMutableDict.update.<locals>.<genexpr>r   )anyrF   r   r   r   r   update)r   otherr   r   r0   r   a  s   zNonMutableDict.update)__name__
__module____qualname____doc__r   r   r   __classcell__r/   r/   r   r0   r   K  s
    	r   c                   @   s   e Zd ZdZdddZdS )classpropertyz5Descriptor to be used as decorator for @classmethods.Nc                 C   s   | j d | S rg   )fget__get__)r   rj   objtyper/   r/   r0   r   j     zclassproperty.__get__rg   )r   r   r   r   r   r/   r/   r/   r0   r   g  s    r   c           	         s  | \} }}}t |ts!t |s!r|gd S |S rDt |tsDt |rDtfdd|D rDfddt| D S |durSt tjk rSt  |durj|sjtdd t	j
D rjtdd	d
d t |trs| n|}|dur|dur|d nd	d t| n|}t|||d|dO}t |tr fdd|D W  d   S  fdd|D }t |tr|W  d   S t |trt|W  d   S t|W  d   S 1 sw   Y  dS )zEApply a function recursively to each element of a nested data struct.r   c                 3   s     | ]}t |t f V  qd S rg   )rC   r_   rn   )typesr/   r0   rt   |  s    z%_single_map_nested.<locals>.<genexpr>c                    s   g | ]} |D ]}|qqS r/   r/   )ro   batchmapped_item)functionr/   r0   rs   ~  ry   z&_single_map_nested.<locals>.<listcomp>Nc                 s   s    | ]}d |j v V  qdS )notebookN)r   )ro   tqdm_clsr/   r/   r0   rt     s    r'    T)endflush#rj   )disablepositionunitdescc                    s*   i | ]\}}|t | d dd fqS NT_single_map_nestedrv   
batch_sizebatchedr   r   r/   r0   rx     s    z&_single_map_nested.<locals>.<dictcomp>c                    $   g | ]}t | d dd fqS r   r   rn   r   r/   r0   rs     s   $ )rC   r_   alliter_batchedr   get_verbosityWARNINGset_verbosity_warningr   r   __mro__printr   strhf_tqdmr\   r   r   array)	r   data_structrankdisable_tqdmr   pbar_iterable	pbar_descpbarmappedr/   r   r0   r   n  sF    ,


	$r   TrB   r   r  	dict_onlymap_list	map_tuple	map_numpynum_procparallel_min_lengthr   r   r   r  r   c                    sb  du r"g |s|r t |r t |r tj tt|ts=t|s=r1|g}|}r;|d }|S t|trHt| n|du rPdtfddD rk fddD }ndkrsdksyt	k rr du s dkrt
t	 tt	 dk d tt  fd	dt||d
D }rdd |D }nSt F tjddtd r du sψ dkrt	 tt	 dk  tt t ||t	}rdd |D }W d   n	1 sw   Y  t|trtt| |S t|tr"|S t|tr,t|S t|S )a4	  Apply a function recursively to each element of a nested data struct.

    Use multiprocessing if num_proc > 1 and the length of data_struct is greater than or equal to
    `parallel_min_length`.

    <Changed version="2.5.0">

    Before version 2.5.0, multiprocessing was not used if `num_proc` was greater than or equal to ``len(iterable)``.

    Now, if `num_proc` is greater than or equal to ``len(iterable)``, `num_proc` is set to ``len(iterable)`` and
    multiprocessing is used.

    </Changed>

    Args:
        function (`Callable`): Function to be applied to `data_struct`.
        data_struct (`Any`): Data structure to apply `function` to.
        dict_only (`bool`, default `False`): Whether only apply `function` recursively to `dict` values in
            `data_struct`.
        map_list (`bool`, default `True`): Whether also apply `function` recursively to `list` elements (besides `dict`
            values).
        map_tuple (`bool`, default `False`): Whether also apply `function` recursively to `tuple` elements (besides
            `dict` values).
        map_numpy (`bool, default `False`): Whether also apply `function` recursively to `numpy.array` elements (besides
            `dict` values).
        num_proc (`int`, *optional*): Number of processes.
            The level in the data struct used for multiprocessing is the first level that has smaller sub-structs,
            starting from the root.
        parallel_min_length (`int`, default `2`): Minimum length of `data_struct` required for parallel
            processing.
            <Added version="2.5.0"/>
        batched (`bool`, defaults to `False`):
            Provide batch of items to `function`.
            <Added version="2.19.0"/>
        batch_size (`int`, *optional*, defaults to `1000`):
            Number of items per batch provided to `function` if `batched=True`.
            If `batch_size <= 0` or `batch_size == None`, provide the full iterable as a single batch to `function`.
            <Added version="2.19.0"/>
        types (`tuple`, *optional*): Additional types (besides `dict` values) to apply `function` recursively to their
            elements.
        disable_tqdm (`bool`, default `True`): Whether to disable the tqdm progressbar.
        desc (`str`, *optional*): Prefix for the tqdm progressbar.

    Returns:
        `Any`
    Nr   r   c                 3   s*    | ]}t |ot|t kV  qd S rg   )rC   lenrn   )r   r   r/   r0   rt     s   ( zmap_nested.<locals>.<genexpr>c                    s"   g | ]}t | d qS ))r   r  r  r  r   r   r   )
map_nestedro   rj   )r   r   r   r  r  r   r/   r0   rs     s    
zmap_nested.<locals>.<listcomp>r   c                    r   r   r   r  r   r/   r0   rs     s    )r   r   c                 S      g | ]	}|D ]}|qqS r/   r/   ro   mapped_batchr   r/   r/   r0   rs         ignorezL.* is experimental and might be subject to breaking changes in the future\.$)messagecategoryc                 S   r  r/   r/   r  r/   r/   r0   rs     r  )appendr\   r   r   ndarrayrC   r_   rc   r   r  maxr)   r   r  warningscatch_warningsfilterwarningsUserWarningr   r   r`   rd   r  )r   r  r  r  r  r  r  r  r   r   r   r  r   r
  r/   )r   r   r   r   r  r  r   r0   r    sv   =


&
 
r  c                   @   s    e Zd ZdddZdddZdS )NestedDataStructureNc                 C   s   |d ur	|| _ d S g | _ d S rg   )datar   r#  r/   r/   r0   r   $  s   zNestedDataStructure.__init__c                    sT   |d ur|n j }t|tr t| S t|ttfr' fdd|D S |gS )Nc                    s    g | ]}  |D ]}|q	qS r/   )flatten)ro   item	flattenedr   r/   r0   rs   ,  s     z/NestedDataStructure.flatten.<locals>.<listcomp>)r#  rC   r_   r%  r\   rc   r   r$  r/   r   r0   r%  '  s   
zNestedDataStructure.flattenrg   )r   r   r   r   r%  r/   r/   r/   r0   r"  #  s    
r"  rJ   c                 C   s2   zt tj|j}W | |k S  ty   Y dS w r   )r	   ospathabspathfreeOSError)needed_bytes	directory
free_bytesr/   r/   r0   has_sufficient_disk_space1  s   r0  url_pathc                 C   s   t | }d}|jdv r[|jdkr[d| v r+| ds!td|  d| dd} | |fS |jd	d }d
|v r;|d
n|df\}}|d\}}d| d| d| d} | d| }| |fS )zMConvert a link to a file on a github repo in a link to the raw github object.N)httphttpss3z
github.comblobz.pyzExternal import from github at z) should point to a file ending with '.py'rawr   z/tree/masterrM   zhttps://github.com/z	/archive/z.zip-)r   schemenetlocrE   rF   rT   r)  split)r1  parsedsub_directorygithub_path	repo_infobranch
repo_owner	repo_namer/   r/   r0   _convert_github_url9  s   
rC  importable_local_filec                 C   s$   t t|  jj}|d }t|S )Nz.lock)r  r   resolveparentr   )rD  importable_directory_path	lock_pathr/   r/   r0   lock_importable_fileL  s   rI  	file_pathc           	         s  g }t | dd}||  W d   n1 sw   Y  td|  d g }d}|D ]}td|}t|dkr?| }|rBq.tjd	|tj	d
  du r]tjd|tj	d
  du r]q. 
drt fdd|D rnq. 
dr 
d}t|\}}|d 
d||f q. 
dr|d 
d 
ddf q. 
dr 
d}|d 
d|df q.|d 
d 
ddf q.|S )a  Find whether we should import or clone additional files for a given processing script.
        And list the import.

    We allow:
    - library dependencies,
    - local dependencies and
    - external dependencies whose url is specified with a comment starting from "# From:' followed by the raw url to a file, an archive or a github repository.
        external dependencies will be downloaded (and extracted if needed in the dataset folder).
        We also add an `__init__.py` to each sub-folder of a downloaded folder so the user can import from them in the script.

    Note that only direct import in the dataset processing script will be handled
    We don't recursively explore the additional import to download further files.

    Example::

        import tensorflow
        import .c4_utils
        import .clicr.dataset-code.build_json_dataset  # From: https://raw.githubusercontent.com/clips/clicr/master/dataset-code/build_json_dataset
    zutf-8)encodingNz	Checking z for additional imports.Fz[\s\S]*?"""[\s\S]*?r   z=^import\s+(\.?)([^\s\.]+)[^#\r\n]*(?:#\s+From:\s+)?([^\r\n]*))flagszQ^from\s+(\.?)([^\s\.]+)(?:[^\s]*)\s+import\s+[^#\r\n]*(?:#\s+From:\s+)?([^\r\n]*)c                 3   s"    | ]}|d    dkV  qdS )r   r   N)group)ro   impmatchr/   r0   rt     s     zget_imports.<locals>.<genexpr>   externalr   internallibrary)openextend	readlinesloggerdebugrY   r^   r  rP  	MULTILINErM  r   rC  r  )	rJ  linesr   importsis_in_docstringlinedocstr_start_matchr1  r=  r/   rO  r0   get_importsU  sJ   





 r`  c                 C   s(   t | j| j| j| j| j}| j|_|S rg   )r   FunctionType__code____globals__r   __defaults____closure____kwdefaults__)r   rb   r/   r/   r0   copyfunc  s   rg  Yqueuer   .r   c                 C   s*   t |di |D ]	\}}| | q	|S )Nr/   )r   put)ri  r   r   r   rb   r/   r/   r0   _write_generator_to_queue  s   rk  poolc                 C   s   dd | j D S )Nc                 S   s   h | ]}|j qS r/   )pid)ro   r   r/   r/   r0   	<setcomp>  s    z _get_pool_pid.<locals>.<setcomp>)_pool)rl  r/   r/   r0   _get_pool_pid  r   rp  kwargs_iterablec             	   #   s    t }d}ttjjrtntj}| _}|  fdd|D }z;	 z	jddV  W n t	yI   t
dd |D rG rGY nY nw t |krVd}td	q(W |sad
d |D  n|sld
d |D  w w W d    d S 1 sxw   Y  d S )NFc                    s   g | ]} t |fqS r/   )apply_asyncrk  )ro   r   r   rl  ri  r/   r0   rs     s    z&iflatmap_unordered.<locals>.<listcomp>T皙?timeoutc                 s   s    | ]}|  V  qd S rg   )readyro   async_resultr/   r/   r0   rt     ru   z%iflatmap_unordered.<locals>.<genexpr>zkOne of the subprocesses has abruptly died during map operation.To debug the error, disable multiprocessing.c                 S   s   g | ]}|j d dqS )rt  ru  )r~   rx  r/   r/   r0   rs     s    )rp  rC   multiprocessingrl  Poolr   multiprocessQueuer~   r   r   emptyRuntimeError)rl  r   rq  initial_pool_pidpool_changedmanager_clsmanagerasync_resultsr/   rs  r0   iflatmap_unordered  sB   	"r  Tr   nc                 c   sX    |dk rt d| g }| D ]}|| t||kr"|V  g }q|r*|V  d S d S )Nr   zInvalid batch size )rF   r  r  )r   r  r   r&  r/   r/   r0   r     s   

r   )FF)FTFFNr   FrB   NTN)rJ   )cr   r   	functoolsr   multiprocessing.poolrz  r(  ri  rY   r   r  collections.abcr   
contextlibr   dataclassesr   r   r   pathlibr   r   shutilr	   typingr
   r   r   r   r   urllib.parser   r|  multiprocess.poolnumpyr   	tqdm.autor   r   r   parallelr   r   r  _dillr   r   r   r   	_filelockr   r   typing_extensions_typing_extensionsr   ImportError
get_loggerr   rX  	lru_cachememoizer1   r)   r  rH   rW   r_   rf   r   r   r   r   r   r   r   r   propertyr   r   boolr   r  r"  r0  rC  rI  r`  rg  rh  r}  rk  rl  r{  r   rp  r  r  r\   r   r/   r/   r/   r0   <module>   s   
," '

5		/	

 

	I(&
"&