o
    8wi2                     @   sr  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
mZ d dlmZ d dlmZmZ ddlmZ ddlmZ dd	lmZ eeZG d
d dZG dd de
ZG dd dee
ZG dd deZG dd deZG dd deZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G d d! d!eZ$G d"d# d#Z%dS )$    N)ABCabstractmethod)Path)OptionalUnion   )config   )FileLock)
get_loggerc                   @   s`   e Zd Zddee fddZdedefddZd	ed
edefddZdded
edefddZ	dS )ExtractManagerN	cache_dirc                 C   s&   |r
t j|tjntj| _t| _d S N)	ospathjoinr   EXTRACTED_DATASETS_DIREXTRACTED_DATASETS_PATHextract_dir	Extractor	extractor)selfr    r   S/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/datasets/utils/extract.py__init__   s   
zExtractManager.__init__r   returnc                 C   s,   ddl m} tj|}tj| j||S )Nr	   )hash_url_to_filename)
file_utilsr   r   r   abspathr   r   )r   r   r   abs_pathr   r   r   _get_output_path   s   zExtractManager._get_output_pathoutput_pathforce_extractc                 C   s*   |pt j| ot j|ot | S r   )r   r   isfileisdirlistdir)r   r!   r"   r   r   r   _do_extract%   s   $zExtractManager._do_extractF
input_pathc                 C   s>   | j |}|s
|S | |}| ||r| j ||| |S r   )r   infer_extractor_formatr    r&   extract)r   r'   r"   extractor_formatr!   r   r   r   r)   *   s   
zExtractManager.extractr   F)
__name__
__module____qualname__r   strr   r    boolr&   r)   r   r   r   r   r      s
    r   c                   @   s\   e Zd Zeedeeef defddZ	e
edeeef deeef ddfdd	ZdS )
BaseExtractorr   r   c                 K      d S r   r   clsr   kwargsr   r   r   is_extractable5      zBaseExtractor.is_extractabler'   r!   Nc                 C   r2   r   r   )r'   r!   r   r   r   r)   9   r7   zBaseExtractor.extract)r,   r-   r.   classmethodr   r   r   r/   r0   r6   staticmethodr)   r   r   r   r   r1   4   s    .r1   c                   @   s`   e Zd ZU g Zee ed< edee	e
f defddZeddee	e
f dedefd	d
ZdS )MagicNumberBaseExtractormagic_numbersr   magic_number_lengthc                 C   s8   t | d}||W  d    S 1 sw   Y  d S )Nrb)openread)r   r<   fr   r   r   read_magic_numberA   s   $z*MagicNumberBaseExtractor.read_magic_number    magic_numberr   c                    sV    st dd | jD }z| || W n
 ty   Y dS w t fdd| jD S )Nc                 s   s    | ]}t |V  qd S r   )len.0cls_magic_numberr   r   r   	<genexpr>I   s    z:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>Fc                 3   s    | ]}  |V  qd S r   )
startswithrE   rC   r   r   rH   N   s    )maxr;   rA   OSErrorany)r4   r   rC   r<   r   rJ   r   r6   F   s   z'MagicNumberBaseExtractor.is_extractableNrB   )r,   r-   r.   r;   listbytes__annotations__r9   r   r   r/   intrA   r8   r0   r6   r   r   r   r   r:   >   s   
 &r:   c                   @   s`   e Zd Zedeeef defddZe	dd Z
e	deeef deeef dd	fd
dZd	S )TarExtractorr   r   c                 K   s
   t |S r   )tarfile
is_tarfiler3   r   r   r   r6   R   s   
zTarExtractor.is_extractablec                 #   s    dt dt fdddt dt dtffdd dt dtf fdd	}|}| D ]D} |j|r<td
|j d q)| rS|||rStd
|j d|j  q)| rj|||rjtd
|j d|j  q)|V  q)dS )a  
        Fix for CVE-2007-4559
        Desc:
            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile
            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)
            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.
        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559
        From: https://stackoverflow.com/a/10077309
        r   r   c                 S   s   t jt j| S r   )r   r   realpathr   )r   r   r   r   resolvedb   s   z*TarExtractor.safemembers.<locals>.resolvedbasec                    s    t j|| | S r   )r   r   r   rI   )r   rX   )rW   r   r   badpathe   s   z)TarExtractor.safemembers.<locals>.badpathc                    s*   t j|t j| j} | j|dS )N)rX   )r   r   r   dirnamenamelinkname)inforX   tiprY   rW   r   r   badlinki   s   z)TarExtractor.safemembers.<locals>.badlinkzExtraction of z is blocked (illegal path)z is blocked: Symlink to z is blocked: Hard link to N)r/   r0   r[   loggererrorissymr\   islnk)membersr!   r`   rX   finfor   r_   r   safemembersV   s   zTarExtractor.safemembersr'   r!   Nc                 C   s:   t j|dd t| }|j|t||d |  d S )NTexist_ok)re   )r   makedirsrT   r>   
extractallrS   rg   close)r'   r!   tar_filer   r   r   r)   z   s   
zTarExtractor.extract)r,   r-   r.   r8   r   r   r/   r0   r6   r9   rg   r)   r   r   r   r   rS   Q   s    
#,rS   c                   @   <   e Zd ZdgZedeeef deeef ddfddZdS )GzipExtractors   r'   r!   r   Nc              	   C   x   t | d,}t|d}t|| W d    n1 sw   Y  W d    d S W d    d S 1 s5w   Y  d S Nr=   wb)gzipr>   shutilcopyfileobj)r'   r!   	gzip_fileextracted_filer   r   r   r)         "zGzipExtractor.extract	r,   r-   r.   r;   r9   r   r   r/   r)   r   r   r   r   ro          ,ro   c                       sj   e Zd Zg dZeddeeef dede	f fddZ
edeeef d	eeef dd
fddZ  ZS )ZipExtractor)s   PKs   PKs   PKrB   r   rC   r   c                    s  t  j||dr
dS zddlm}m}m}m}m}m}m	}	m
}
m}m} t|d}|	|}|r|| dkrK|| dkrK|| dkrK	 W d    W dS || || kr|||  | || kr|| |
kr||
}t||
krt||}|| |kr	 W d    W dS W d    W dS W d    W dS W d    W dS W d    W dS W d    W dS W d    W dS 1 sw   Y  W dS  ty   Y dS w )NrJ   Tr   )
_CD_SIGNATURE_ECD_DISK_NUMBER_ECD_DISK_START_ECD_ENTRIES_TOTAL_ECD_OFFSET	_ECD_SIZE_EndRecDatasizeCentralDirstringCentralDirstructCentralDirr=   F)superr6   zipfiler|   r}   r~   r   r   r   r   r   r   r   r>   seektellr?   rD   structunpack	Exception)r4   r   rC   r|   r}   r~   r   r   r   r   r   r   r   fpendrecdatacentdir	__class__r   r   r6      sT   0$






zZipExtractor.is_extractabler'   r!   Nc                 C   sR   t j|dd t| d}|| |  W d    d S 1 s"w   Y  d S )NTrh   r)r   rj   r   ZipFilerk   rl   )r'   r!   zip_filer   r   r   r)      s
   

"zZipExtractor.extractrN   )r,   r-   r.   r;   r8   r   r   r/   rP   r0   r6   r9   r)   __classcell__r   r   r   r   r{      s    &$0r{   c                   @   rn   )XzExtractors   7zXZ r'   r!   r   Nc              	   C   sv   t | ,}t|d}t|| W d    n1 sw   Y  W d    d S W d    d S 1 s4w   Y  d S )Nrr   )lzmar>   rt   ru   r'   r!   compressed_filerw   r   r   r   r)      s   "zXzExtractor.extractry   r   r   r   r   r      rz   r   c                   @   s>   e Zd ZddgZedeeef deeef ddfddZdS )	RarExtractors   Rar! s   Rar! r'   r!   r   Nc                 C   sD   t jstddd l}tj|dd || }|| |  d S )NzPlease pip install rarfiler   Trh   )	r   RARFILE_AVAILABLEImportErrorrarfiler   rj   RarFilerk   rl   )r'   r!   r   rfr   r   r   r)      s   

zRarExtractor.extractry   r   r   r   r   r      s    ,r   c                   @   rn   )ZstdExtractors   (/r'   r!   r   Nc              	   C   s   t jstddd l}| }t| d,}t|d}||| W d    n1 s+w   Y  W d    d S W d    d S 1 sCw   Y  d S )NzPlease pip install zstandardr   r=   rr   )r   ZSTANDARD_AVAILABLEr   	zstandardZstdDecompressorr>   copy_stream)r'   r!   zstddctxifhofhr   r   r   r)      s   PzZstdExtractor.extractry   r   r   r   r   r      rz   r   c                   @   rn   )Bzip2Extractors   BZhr'   r!   r   Nc              	   C   rp   rq   )bz2r>   rt   ru   r   r   r   r   r)      rx   zBzip2Extractor.extractry   r   r   r   r   r      rz   r   c                   @   rn   )SevenZipExtractors   7z'r'   r!   r   Nc                 C   s`   t jstddd l}tj|dd || d}|| W d    d S 1 s)w   Y  d S )NzPlease pip install py7zrr   Trh   r   )r   PY7ZR_AVAILABLEr   py7zrr   rj   SevenZipFilerk   )r'   r!   r   archiver   r   r   r)      s   "zSevenZipExtractor.extractry   r   r   r   r   r      rz   r   c                   @   rn   )Lz4Extractors   "Mr'   r!   r   Nc              	   C   s   t jstddd l}|j| d,}t|d}t|| W d    n1 s)w   Y  W d    d S W d    d S 1 sAw   Y  d S )NzPlease pip install lz4r   r=   rr   )r   LZ4_AVAILABLEr   	lz4.frameframer>   rt   ru   )r'   r!   lz4r   rw   r   r   r   r)     s   "zLz4Extractor.extractry   r   r   r   r   r      rz   r   c                
   @   s   e Zd ZU eeeeeee	e
ed	Zeeee f ed< edd Zedeeef defddZeddeeef d
edefddZedeeef dee fddZedeeef deeef deddfddZdS )r   )	tarrs   zipxzrarr   r   7zr   
extractorsc                 C   s   t dd | j D S )Nc                 s   s.    | ]}t |tr|jD ]}t|V  qqd S r   )
issubclassr:   r;   rD   )rF   r   extractor_magic_numberr   r   r   rH     s    z9Extractor._get_magic_number_max_length.<locals>.<genexpr>)rK   r   values)r4   r   r   r   _get_magic_number_max_length  s   z&Extractor._get_magic_number_max_lengthr   r<   c                 C   s&   zt j| |dW S  ty   Y dS w )N)r<   rB   )r:   rA   rL   )r   r<   r   r   r   _read_magic_number$  s
   zExtractor._read_magic_numberFreturn_extractorr   c                 C   s>   t jdtd | |}|r|sdS d| j| fS |sdS dS )Nz{Method 'is_extractable' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'infer_extractor_format' instead.)categoryTF)FN)warningswarnFutureWarningr(   r   )r4   r   r   r*   r   r   r   r6   +  s   
zExtractor.is_extractablec                 C   sB   |   }| ||}| j D ]\}}|j||dr|  S qd S )NrJ   )r   r   r   itemsr6   )r4   r   magic_number_max_lengthrC   r*   r   r   r   r   r(   7  s   z Extractor.infer_extractor_formatr'   r!   r*   Nc                 C   sx   t jt j|dd tt|d}t| tj	|dd | j
| }|||W  d    S 1 s5w   Y  d S )NTrh   z.lock)ignore_errors)r   rj   r   rZ   r/   r   with_suffixr
   rt   rmtreer   r)   )r4   r'   r!   r*   	lock_pathr   r   r   r   r)   ?  s   


$zExtractor.extractr+   )r,   r-   r.   rS   ro   r{   r   r   r   r   r   r   r   dictr/   typer1   rQ   r8   r   r9   r   r   rR   r   r0   r6   r   r(   r)   r   r   r   r   r     s:   
 
" 

r   )&r   rs   r   r   rt   r   rT   r   r   abcr   r   pathlibr   typingr   r    r   	_filelockr
   loggingr   r,   ra   r   r1   r:   rS   ro   r{   r   r   r   r   r   r   r   r   r   r   r   <module>   s:    
1
4

