o
    8wi+b                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlZddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZ ddlmZ ddlm Z! e
rnddl"m#Z# ddl$m%Z%m&Z& e'e(Z)dZ*e+dedd  dej,Z-e+dZ.G dd de/Z0G dd de/Z1eddG dd dZ2		d9de3de4d dee3d f d!ee3 d"ee3 d#e2fd$d%Z5G d&d' d'Z6G d(d) d)e6Z7G d*d+ d+e6Z8eddG d,d- d-Z9eddG d.d/ d/Z:d0d1 Z;d2d3 Z<d4d5 Z=d6d7 Z>G d8d  d Z?dS ):zArrow ArrowReader.    N)	dataclass)partial)TYPE_CHECKINGOptionalUnion)
thread_map   )DownloadConfig)	_split_refilenames_for_dataset_split)InMemoryTableMemoryMappedTableTableconcat_tables)logging)tqdm)DatasetInfo)Split	SplitInfoz=https://storage.googleapis.com/huggingface-nlp/cache/datasetsz
^
 (?P<split>z)
 (\[
    ((?P<from>-?[\d_]+)
     (?P<from_pct>%)?)?
    :
    ((?P<to>-?[\d_]+)
     (?P<to_pct>%)?)?
 \])?(\((?P<rounding>[^\)]*)\))?
$
z\s*\+\s*c                   @      e Zd ZdZdS )DatasetNotOnHfGcsErrorz?When you can't get the dataset from the Hf google cloud storageN__name__
__module____qualname____doc__ r   r   R/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/datasets/arrow_reader.pyr   A       r   c                   @   r   )MissingFilesOnHfGcsErrorz9When some files are missing on the Hf oogle cloud storageNr   r   r   r   r   r    G   r   r    T)frozenc                   @   s&   e Zd ZU dZeed< ee ed< dS )FileInstructionsa}  The file instructions associated with a split ReadInstruction.

    Attributes:
        num_examples: `int`, The total number of examples
        file_instructions: List[dict(filename, skip, take)], the files information.
            The filenames contains the relative path, not absolute.
            skip/take indicates which example read in the file: `ds.slice(skip, take)`
    num_examplesfile_instructionsN)r   r   r   r   int__annotations__listdictr   r   r   r   r"   M   s   
 	r"   namesplit_infosr   instructionReadInstructionfiletype_suffixprefix_pathreturnc                    s  t tstdtj stddd |D }dd |D  fdd|D }t |ts9t|}||}g }d}	|D ]}
||
j	 }||
j	 }|
j	 }|
j
du r\dn|
j
}|
jdu rf|n|
j}|du r|D ]}|| }|dkrzqo|	|7 }	||||d	 qoqDd}d}t||D ]D\}}||7 }||k r||kr||kr|| nd}||k r|| | nd
}|dkrq||||d	 |	|d
kr|| n|7 }	||7 }qqDt|	|dS )a  Returns instructions of the split dict.

    Args:
        name (`str`): Name of the dataset.
        split_infos (`list` of `[SplitInfo]`): Dataset splits information.
        instruction ([`ReadInstruction`] or `str`): Reading instruction for a dataset.
        filetype_suffix (`str`, *optional*): Suffix of dataset files, e.g. 'arrow' or 'parquet'.
        prefix_path (`str`, *optional*): Prefix of dataset files, e.g. directory name.

    Returns:
        [`FileInstructions`]
    zExpected str 'name', but got: zExpected non-empty str 'name'c                 S      i | ]}|j |jqS r   )r)   r#   .0infor   r   r   
<dictcomp>s       z*make_file_instructions.<locals>.<dictcomp>c                 S   r0   r   )r)   shard_lengthsr1   r   r   r   r4   t   r5   c              
      s*   i | ]}|j t|j  |j  d qS ))pathdataset_namesplitr-   r6   )r)   r   r1   r-   r)   name2shard_lengthsr.   r   r   r4   u   s    r   N)filenameskiptaker   )r#   r$   )
isinstancestr	TypeErrortyper   
ValueErrorr,   	from_specto_absolute	splitnamefrom_toappendzipr"   )r)   r*   r+   r-   r.   name2lenname2filenamesabsolute_instructionsr$   r#   	abs_instrsplit_length	filenamesr6   rG   rH   r<   r>   index_start	index_endshard_lengthr=   r   r:   r   make_file_instructions\   sZ   









rT   c                   @   sz   e Zd ZdZdeded fddZddefd	d
ZddefddZ	dd Z
	dddZ		ddee ded fddZdS )
BaseReaderz@
    Build a Dataset object out of Instruction instance(s).
    r7   r3   r   c                 C   s   || _ || _d| _dS )zInitializes ArrowReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        N)_path_info_filetype_suffixselfr7   r3   r   r   r   __init__   s   
zBaseReader.__init__Fr/   c                 C   s   t )=Returns a Dataset instance from given (filename, skip, take).)NotImplementedError)rZ   filename_skip_take	in_memoryr   r   r   _get_table_from_filename   s   z#BaseReader._get_table_from_filenamec                 C   s   t |dkstdd |D stdt|}|D ]}tj| j|d |d< qt	t
| j|d|tdt |dkp:d	d
}dd |D }|sU| jd	u sQ| jjd	u rUtd|pdtjg t| jjjdg}t |dkrqt|}|S |d }|S )a  Returns Dataset for given file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contain the absolute path, not relative.
                skip/take indicates which example read in the file: `ds.slice(skip, take)`
            in_memory (bool, default False): Whether to copy the data in-memory.
        r   c                 s   s    | ]}t |tV  qd S N)r?   r(   )r2   fr   r   r   	<genexpr>   s    z)BaseReader._read_files.<locals>.<genexpr>z&please provide valid file informationsr<   r_   zLoading dataset shards   N)
tqdm_classdescdisablec                 S   s   g | ]
}t |d kr|qS )r   )len)r2   tr   r   r   
<listcomp>   s    z*BaseReader._read_files.<locals>.<listcomp>zqTried to read an empty table. Please specify at least info.features to create an empty table with the right type.)schemar   )ri   allrC   copydeepcopyosr7   joinrV   r   r   r`   hf_tqdmrW   featuresr   from_batchesparl   rB   r   )rZ   filesr_   rb   	pa_tablespa_tabler   r   r   _read_files   s,   	
 zBaseReader._read_filesc                 C   s    t |||| j| jd}|j}|S )z?Return list of dict {'filename': str, 'skip': int, 'take': int})r-   r.   )rT   rX   rV   r$   )rZ   r)   r+   r*   r$   rv   r   r   r   get_file_instructions   s
   z BaseReader.get_file_instructionsc                 C   s6   |  |||}|sd| d}t|| j|||dS )a  Returns Dataset instance(s).

        Args:
            name (str): name of the dataset.
            instructions (ReadInstruction): instructions to read.
                Instruction can be string and will then be passed to the Instruction
                constructor as it.
            split_infos (list of SplitInfo proto): the available splits for dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
             kwargs to build a single Dataset instance.
        zInstruction "z" corresponds to no data!)rv   original_instructionsr_   )rz   rC   
read_files)rZ   r)   instructionsr*   r_   rv   msgr   r   r   read   s
   zBaseReader.readNrv   r{   )Nr,   r   c                 C   sF   | j ||d}|durddlm} |t|}nd}|| j|d}|S )aJ  Returns single Dataset instance for the set of file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contains the relative path, not absolute.
                skip/take indicates which example read in the file: `ds.skip().take()`
            original_instructions: store the original instructions used to build the dataset split in the dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
            kwargs to build a Dataset instance.
        rd   Nr   )r   )arrow_tabler3   r9   )ry   splitsr   r@   rW   )rZ   rv   r{   r_   rx   r   r9   dataset_kwargsr   r   r   r|      s   zBaseReader.read_filesF)NF)r   r   r   r   r@   r   r[   r   r`   ry   rz   r   r'   r(   r   r|   r   r   r   r   rU      s     
rU   c                       sR   e Zd ZdZdeded f fddZddefd	d
Ze	ddefddZ
  ZS )ArrowReaderz
    Build a Dataset object out of Instruction instance(s).
    This Reader uses either memory mapping or file descriptors (in-memory) on arrow files.
    r7   r3   r   c                       t  || d| _dS )zInitializes ArrowReader.

        Args:
            path (str): path where Arrow files are stored.
            info (DatasetInfo): info about the dataset.
        arrowNsuperr[   rX   rY   	__class__r   r   r[   #     
zArrowReader.__init__Fr/   c                 C   s   |d d|v r|d ndd|v r|d nd}}}t j||d}|dkr+t|| }|durC|durC|dkr=|t|ksC|||}|S )r\   r<   r=   Nr>   rd   r   r   )r   
read_tableri   slice)rZ   r^   r_   r<   r=   r>   tabler   r   r   r`   -  s   
$z$ArrowReader._get_table_from_filenamec                 C   s   |rt nt}|| S )z
        Read table from file.

        Args:
            filename (str): File name of the table.
            in_memory (bool, default=False): Whether to copy the data in-memory.

        Returns:
            pyarrow.Table
        )r   r   	from_file)r<   r_   	table_clsr   r   r   r   <  s   
zArrowReader.read_tabler   )r   r   r   r   r@   r   r[   r   r`   staticmethodr   __classcell__r   r   r   r   r     s    
r   c                       s6   e Zd ZdZdeded f fddZdd Z  ZS )	ParquetReaderzv
    Build a Dataset object out of Instruction instance(s).
    This Reader uses memory mapping on parquet files.
    r7   r3   r   c                    r   )zInitializes ParquetReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        parquetNr   rY   r   r   r   r[   R  r   zParquetReader.__init__c                 K   sv   |d d|v r|d ndd|v r|d nd}}}t j|dd}|dur9|dur9|dkr3|t|ks9|||}|S )r\   r<   r=   Nr>   T)
memory_mapr   )pqr   ri   r   )rZ   r^   kwargsr<   r=   r>   rx   r   r   r   r`   \  s   
$z&ParquetReader._get_table_from_filename)	r   r   r   r   r@   r   r[   r`   r   r   r   r   r   r   L  s    
r   c                   @   s*   e Zd ZU dZeed< eed< eed< dS )_AbsoluteInstructionz?A machine friendly slice: defined absolute positive boundaries.rF   rG   rH   N)r   r   r   r   r@   r&   r%   r   r   r   r   r   k  s
   
 r   c                   @   sb   e Zd ZU dZeed< dZee ed< dZ	ee ed< dZ
ee ed< dZee ed< dd	 ZdS )
_RelativeInstructionzHRepresents a single parsed slicing instruction, can use % and negatives.rF   NrG   rH   unitroundingc                 C   s   | j d ur| j dvrtd| jd ur| jdvrtd| j dkr*| jd ur*td| j dkr?| jd ur?t| jdkr?td| j dkrT| jd urTt| jdkrTtd| jd u r`| j dkr`d	n| j| jd
< d S )N)%abszunit must be either % or abs)closestpct1_dropremainderz5rounding must be either closest or pct1_dropremainderr   zAIt is forbidden to specify rounding if not using percent slicing.d   z2Percent slice boundaries must be > -100 and < 100.r   r   )r   rC   r   rG   r   rH   __dict__rZ   r   r   r   __post_init__~  s   ""(z"_RelativeInstruction.__post_init__)r   r   r   r   r@   r&   rG   r   r%   rH   r   r   r   r   r   r   r   r   t  s   
 r   c                 C   s   t | }|std|  |ds|drdnd}t|d|d|dr1t|dnd	|d
rAt|d
|dS d	|dS )z)Returns ReadInstruction for given string.z!Unrecognized instruction format: from_pctto_pctr   r   r9   r   fromNrH   )
split_namer   rG   rH   r   )_SUB_SPEC_REmatchrC   groupr,   r%   )specresr   r   r   r   _str_to_read_instruction  s   
r   c                 C   s&   |dk r
d}t || t|d  S )Nr   zUsing "pct1_dropremainder" rounding on a split with less than 100 elements is forbidden: it always results in an empty dataset.      Y@)rC   mathtrunc)boundaryr#   r~   r   r   r   _pct_to_abs_pct1  s
   r   c                 C   s   t t| | d S )Nr   )r%   round)r   r#   r   r   r   _pct_to_abs_closest  s   r   c                 C   s   | j dkrtnt}| j}||vrtd| dt| d|| }| j}| j}| jdkrC|du r2dn|||}|du r=|n|||}n|du rIdn|}|du rQ|n|}|dk r^t	|| d}|dk rit	|| d}t
||}t
||}t|||S )zReturns _AbsoluteInstruction instance for given RelativeInstruction.

    Args:
        rel_instr: RelativeInstruction instance.
        name2len: dict {split_name: num_examples}.
    r   zUnknown split "z". Should be one of .r   Nr   )r   r   r   rF   rC   r'   rG   rH   r   maxminr   )	rel_instrrK   
pct_to_absr9   r#   rG   rH   r   r   r   _rel_to_abs_instr  s&   


r   c                   @   sb   e Zd ZdZdd Zedd ZdddZed	d
 Zdd Z	dd Z
dd Zdd Zdd ZdS )r,   a  Reading instruction for a dataset.

    Examples::

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec('test[:33%]'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction('test', to=33, unit='%'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%'))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]+train[1:-1]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%]+train[1:-1]'))
      ds = datasets.load_dataset('mnist', split=(
          datasets.ReadInstruction('test', to=33, unit='%') +
          datasets.ReadInstruction('train', from_=1, to=-1, unit='abs')))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%](pct1_dropremainder)')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%](pct1_dropremainder)'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%', rounding="pct1_dropremainder"))

      # 10-fold validation:
      tests = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', from_=k, to=k+10, unit='%')
          for k in range(0, 100, 10)])
      trains = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%')
          for k in range(0, 100, 10)])

    c                 C   s
   || _ d S ra   _relative_instructions)rZ   relative_instructionsr   r   r   _init  s   
zReadInstruction._initc                 C   s   |  | }|| |S )zCReturns ReadInstruction obj initialized with relative_instructions.)__new__r   )clsr   resultr   r   r   ,_read_instruction_from_relative_instructions  s   

z<ReadInstruction._read_instruction_from_relative_instructionsNc                 C   s   |  t|||||g dS )a  Initialize ReadInstruction.

        Args:
            split_name (str): name of the split to read. Eg: 'train'.
            rounding (str, optional): The rounding behaviour to use when percent slicing is
                used. Ignored when slicing with absolute indices.
                Possible values:
                 - 'closest' (default): The specified percentages are rounded to the
                     closest value. Use this if you want specified percents to be as
                     much exact as possible.
                 - 'pct1_dropremainder': the specified percentages are treated as
                     multiple of 1%. Use this option if you want consistency. Eg:
                         len(5%) == 5 * len(1%).
                     Using this option, one might not be able to use the full set of
                     examples, if the number of those is not a multiple of 100.
            from_ (int):
            to (int): alternative way of specifying slicing boundaries. If any of
                {from_, to, unit} argument is used, slicing cannot be specified as
                string.
            unit (str): optional, one of:
                '%': to set the slicing unit as percents of the split size.
                'abs': to set the slicing unit as absolute numbers.
        N)r   r   )rZ   r   r   rG   rH   r   r   r   r   r[     s   zReadInstruction.__init__c                 C   sL   t |}t|}|std| t|d }tdd |dd D |S )aM  Creates a `ReadInstruction` instance out of a string spec.

        Args:
            spec (`str`):
                Split(s) + optional slice(s) to read + optional rounding
                if percents are used as the slicing unit. A slice can be specified,
                using absolute numbers (`int`) or percentages (`int`).

        Examples:

            ```
            test: test split.
            test + validation: test split + validation split.
            test[10:]: test split, minus its first 10 records.
            test[:10%]: first 10% records of test split.
            test[:20%](pct1_dropremainder): first 10% records, rounded with the pct1_dropremainder rounding.
            test[:-5%]+train[40%:60%]: first 95% of test + middle 20% of train.
            ```

        Returns:
            ReadInstruction instance.
        z&No instructions could be built out of r   c                 s   s    | ]}t |V  qd S ra   )r   )r2   subr   r   r   rc   5  s    z,ReadInstruction.from_spec.<locals>.<genexpr>r   N)r@   _ADDITION_SEP_REr9   rC   r   sum)r   r   subsr+   r   r   r   rD     s   
zReadInstruction.from_specc           
      C   s   g }| j D ]c}|j}|jd us|jd urc|j}|j}|j}|j}|dkr&|nd}|d ur2t|| nd}|d ur>t|| nd}d| d| d}|dkr[|d ur[|dkr[d| dnd}	|||	 7 }|| qd	|S )
Nr    [:]r   ()+)	r   rF   rG   rH   r   r   r@   rI   rq   )
rZ   rel_instr_specsr   rel_instr_specrG   rH   r   r   	slice_strrounding_strr   r   r   to_spec7  s"   
&
zReadInstruction.to_specc                 C   sj   t |tsd}t|| j}|j}|d jdkr.|d jdkr.| jd j|d jkr.td| || S )zEReturns a new ReadInstruction obj, result of appending other to self.zAReadInstruction can only be added to another ReadInstruction obj.r   r   zPIt is forbidden to sum ReadInstruction instances with different rounding values.)r?   r,   rA   r   r   r   rC   r   )rZ   otherr~   self_ris	other_risr   r   r   __add__K  s   
zReadInstruction.__add__c                 C   s   |   S ra   )r   r   r   r   r   __str__Z  s   zReadInstruction.__str__c                 C   s   d| j  dS )NzReadInstruction(r   r   r   r   r   r   __repr__]  s   zReadInstruction.__repr__c                    s    fdd| j D S )aZ  Translate instruction into a list of absolute instructions.

        Those absolute instructions are then to be added together.

        Args:
            name2len (`dict`):
                Associating split names to number of examples.

        Returns:
            list of _AbsoluteInstruction instances (corresponds to the + in spec).
        c                    s   g | ]}t | qS r   )r   )r2   r   rK   r   r   rk   l  r5   z/ReadInstruction.to_absolute.<locals>.<listcomp>r   )rZ   rK   r   r   r   rE   `  s   zReadInstruction.to_absolute)NNNN)r   r   r   r   r   classmethodr   r[   rD   r   r   r   r   rE   r   r   r   r   r,     s    &


)NN)@r   rn   r   rp   redataclassesr   	functoolsr   typingr   r   r   pyarrowru   pyarrow.parquetr   r   tqdm.contrib.concurrentr   download.download_configr	   namingr
   r   r   r   r   r   r   utilsr   r   rr   r3   r   r   r   r   
get_loggerr   loggerHF_GCP_BASE_URLcompileXr   r   ConnectionErrorr   r    r"   r@   r'   rT   rU   r   r   r   r   r   r   r   r   r,   r   r   r   r   <module>   sv   




Kv/