o
    8wi8                     @   sl   d Z ddlmZ ddlmZ deeeeef defddZ	G dd	 d	e
ZG d
d de
ZG dd dZdS )a  
Hashing function for dataset keys using `hashlib.md5`

Requirements for the hash function:

- Provides a uniformly distributed hash from random space
- Adequately fast speed
- Working with multiple input types (in this case, `str`, `int` or `bytes`)
- Should be platform independent (generates same hash on different OS and systems)

The hashing function provides a unique 128-bit integer hash of the key provided.

The split name is being used here as the hash salt to avoid having same hashes
in different splits due to same keys
    )Union)insecure_hashlib	hash_datareturnc                 C   sP   t | ttfr	| S t | tr| dd} nt | trt| } nt| | dS )z|
    Returns the input hash_data in its bytes form

    Args:
    hash_data: the hash salt/key to be converted to bytes
    \/zutf-8)
isinstancebytes	bytearraystrreplaceintInvalidKeyErrorencode)r    r   M/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/datasets/keyhash.py	_as_bytes&   s   



r   c                       s    e Zd ZdZ fddZ  ZS )r   z6Raises an error when given key is of invalid datatype.c                    sD   d| _ d| dt| | _d| _t | j  | j | j  d S )Nz7
FAILURE TO GENERATE DATASET: Invalid key type detectedz
Found Key z	 of type z-
Keys should be either str, int or bytes type)prefixtypeerr_msgsuffixsuper__init__)selfr   	__class__r   r   r   @   s   "zInvalidKeyError.__init____name__
__module____qualname____doc__r   __classcell__r   r   r   r   r   =   s    r   c                       s"   e Zd ZdZd fdd	Z  ZS )DuplicatedKeysErrorz(Raise an error when duplicate key found. c                    s   || _ || _|| _d| _t|dkrdd| d| | _ndd|d d  dt|d  d| | _|r<d| nd	| _t 	| j | j | j  d S )
Nz3Found multiple examples generated with the same key   z
The examples at index z, z have the key z... (z more) have the key 
r#   )
keyduplicate_key_indicesfix_msgr   lenjoinr   r   r   r   )r   r&   r'   r(   r   r   r   r   J   s   ."zDuplicatedKeysError.__init__)r#   r   r   r   r   r   r"   G   s    r"   c                   @   s:   e Zd ZdZdefddZdeeeef defddZ	d	S )
	KeyHasherz,KeyHasher class for providing hash using md5	hash_saltc                 C   s   t t|| _d S )N)r   md5r   
_split_md5)r   r,   r   r   r   r   Z   s   zKeyHasher.__init__r&   r   c                 C   s*   | j  }t|}|| t| dS )zReturns 128-bits unique hash of input key

        Args:
        key: the input key to be hashed (should be str, int or bytes)

        Returns: 128-bit int hash key   )r.   copyr   updater   	hexdigest)r   r&   r-   byte_keyr   r   r   hash]   s   

zKeyHasher.hashN)
r   r   r   r    r   r   r   r   r	   r4   r   r   r   r   r+   W   s     r+   N)r    typingr   huggingface_hub.utilsr   r   r   r	   r
   r   	Exceptionr   r"   r+   r   r   r   r   <module>   s   
