o
    2wÖiCK  ã                   @   sX  d Z ddlmZ ddlmZmZmZmZmZm	Z	m
Z
 ddlmZmZ ddlmZmZ ddlmZmZ dedefd	d
„Zdedefdd„Zdedefdd„Zdede	eeef eeef f fdd„Z	ddedee
ee ee f  de	eeeeef   eeef f fdd„Zdeeeef  deeef fdd„Zdefdd„ZG dd„ dƒZdS )uQ  
â•”â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•—
â•‘ Export/Import CutSet to HuggingFace Dataset â•‘
â•šâ•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•
é    )Úmd5)ÚAnyÚDictÚListÚOptionalÚSetÚTupleÚUnion)Ú	RecordingÚSupervisionSegment)ÚCutSetÚMonoCut)ÚPathlikeÚis_module_availableÚcutsetÚreturnc                 C   ó   t dd„ | D ƒƒS )Nc                 s   s    | ]}t |tƒV  qd S ©N)Ú
isinstancer   ©Ú.0Úcut© r   úF/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/hf.pyÚ	<genexpr>   s   € z*contains_only_mono_cuts.<locals>.<genexpr>©Úall©r   r   r   r   Úcontains_only_mono_cuts   ó   r   c                 C   r   )Nc                 s   s    | ]
}t |jƒd kV  qdS ©é   N)ÚlenÚsupervisionsr   r   r   r   r      s   € z.has_one_supervision_per_cut.<locals>.<genexpr>r   r   r   r   r   Úhas_one_supervision_per_cut   r   r$   c                 C   r   )Nc                 s   s     | ]}t |jjƒd kV  qdS r    )r"   Ú	recordingÚsourcesr   r   r   r   r      s   € z'has_one_audio_source.<locals>.<genexpr>r   r   r   r   r   Úhas_one_audio_source   r   r'   c                 C   sn   ddl m}m} dd„ | D ƒdd„ | D ƒdd„ | D ƒdd„ | D ƒdœ}|d	ƒ|d
d|dƒ|dƒdœ}||fS )aP  
    Converts the cut information into a dictionary compatible with HuggingFace datasets format.

    :param cutset: A CutSet object.
    :return: A tuple where the first element is a dictionary
        representing the cut attributes and the second element is a dictionary describing the
        format of the HuggingFace dataset.
    r   )ÚAudioÚValuec                 S   ó   g | ]}|j ‘qS r   )Úidr   r   r   r   Ú
<listcomp>&   ó    z+convert_cuts_info_to_hf.<locals>.<listcomp>c                 S   s   g | ]	}|j jd  j‘qS ©r   )r%   r&   Úsourcer   r   r   r   r,   '   ó    c                 S   r*   r   )Údurationr   r   r   r   r,   (   r-   c                 S   s   g | ]}t |jjƒ‘qS r   )r"   r%   Úchannel_idsr   r   r   r   r,   )   s    )r+   Úaudior1   Únum_channelsÚstringF)ÚmonoÚfloatÚuint16)Údatasetsr(   r)   )r   r(   r)   Úcut_infoÚcut_info_descriptionr   r   r   Úconvert_cuts_info_to_hf   s   	üür<   NÚexclude_attributesc                 C   s¨  ddl m}m}m} tdd„ | D ƒƒ}tdd„ | D ƒƒ}dd„ | D ƒ}tdd„ |D ƒƒ}g }| D ]}	g }
|	jD ]}d	|ji}|d
u sFd|vrK|j|d< |d
u sSd|vrX|j	|d< |d
u s`d|vryt
|jtƒrrd tt|jƒ¡|d< nt|jƒ|d< |rŠ|d
u sƒd|vrŠt|jƒ|d< |r›|d
u s”d|vr›t|jƒ|d< |rÂ|d
u s¥d|vrÂi }|D ]}ttdd„ |j| ƒƒ||d < q©i |¥|¥}|
 |¡ q7| |
¡ q0d	|dƒi}|d
u sÜd|vrâ|dƒ|d< |d
u sêd|vrð|dƒ|d< |d
u sød|vrþ|dƒ|d< |r|d
u sd|vr|dƒ|d< |r$|d
u sd|vr$|dƒ|d< |rP|d
u s1d|vrP|dƒ|dƒ|dƒdœ}|D ]}||di |¤Žƒ||d < q?||fS )a­  
    Converts cut supervisions into a dictionary compatible with HuggingFace datasets format.

    :param cutset: A CutSet object.
    :param exclude_attributes: A list|set of attributes to exclude from the supervisions dicts.
    :return: A tuple where the first element is a dictionary
        representing the cut attributes and the second element is a dictionary describing the
        format of the HuggingFace dataset.
    r   )ÚFeaturesÚSequencer)   c                 s   ó0    | ]}t |jd  dƒo|jd  jduV  qdS )r   ÚspeakerN)Úhasattrr#   rA   r   r   r   r   r   D   ó   € ý
ýz2convert_supervisions_info_to_hf.<locals>.<genexpr>c                 s   r@   )r   ÚlanguageN)rB   r#   rD   r   r   r   r   r   K   rC   c                 S   s,   g | ]}|j D ]}|jd ur|j ¡ ‘qqS r   )r#   Ú	alignmentÚkeys)r   ÚcÚsr   r   r   r,   R   s    ý
ýz3convert_supervisions_info_to_hf.<locals>.<listcomp>c                 S   s   g | ]	}|D ]}|‘qqS r   r   )r   ÚsublistÚitemr   r   r   r,   X   r0   ÚtextNÚstartÚendÚchannelú,rA   rD   Ú
alignmentsc                 S   s   | j | j| jdœS )N©ÚsymbolrL   rM   rQ   )rJ   r   r   r   Ú<lambda>   s   ýz1convert_supervisions_info_to_hf.<locals>.<lambda>Ú
_alignmentr5   r7   rQ   r   )r9   r>   r?   r)   ÚanyÚsetr#   rK   rL   rM   r   rN   ÚlistÚjoinÚmapÚstrrA   rD   rE   Úappend)r   r=   r>   r?   r)   Úhas_speakerÚhas_languageÚalignment_typesÚ	sup_dictsrG   Úcut_sup_dictsrH   Úsup_dictrP   Úalignment_typeÚsup_dicts_infoÚalignment_infor   r   r   Úconvert_supervisions_info_to_hf4   sˆ   ûûþ
ÿ

úÿýÿre   Úlodc                    s   ‡ fdd„ˆ d   ¡ D ƒS )z2
    Converts List of Dicts to Dict of Lists.
    c                    s    i | ]‰ ˆ ‡ fd d„ˆD ƒ“qS )c                    s   g | ]}|ˆ  ‘qS r   r   )r   Úd©Úkr   r   r,   ¶   ó    z)lod_to_dol.<locals>.<dictcomp>.<listcomp>r   )r   ©rf   rh   r   Ú
<dictcomp>¶   s     zlod_to_dol.<locals>.<dictcomp>r   )rF   rk   r   rk   r   Ú
lod_to_dol²   s   rm   c           
      C   sÔ   t | ƒsJ dƒ‚tdƒstdƒ‚ddlm}m}m} t| ƒ\}}t| t	| ƒr*h d£ndd\}}t	| ƒrKi |¥t
d	d
„ |D ƒƒ¥}|di |¤|¤Ž}	ni |¥d|i¥}|dd||di |¤Žƒi|¤Ž}	|j||	dS )u!  
    Converts a CutSet to a HuggingFace Dataset. Currently, only MonoCut with one recording source is supported.
    Other cut types will be supported in the future.

    Currently, two formats are supported:
        1. If each cut has one supervision (e.g. LibriSpeech), each cut is represented as a single row (entry)
           in the HuggingFace dataset with all the supervision information stored along the cut information.
           The final HuggingFace dataset format is:
               â•”â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¦â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•—
               â•‘      Feature      â•‘            Type               â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘        id         â•‘ Value(dtype='string')         â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘      audio        â•‘ Audio()                       â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘     duration      â•‘ Value(dtype='float32')        â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘   num_channels    â•‘ Value(dtype='uint16')         â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘       text        â•‘ Value(dtype='string')         â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘     speaker       â•‘ Value(dtype='string')         â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘     language      â•‘ Value(dtype='string')         â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘   {x}_alignment   â•‘ Sequence(Alignment)           â•‘
               â•šâ•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•©â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•
           where x stands for the alignment type (commonly used: "word", "phoneme").

           Alignment is represented as:
               â•”â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¦â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•—
               â•‘      Feature      â•‘            Type               â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘      symbol       â•‘ Value(dtype='string')         â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘       start       â•‘ Value(dtype='float32')        â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘        end        â•‘ Value(dtype='float32')        â•‘
               â•šâ•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•©â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•


        2. If each cut has multiple supervisions (e.g. AMI), each cut is represented as a single row (entry)
           while all the supervisions are stored in a separate list of dictionaries under the 'segments' key.
           The final HuggingFace dataset format is:
               â•”â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¦â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•—
               â•‘   Feature    â•‘                 Type               â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘      id      â•‘ Value(dtype='string')              â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘    audio     â•‘ Audio()                            â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘   duration   â•‘ Value(dtype='float32')             â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘ num_channels â•‘ Value(dtype='uint16')              â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘   segments   â•‘ Sequence(Segment)                  â•‘
               â•šâ•â•â•â•â•â•â•â•â•â•â•â•â•â•â•©â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•
           where one Segment is represented as:
               â•”â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¦â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•—
               â•‘      Feature      â•‘            Type               â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘        text       â•‘ Value(dtype='string')         â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘       start       â•‘ Value(dtype='float32')        â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘        end        â•‘ Value(dtype='float32')        â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘      channel      â•‘ Value(dtype='string')         â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘      speaker      â•‘ Value(dtype='string')         â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘      language     â•‘ Value(dtype='string')         â•‘
               â• â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•¬â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•£
               â•‘   {x}_alignment   â•‘ Sequence(Alignment)           â•‘
               â•šâ•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•©â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•

    :param cutset: A CutSet object.
    :return: A HuggingFace Dataset.
    zWOnly CutSets with one audio source per cut are supported. MultiSource cuts coming soon.r9   z=Please install the 'datasets' package (pip install datasets).r   )ÚDatasetr>   r?   >   rM   rL   rN   N)r=   c                 S   s   g | ]}|d  ‘qS r.   r   )r   Úxr   r   r   r,   !  rj   z%export_cuts_to_hf.<locals>.<listcomp>Úsegments)Úfeaturesr   )r'   r   ÚImportErrorr9   rn   r>   r?   r<   re   r$   rm   Ú	from_dict)
r   rn   r>   r?   r:   r;   r_   rc   Údataset_dictÚdataset_infor   r   r   Úexport_cuts_to_hf¹   sN   Qÿþÿ
ÿ
üÿþÿþÿþÿþrv   c                	   @   s>   e Zd ZdZdddddœdeded	ed
efdd„Zdd„ ZdS )ÚLazyHFDatasetIteratoraX  
    Thin wrapper on top of HF datasets objects that allows to interact with them through a Lhotse CutSet.
    It can be initialized with an existing HF dataset, or args/kwargs passed on to ``datasets.load_dataset()``.

    Use ``audio_key``, ``text_key``, ``lang_key`` and ``gender_key`` options to indicate which keys in dict examples
    returned from HF Dataset should be looked up for audio, transcript, language, and gender respectively.
    The remaining keys in HF dataset examples will be stored inside ``cut.custom`` dictionary.

    Example with existing HF dataset::

        >>> import datasets
        ... dataset = datasets.load_dataset("mozilla-foundation/common_voice_11_0", "hi", split="test")
        ... dataset = dataset.map(some_transform)
        ... cuts_it = LazyHFDatasetIterator(dataset)
        ... for cut in cuts_it:
        ...     pass

    Example providing HF dataset init args/kwargs::

        >>> import datasets
        ... cuts_it = LazyHFDatasetIterator("mozilla-foundation/common_voice_11_0", "hi", split="test")
        ... for cut in cuts_it:
        ...     pass

    r3   ÚsentencerD   Úgender)Ú	audio_keyÚtext_keyÚlang_keyÚ
gender_keyrz   r{   r|   r}   c                O   s4   t dƒsJ ‚|| _|| _|| _|| _|| _|| _d S )Nr9   )r   rz   r{   r|   r}   Údataset_argsÚdataset_kwargs)Úselfrz   r{   r|   r}   r~   r   r   r   r   Ú__init__O  s   	
zLazyHFDatasetIterator.__init__c                 c   s   ddl m}m}m}m}m}m} t| jƒdkr*t	| jd ||||fƒr*| jd }n	|| ji | j
¤Ž}| | j|dd¡}|D ]B}| | j¡}	tj|	d t|	d ƒ ¡ d}
t|
j|
jd|
j| | jd ¡| | jd ¡| | jd ¡d	}|
 ¡ }|g|_||_|V  q?d S )
Nr   )r(   rn   ÚDatasetDictÚIterableDatasetÚIterableDatasetDictÚload_datasetr!   F)ÚdecodeÚbytes)Úrecording_idg        )r+   rˆ   rL   r1   rK   rD   ry   )r9   r(   rn   r‚   rƒ   r„   r…   r"   r~   r   r   Úcast_columnrz   Úpopr
   Ú
from_bytesr   Ú	hexdigestr   r+   r1   r{   r|   r}   Úto_cutr#   Úcustom)r€   r(   rn   r‚   rƒ   r„   r…   ÚdatasetrJ   Ú
audio_datar%   Úsupervisionr   r   r   r   Ú__iter__`  s8   € 	
þÿù	ïzLazyHFDatasetIterator.__iter__N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__rZ   r   r’   r   r   r   r   rw   4  s     úýüû
úrw   r   )r–   Úhashlibr   Útypingr   r   r   r   r   r   r	   Úlhotser
   r   Ú
lhotse.cutr   r   Úlhotse.utilsr   r   Úboolr   r$   r'   rZ   r<   re   rm   rv   rw   r   r   r   r   Ú<module>   s*    $*þÿþ"
ý&~{