o
    ei)                     @   s   d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 e
eZg dZdZedg dZd	e_ G d
d deZi fddZdd Zdd Zi dfddZdS )z'SpeechBrain Extended CSV Compatibility.    N)DynamicItemDataset)
get_logger)wavflacaacoggr   mp3_dataCSVItem)dataformatoptsz)The Legacy Extended CSV Data item tripletc                       s.   e Zd ZdZi dddg g f fdd	Z  ZS )ExtendedCSVDataseta  Extended CSV compatibility for DynamicItemDataset.

    Uses the SpeechBrain Extended CSV data format, where the CSV must have an
    'ID' and 'duration' fields.

    The rest of the fields come in triplets:
    ``<name>, <name>_format, <name>_opts``

    These add a <name>_sb_data item in the dict. Additionally, a basic
    DynamicItem (see DynamicItemDataset) is created, which loads the _sb_data
    item.

    Bash-like string replacements with $to_replace are supported.

    NOTE
    ----
    Mapping from legacy interface:

    - csv_file -> csvpath
    - sentence_sorting -> sorting, and "random" is not supported, use e.g.
      ``make_dataloader(..., shuffle = (sorting=="random"))``
    - avoid_if_shorter_than -> min_duration
    - avoid_if_longer_than -> max_duration
    - csv_read -> output_keys, and if you want IDs add "id" as key

    Arguments
    ---------
    csvpath : str, path
        Path to extended CSV.
    replacements : dict
        Used for Bash-like $-prefixed substitution,
        e.g. ``{"data_folder": "/home/speechbrain/data"}``, which would
        transform `$data_folder/utt1.wav` into `/home/speechbrain/data/utt1.wav`
    sorting : {"original", "ascending", "descending"}
        Keep CSV order, or sort ascending or descending by duration.
    min_duration : float, int
        Minimum duration in seconds. Discards other entries.
    max_duration : float, int
        Maximum duration in seconds. Discards other entries.
    dynamic_items : list
        Configuration for extra dynamic items produced when fetching an
        example. List of DynamicItems or dicts with keys::
            func: <callable> # To be called
            takes: <list> # key or list of keys of args this takes
            provides: key # key or list of keys that this provides
        NOTE: A dynamic item is automatically added for each CSV data-triplet
    output_keys : list, None
        The list of output keys to produce. You can refer to the names of the
        CSV data-triplets. E.G. if the CSV has: wav,wav_format,wav_opts,
        then the Dataset has a dynamic item output available with key ``"wav"``
        NOTE: If None, read all existing.
    originalr   i  c                    s   |dvr| j j}t| d| dt||\}	}
}t |	|| | j|
 d}d }|dks0d}|dkr8d}| jd|id|i||d	}|| _	|sQ| 
| d S d S )
N)r   	ascending
descendingz doesn't support z sortingFr   r   durationT)key_min_valuekey_max_valuesort_keyreverse)	__class____name__
ValueErrorload_sb_extended_csvsuper__init__pipelineadd_dynamic_items_filtered_sorted_idsdata_idsset_output_keys)selfcsvpathreplacementssortingmin_durationmax_durationdynamic_itemsoutput_keysclsnamer   	di_to_add
data_namesr   r   filtered_sorted_idsr    W/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/dataio/legacy.pyr   N   s0   

zExtendedCSVDataset.__init__)r   
__module____qualname____doc__r   __classcell__r/   r/   r.   r0   r      s    8r   c                    s  t | ddd}i }tj|dd}td}|jd dks!td	|jd
 dks,tdt|jdd d dks=td|jddd }|D ]s}i }|d }	|d= t	|d |d< |d= |	|v rhtd|	 t
| ddd D ]!\}
}z| fdd|||
< W qs ty   td| dw t|D ]\}}tt
| |d |d d   }|||t < q|||	< qGg }|D ]}t|t |d}|| q|||fW  d   S 1 sw   Y  dS )a  Loads SB Extended CSV and formats string values.

    Uses the SpeechBrain Extended CSV data format, where the
    CSV must have an 'ID' and 'duration' fields.

    The rest of the fields come in triplets:
    ``<name>, <name>_format, <name>_opts``.

    These add a <name>_sb_data item in the dict. Additionally, a
    basic DynamicItem (see DynamicItemDataset) is created, which
    loads the _sb_data item.

    Bash-like string replacements with $to_replace are supported.

    This format has its restriction, but they allow some tasks to
    have loading specified by the CSV.

    Arguments
    ---------
    csv_path : str
        Path to the CSV file.
    replacements : dict
        Optional dict:
        e.g. ``{"data_folder": "/home/speechbrain/data"}``
        This is used to recursively format all string values in the data.

    Returns
    -------
    dict
        CSV data with replacements applied.
    list
        List of DynamicItems to add in DynamicItemDataset.

     utf-8)newlineencodingT)skipinitialspacez
\$([\w.]+)r   IDzBCSV has to have an 'ID' field, with unique ids for all data points   r   zRCSV has to have an 'duration' field, with the length of the data point in seconds.   N   zHAll named fields must have 3 entries: <name>, <name>_format, <name>_optszDuplicate id: c                    s    | d  S )Nr;   r/   )matchr$   r/   r0   <lambda>   s    z&load_sb_extended_csv.<locals>.<lambda>z	The item z/ requires replacements which were not supplied.)functakesprovides)opencsv
DictReaderrecompile
fieldnamesKeyErrorlenr   floatlistitemssub	enumerater
   valuesITEM_POSTFIX_read_csv_itemappend)csv_pathr$   csvfileresultreadervariable_findernamesrow
data_pointdata_idkeyvalueinametripletdynamic_items_to_adddir/   r?   r0   r   t   s`   #

$
$r   c                 C   s   t | j}| jtv rt| j\}}|dS | jdkr"t| j|S | jdkrB| j}z|	d}W n	 t
y:   Y nw |d}|S td| j )zhReads the different formats supported in SB Extended CSV.

    Delegates to the relevant functions.
    r   pklstringr6    zDon't know how to read )_parse_csv_item_optsr   r   TORCHAUDIO_FORMATS
torchaudioloadr   squeezeread_pkldecodeAttributeErrorsplit	TypeError)itemr   audio_rf   r/   r/   r0   rS      s    





rS   c                 C   sF   |   } t| dkri S i }| dD ]}|d\}}|||< q|S )z0Parse the _opts field in a SB Extended CSV item.r   rg   :)striprK   rp   )entryr   optopt_nameopt_valr/   r/   r0   rh      s   
rh   c                 C   s6  zt | d}t|}W d   n1 sw   Y  W n tjy,   d|  }t|w d}t|trt|d trBt	|}d}t|d t
rPt|}d}t|d trq|durjt|D ]
\}}	||	 ||< q_t|}d}|sd| t|d f }t|n|}|j}
|
dkr|d	}|
d
kr|d}|S )a  This function reads tensors store in pkl format.

    Arguments
    ---------
    file : str
        The path to file to read.
    data_options : dict, optional
        A dictionary containing options for the reader.
    lab2ind : dict, optional
        Mapping from label to integer indices.

    Returns
    -------
    numpy.array
        The array containing the read signal.
    rbNzcannot read the pkl file %sFr   TzMThe pkl file %s can only contain list of integers, floats, or strings. Got %sfloat64float32int64int32)rD   picklerk   UnpicklingErrorr   
isinstancerM   rL   torchFloatTensorint
LongTensorstrrP   typedtypeastype)filedata_optionslab2indfpkl_elementerr_msgtype_oktensorindexvaltensor_typer/   r/   r0   rm      sH   





rm   )r3   collectionsrE   r   rG   r   rj   speechbrain.dataio.datasetr   speechbrain.utils.loggerr   r   loggerri   rR   
namedtupler
   r   r   rS   rh   rm   r/   r/   r/   r0   <module>   s&    \`