o
    %ݫih"                  	   @   s   d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	 ddl
mZ eddG d	d
 d
Zdd Zdd ZeddG dd dZdddejfddZdedddeei df	ddZdS )z<Webdataset compatible iterators

Authors:
 * Aku Rouhe 2021
    N)	dataclassfield)partial)Any)PaddedBatchT)orderc                   @   s,   e Zd ZU dZeed< eddZeed< dS )
LengthItemzData class for lengthslengthF)comparedataN)	__name__
__module____qualname____doc__int__annotations__r   r   r    r   r   P/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/dataio/iterators.pyr      s   
 r   c                 C   s   t | t|  S )z1Determines how long would batch be (with padding))lenmaxlengthsr   r   r   total_length_with_padding   s   r   c                 C   s   dt | t|   S )z(Determines how much of batch is padding.g      ?)sumr   r   r   r   r   padding_ratio   s   r   c                   @   s"   e Zd ZU dZeed< eed< dS )
RatioIndexzData class for Ratio.ratioindexN)r   r   r   r   floatr   r   r   r   r   r   r   "   s   
 r   g?c                    s   t  }|du r
|}|d|d  }} | jg fdd}	|d | |k rjt|k rjg }
|	|d |
 |	|d |
 |
sCn't|
}t||j}t||j} |j j |d | |k rjt|k s0tt||d S )aa  Random pivot sampler_fn for dynamic_bucketed_batch

    Create a batch around a random pivot index in the sorted buffer

    This works on the databuffer which is assumed to be in sorted order. An
    index is chosen at random. This starts the window of indices: at first,
    only the randomly chosen pivot index is included. The window of indices is
    grown one-index-at-a-time, picking either the index to the right of the
    window, or the index to the left, picking the index that would increase the
    padding ratio the least, and making sure the batch wouldn't exceed the
    maximum batch length nor the maximum padding ratio.

    Arguments
    ---------
    databuffer : list
        Sorted list of LengthItems
    target_batch_numel : int
        Target of total batch length including padding, which is simply computed
        as batch size * length of longest example. This function aims to return
        the batch as soon as the gathered length exceeds this. If some limits
        are encountered first, this may not be satisfied.
    max_batch_size : None, int
        Maximum number of examples to include in the batch, or None to not limit
        by number of examples.
    max_batch_numel : None, int
        Maximum of total batch length including padding, which is simply computed
        as batch size * length of longest example.
    max_padding_ratio : float
        Each batch can have at most this much devoted to padding.
    randint_generator : generator
        Provide a generator to get reproducible results.

    Returns
    -------
    indices : list
        A list of consecutive indices.
    Nr      c                    sx   | dk s
| t  krdS  |  }|jg }dur$t|}|kr$dS t|}dur2|kr2dS |t||  dS )zTAdds an index to the to_consider list, if the index passes all
        requirements.r   N)r   r	   r   r   appendr   )r   to_consider
considereeupdated_lengthsupdated_totalupdated_ratio
databufferr   max_batch_numelmax_padding_ratior   r   possibly_consider^   s   z6indices_around_random_pivot.<locals>.possibly_consider)	r   r	   r   minr   r   r    listrange)r'   target_batch_numelmax_batch_sizer(   r)   randint_generator	bufferlen	min_index	max_indexr*   r!   to_addr   r&   r   indices_around_random_pivot)   s(   -r5   i   Fc
                 c   sF   g }
|rt |fi |}| D ]h}|dur |dur ||| }n|dur)|| }n|dur2||}ntd|dur>||k sF|durG||krGqt||}t|
| t|
|krw||
}g }t|ddD ]}|
|}||j	 qd||V  q|	s|
r||
}g }t|ddD ]}|
|}||j	 q||V  |
s|dS dS dS )a%  Produce batches from a sorted buffer

    This function keeps a sorted buffer of the incoming samples.
    The samples can be filtered for min/max length.
    An external sampler is used to choose samples for each batch,
    which allows different dynamic batching algorithms to be used.

    Arguments
    ---------
    data : iterable
        An iterable source of samples, such as an IterableDataset.
    len_key : str, None
        The key in the sample dict to use to fetch the length of the sample, or
        None if no key should be used.
    len_fn : callable
        Called with sample[len_key] if len_key is not None, else sample. Needs
        to return the sample length as an integer.
    min_sample_len : int, None
        Discard samples with length lower than this. If None, no minimum is
        applied.
    max_sample_len : int, None
        Discard samples with length larger than this. If None, no maximum is
        applied.
    buffersize : int
        The size of the internal sorted buffer. The buffer is always filled up
        before yielding a batch of samples.
    collate_fn : callable
        Called with a list of samples. This should return a batch. By default, using
        the SpeechBrain PaddedBatch class, which works for dict-like samples, and
        pads any tensors.
    sampler_fn : callable
        Called with the sorted data buffer. Needs to return a list of indices, which
        make up the next batch. By default using ``indices_around_random_pivot``
    sampler_kwargs : dict
        Keyword arguments, passed to sampler_fn.
    drop_end : bool
        After the data stream is exhausted, should batches be made until the data
        buffer is exhausted, or should the rest of the buffer be discarded. Without
        new samples, the last batches might not be efficient to process.
        Note: you can use ``.repeat`` on `webdataset` IterableDatasets to never
        run out of new samples, and then use
        `speechbrain.dataio.dataloader.LoopedLoader` to set a nominal epoch length.

    Yields
    ------
    Batches
    Nz.Must specify at least one of len_key or len_fnT)reverse)
r   
ValueErrorr   bisectinsortr   sortedpopr    r   )r   len_keylen_fnmin_sample_lenmax_sample_len
buffersize
collate_fn
sampler_fnsampler_kwargsdrop_endr'   sampler	   itemindices
batch_listir   r   r   dynamic_bucketed_batch   sH   ;






rJ   )r   r8   randomdataclassesr   r   	functoolsr   typingr   speechbrain.dataio.batchr   r   r   r   r   randintr5   r   rJ   r   r   r   r   <module>   s8    	
]