o
    SiS%                     @   s\   d dl Z d dlmZmZmZ d dlmZmZ d dlm	Z	m
Z
 d dlmZ G dd de	ZdS )    N)AnyDictOptional)CutSetSeconds)
CutSamplerTimeConstraint)
DataSourcec                       s   e Zd ZdZ							ddededee ded	ed
ee dee def fddZ	e
dee fddZe
dee fddZe
dee fddZdeeef f fddZdeeef ddf fddZd ddZdefddZ  ZS )!SimpleCutSamplera  
    Samples cuts from a CutSet to satisfy the input constraints.
    It behaves like an iterable that yields lists of strings (cut IDs).

    When one of :attr:`max_duration`, or :attr:`max_cuts` is specified,
    the batch size is dynamic.
    Exactly zero or one of those constraints can be specified.
    Padding required to collate the batch does not contribute to max duration.

    Example usage::

        >>> dataset = K2SpeechRecognitionDataset(cuts)
        >>> sampler = SimpleCutSampler(cuts, shuffle=True)
        >>> loader = DataLoader(dataset, sampler=sampler, batch_size=None)
        >>> for epoch in range(start_epoch, n_epochs):
        ...     sampler.set_epoch(epoch)
        ...     train(loader)

    NFr   cutsmax_durationmax_cutsshuffle	drop_last
world_sizerankseedc	           	         sP   t  j|||||d tdd ||fD sJ dt|| _t||d| _dS )a  
        SimpleCutSampler's constructor.

        :param cuts: the ``CutSet`` to sample data from.
        :param max_duration: The maximum total recording duration from ``cuts``.
        :param max_cuts: The maximum number of cuts sampled to form a mini-batch.
            By default, this constraint is off.
        :param shuffle: When ``True``, the cuts will be shuffled at the start of iteration.
            Convenient when mini-batch loop is inside an outer epoch-level loop, e.g.:
            `for epoch in range(10): for batch in dataset: ...` as every epoch will see a
            different cuts order.
        :param drop_last: When ``True``, the last batch is dropped if it's incomplete.
        :param world_size: Total number of distributed nodes. We will try to infer it by default.
        :param rank: Index of distributed node. We will try to infer it by default.
        :param seed: Random seed used to consistently shuffle the dataset across different processes.
        )r   r   r   r   r   c                 s   s    | ]}|d uV  qd S )N ).0vr   r   R/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/dataset/sampling/simple.py	<genexpr>@   s    
z,SimpleCutSampler.__init__.<locals>.<genexpr>z7At least one of max_duration or max_cuts has to be set.)r   r   N)super__init__anyr	   data_sourcer   time_constraint)	selfr   r   r   r   r   r   r   r   	__class__r   r   r      s"   

zSimpleCutSampler.__init__returnc                 C      | j jS )z
        Remaining duration of data left in the sampler (may be inexact due to float arithmetic).
        Not available when the CutSet is read in lazy mode (returns None).
        )r   remaining_durationr   r   r   r   r"   I      z#SimpleCutSampler.remaining_durationc                 C   r!   )z
        Remaining number of cuts in the sampler.
        Not available when the CutSet is read in lazy mode (returns None).
        )r   remaining_cutsr#   r   r   r   r%   Q   r$   zSimpleCutSampler.remaining_cutsc                 C   s   | j jrdS t| j S )z
        Total number of cuts in the sampler.
        Not available when the CutSet is read in lazy mode (returns None).
        N)r   is_lazylenr#   r   r   r   num_cutsY   s   
zSimpleCutSampler.num_cutsc                    s"   t   }|d| j i |S )z
        Return the current state of the sampler in a state_dict.
        Together with ``load_state_dict()``, this can be used to restore the
        training loop's state to the one stored in the state_dict.
        r   )r   
state_dictupdater   )r   r)   r   r   r   r)   c   s   

zSimpleCutSampler.state_dictr)   c                    sz   t di |d}| j|krtd| j d| d || _t | | jr2| j| j	| j
  | j| jjj dS )aX  
        Restore the state of the sampler that is described in a state_dict.
        This will result in the sampler yielding batches from where the previous training left it off.

        .. caution::
            The samplers are expected to be initialized with the same CutSets,
            but this is not explicitly checked anywhere.

        .. caution::
            The input ``state_dict`` is being mutated: we remove each consumed key, and expect
            it to be empty at the end of loading. If you don't want this behavior, pass a copy
            inside of this function (e.g., using ``import deepcopy``).

        .. note::
            For implementers of sub-classes of CutSampler: the flag ``self._just_restored_state`` has to be
            handled in ``__iter__`` to make it avoid resetting the just-restored state (only once).
        r   zKSimpleCutSampler.load_state_dict(): Inconsistent time_constraint:
expected z

received z=
We will overwrite the settings with the received state_dict.Nr   )r   popr   warningswarnr   load_state_dictr   r   r   epochfast_forwarddiagnosticscurrent_epoch_stats
total_cuts)r   r)   r   r   r   r   r.   q   s   
z SimpleCutSampler.load_state_dictc                 C   s<   | j r| S | j  | jr| j| j| j  t| j | S )zi
        Prepare the dataset for iterating over a new epoch. Will shuffle the data if requested.
        )_just_restored_stater1   reset_current_epochr   r   r   r/   iterr#   r   r   r   __iter__   s   

zSimpleCutSampler.__iter__c                 C   s   | j   g }	 zt| j}W n! ty0   |r'| jr | j  r't| Y S | j	
| t w | |s=| j	| q| j | | j  sN|| n|rW| j| ntd || qt|S )NTzThe first cut drawn in batch collection violates the max_duration, or max_cuts constraints - we'll return it anyway. Consider increasing max_duration/max_cuts.)r   resetnextr   StopIterationr   close_to_exceedingr   	from_cutsr1   discard
_filter_fndiscard_singleaddexceededappend	take_backr,   r-   )r   r   next_cutr   r   r   _next_batch   s<   




3zSimpleCutSampler._next_batch)NNFFNNr   )r    r
   )__name__
__module____qualname____doc__r   r   r   intboolr   propertyfloatr"   r%   r(   r   strr   r)   r.   r7   rE   __classcell__r   r   r   r   r
   	   sH    	+	
#r
   )r,   typingr   r   r   lhotser   r   lhotse.dataset.sampling.baser   r   #lhotse.dataset.sampling.data_sourcer	   r
   r   r   r   r   <module>   s    