o
    Si.                     @   s`   d dl Z d dlmZmZmZmZ d dlmZmZ d dl	m
Z
mZ d dlmZ G dd de
ZdS )    N)AnyDictOptionalTuple)CutSetSeconds)
CutSamplerTimeConstraint)
DataSourcec                       s  e Zd ZdZ								d!dedededed	ee d
ededee dee def fddZ	e
dee fddZe
dee fddZe
dee fddZdeeef f fddZdeeef ddf fddZd"ddZdeeef fdd Z  ZS )#CutPairsSamplera  
    Samples pairs of cuts from a "source" and "target" CutSet.
    It expects that both CutSet's strictly consist of Cuts with corresponding IDs.
    It behaves like an iterable that yields lists of strings (cut IDs).

    When one of :attr:`max_source_duration`, :attr:`max_target_duration`, or :attr:`max_cuts` is specified,
    the batch size is dynamic.
    Exactly zero or one of those constraints can be specified.
    Padding required to collate the batch does not contribute to max source_duration/target_duration.
    NFr   source_cutstarget_cutsmax_source_durationmax_target_durationmax_cutsshuffle	drop_last
world_sizerankseedc                    sJ   t  j||||	|
d t|| _t|| _t||d| _t||d| _dS )a  
        CutPairsSampler's constructor.

        :param source_cuts: the first ``CutSet`` to sample data from.
        :param target_cuts: the second ``CutSet`` to sample data from.
        :param max_source_duration: The maximum total recording duration from ``source_cuts``.
        :param max_target_duration: The maximum total recording duration from ``target_cuts``.
        :param max_cuts: The maximum number of cuts sampled to form a mini-batch.
            By default, this constraint is off.
        :param shuffle: When ``True``, the cuts will be shuffled at the start of iteration.
            Convenient when mini-batch loop is inside an outer epoch-level loop, e.g.:
            `for epoch in range(10): for batch in dataset: ...` as every epoch will see a
            different cuts order.
        :param drop_last: When ``True``, the last batch is dropped if it's incomplete.
        :param world_size: Total number of distributed nodes. We will try to infer it by default.
        :param rank: Index of distributed node. We will try to infer it by default.
        :param seed: Random seed used to consistently shuffle the dataset across different processes.
        )r   r   r   r   r   )max_durationr   N)super__init__r
   r   r   r	   source_constraintstarget_constraints)selfr   r   r   r   r   r   r   r   r   r   	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/dataset/sampling/cut_pairs.pyr      s"   

zCutPairsSampler.__init__returnc                 C      | j jS )a	  
        Remaining duration of data left in the sampler (may be inexact due to float arithmetic).
        Not available when the CutSet is read in lazy mode (returns None).

        .. note: For :class:`.CutPairsSampler` we return the source cuts duration.
        )r   remaining_durationr   r   r   r   r"   G   s   z"CutPairsSampler.remaining_durationc                 C   r!   )z
        Remaining number of cuts in the sampler.
        Not available when the CutSet is read in lazy mode (returns None).
        )r   remaining_cutsr#   r   r   r   r$   Q   s   zCutPairsSampler.remaining_cutsc                 C   s   | j jrdS t| j S )z
        Total number of cuts in the sampler.
        Not available when the CutSet is read in lazy mode (returns None).
        N)r   is_lazylenr#   r   r   r   num_cutsY   s   
zCutPairsSampler.num_cutsc                    s*   t   }|| j | j d |S )z
        Return the current state of the sampler in a state_dict.
        Together with ``load_state_dict()``, this can be used to restore the
        training loop's state to the one stored in the state_dict.
        )r   r   )r   
state_dictupdater   r   )r   r(   r   r   r   r(   c   s   
zCutPairsSampler.state_dictr(   c                    s   t di |d}| j|krtd| j d| d || _t di |d}| j|kr;td| j d| d || _t | | jr[| j	| j
| j  | j| j
| j  | j	| jjj | j| jjj dS )	aX  
        Restore the state of the sampler that is described in a state_dict.
        This will result in the sampler yielding batches from where the previous training left it off.

        .. caution::
            The samplers are expected to be initialized with the same CutSets,
            but this is not explicitly checked anywhere.

        .. caution::
            The input ``state_dict`` is being mutated: we remove each consumed key, and expect
            it to be empty at the end of loading. If you don't want this behavior, pass a copy
            inside of this function (e.g., using ``import deepcopy``).

        .. note::
            For implementers of sub-classes of CutSampler: the flag ``self._just_restored_state`` has to be
            handled in ``__iter__`` to make it avoid resetting the just-restored state (only once).
        r   zLCutPairsSampler.load_state_dict(): Inconsistent source_constraint:
expected z

received z=
We will overwrite the settings with the received state_dict.r   zLCutPairsSampler.load_state_dict(): Inconsistent target_constraint:
expected Nr   )r	   popr   warningswarnr   r   load_state_dictr   r   r   epochr   fast_forwarddiagnosticscurrent_epoch_stats
total_cuts)r   r(   r   r   r   r   r   r-   r   s4   

zCutPairsSampler.load_state_dictc                 C   sZ   | j r| S | j  | jr!| j| j| j  | j| j| j  t| j t| j | S )zi
        Prepare the dataset for iterating over a new epoch. Will shuffle the data if requested.
        )	_just_restored_stater0   reset_current_epochr   r   r   r.   r   iterr#   r   r   r   __iter__   s   


zCutPairsSampler.__iter__c                 C   s  | j   | j  g }g }	 zt| j}t| j}|j|jks$J dW n7 ty\   |rS| jr;| j 	 s;| j	 rSt
|t
|ksGJ dt|t|f Y S | j| t w | |rg| |sn| j| q| j | | j| | j  s| j s|| || n|r| j| | j| ntd || || qt
|t
|ksJ dt|t|fS )NTzSampled source and target cuts with differing IDs. Ensure that your source and target cuts have the same length, the same IDs, and the same order.zPUnexpected state: some cuts in source / target are missing their counterparts...zThe first cut drawn in batch collection violates one of the max_... constraintswe'll return it anyway. Consider increasing max_source_duration/max_cuts/etc.)r   resetr   nextr   r   idStopIterationr   close_to_exceedingr&   r   	from_cutsr0   discard
_filter_fndiscard_singleaddexceededappend	take_backr+   r,   )r   r   r   next_source_cutnext_target_cutr   r   r   _next_batch   sr   






CzCutPairsSampler._next_batch)NNNFFNNr   )r    r   )__name__
__module____qualname____doc__r   r   r   intboolr   propertyfloatr"   r$   r'   r   strr   r(   r-   r6   r   rF   __classcell__r   r   r   r   r   	   sR    	
2		
/r   )r+   typingr   r   r   r   lhotser   r   lhotse.dataset.sampling.baser   r	   #lhotse.dataset.sampling.data_sourcer
   r   r   r   r   r   <module>   s    