o
    Siۈ                     @   s  d dl Z d dlZd dlZd dlmZmZ d dlmZ d dl mZ d dl	m
Z
mZ d dlmZ d dlmZmZmZmZmZmZmZmZ d dlZd dlmZ d d	lmZ d d
lmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z) G dd dee"Z*de+dee,ge,f fddZ-dede+de+ddfddZ.G dd dedZ/eG dd de/Z0eG dd  d e/Z1eG d!d" d"Z2eG d#d$ d$Z3G d%d& d&Z4d'eege5f d(eege5f deege5f fd)d*Z6dS )+    N)ABCMetaabstractmethod)bisect_left)deepcopy)asdict	dataclass)isclose)AnyCallableDictIterableLiteralOptionalTupleUnion)distributed)Sampler)CutCutSet)TextExample)Dillable)combine)Secondsexactly_one_not_nullifnoneis_none_or_gtc                       s  e Zd ZdZ					d4dededee dee d	eeed
 f ddf fddZ	e
dd Zdee dee fddZdeddfddZdeegef dd fddZdeegef dd fddZdeeef fddZdeeef ddfddZd d! Zd"d# Ze
dee fd$d%Ze
dee fd&d'Ze
dee fd(d)Zd*d+ Zd,d- Zd.eee ed/f f ddfd0d1Z!defd2d3Z"  Z#S )5
CutSamplera  
    ``CutSampler`` is responsible for collecting batches of cuts, given specified criteria.
    It implements correct handling of distributed sampling in ``DataLoader``,
    so that the cuts are not duplicated across workers.

    Sampling in a ``CutSampler`` is intended to be very quick - it only uses the metadata in
    ``CutSet`` manifest to select the cuts, and is not intended to perform any I/O.

    CutSampler works similarly to PyTorch's DistributedSampler - when :attr:`shuffle=True`,
    you should call ``sampler.set_epoch(epoch)`` at each new epoch to have a different
    ordering of returned elements. However, its actual behaviour is different than that of
    DistributedSampler -- instead of partitioning the underlying cuts into equally sized chunks,
    it will return every N-th batch and skip the other batches (where ``N == world_size``).
    The formula used to determine which batches are returned is:
    ``(batch_idx + (world_size - rank)) % world_size == 0``.
    This ensures that we can return an equal number of batches in all distributed workers
    in spite of using a dynamic batch size, at the cost of skipping at most ``world_size - 1`` batches.

    Example usage::

        >>> dataset = K2SpeechRecognitionDataset(cuts)
        >>> sampler = SimpleCutSampler(cuts, max_duration=200, shuffle=True)
        >>> loader = DataLoader(dataset, sampler=sampler, batch_size=None)
        >>> for epoch in range(start_epoch, n_epochs):
        ...     sampler.set_epoch(epoch)
        ...     train(loader)

    .. note::

        For implementers of new samplers:
        Subclasses of CutSampler are expected to implement ``self._next_batch()`` to introduce specific
        sampling logic (e.g. based on filters such as max number of frames/tokens/etc.).
        CutSampler defines ``__iter__()``, which optionally shuffles the cut IDs, and resets
        ``self.cut_idx`` to zero (to be used and incremented inside of ``_next_batch()``.
    FNr   shuffle	drop_last
world_sizerankseed)
randomizedtrngreturnc                    sP   t    || _|| _|| _d| _t | _d| _| j	||d t
 | _g | _dS )a  
        :param shuffle: When ``True``, the cuts will be shuffled at the start of iteration.
            Convenient when mini-batch loop is inside an outer epoch-level loop, e.g.:
            `for epoch in range(10): for batch in dataset: ...` as every epoch will see a
            different cuts order.
        :param drop_last: When ``True``, the last batch is dropped if it's incomplete.
        :param world_size: Total number of distributed nodes. We will try to infer it by default.
        :param rank: Index of distributed node. We will try to infer it by default.
        :param seed: Random seed used to consistently shuffle the dataset across different processes.
        r   F)r   r    N)super__init__r   r   r!   epochSamplingDiagnostics_diagnostics_just_restored_state_maybe_init_distributed_filter_nothing
_filter_fn_transforms)selfr   r   r   r    r!   	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/dataset/sampling/base.pyr&   ;   s   

zCutSampler.__init__c                 C   s   | j S )z
        Info on how many cuts / batches were returned or rejected during iteration.

        This property can be overriden by child classes e.g. to merge diagnostics of composite samplers.
        )r)   r/   r2   r2   r3   diagnostics^   s   zCutSampler.diagnosticsc                 C   s   |d ur
|dks
J |d ur|dksJ dt jv r5dt jv r5t|tt jd | _t|tt jd | _n%t rNt rNt|t	 | _t|t
 | _nt|d| _t|d| _| j| jk sbJ d S )N   r   
WORLD_SIZERANK)osenvironr   intr   r    distis_availableis_initializedget_world_sizeget_rank)r/   r   r    r2   r2   r3   r+   g   s   z"CutSampler._maybe_init_distributedr'   c                 C   s(   | j |kr	|   || _ | j| dS )a  
        Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas
        use a different random ordering for each epoch. Otherwise, the next iteration of this
        sampler will yield the same ordering.

        :param epoch: Epoch number.
        N)r'   allow_iter_to_reset_stater5   	set_epochr/   r'   r2   r2   r3   rB   ~   s   
zCutSampler.set_epoch	predicatec                 C   s(   t | jtr|| _| S t| j|| _| S )a
  
        Add a constraint on individual cuts that has to be satisfied to consider them.

        Can be useful when handling large, lazy manifests where it is not feasible to
        pre-filter them before instantiating the sampler.

        Example:
            >>> cuts = CutSet(...)
            ... sampler = SimpleCutSampler(cuts, max_duration=100.0)
            ... # Retain only the cuts that have at least 1s and at most 20s duration.
            ... sampler.filter(lambda cut: 1.0 <= cut.duration <= 20.0)
        )
isinstancer-   r,   _and)r/   rD   r2   r2   r3   filter   s
   zCutSampler.filterfnc                 C   s(   t |sJ d| d| j| | S )zAApply ``fn`` to each mini-batch of ``CutSet`` before yielding it.zAExpected a callable accepting and returning a CutSet, received: '')callabler.   append)r/   rH   r2   r2   r3   map   s   
zCutSampler.mapc                 C   s&   | j | j| j| j| j| j| j dS )z
        Return the current state of the sampler in a state_dict.
        Together with ``load_state_dict()``, this can be used to restore the
        training loop's state to the one stored in the state_dict.
        )r'   r   r   r    r!   r   r5   )r'   r   r   r    r!   r   r5   
state_dictr4   r2   r2   r3   rM      s   zCutSampler.state_dictrM   c                 C   s   | d| _| d}| j|ksJ d| j d| d|d= | j| dks)J | d}| j|kr@td	| j d
| d || _| d| _| j	| d t
|dkscJ dd|  d| _dS )aX  
        Restore the state of the sampler that is described in a state_dict.
        This will result in the sampler yielding batches from where the previous training left it off.

        .. caution::
            The samplers are expected to be initialized with the same CutSets,
            but this is not explicitly checked anywhere.

        .. caution::
            The input ``state_dict`` is being mutated: we remove each consumed key, and expect
            it to be empty at the end of loading. If you don't want this behavior, pass a copy
            inside of this function (e.g., using ``import deepcopy``).

        .. note::
            For implementers of sub-classes of CutSampler: the flag ``self._just_restored_state`` has to be
            handled in ``__iter__`` to make it avoid resetting the just-restored state (only once).
        r   r   zNCannot restore sampler with a different world_size (before load_state_dict(): z,attempted restoring to z]). Changing the world_size would result in different batches being returned from the sampler.r    r!   r   zNOverriding the shuffle value in CutSampler based on state_dict(initialized to z; restored to ).r'   r5   r   z:Error in CutSampler.load_state_dict(): Unexpected keys:
- 
- TN)popr   r   r!   r   warningswarnr'   r5   load_state_dictlenjoinkeysr*   )r/   rM   r   r   r2   r2   r3   rS      s6   




zCutSampler.load_state_dictc                 C      t d)Nz6Sub-classes of CutSampler have to implement __iter__()NotImplementedErrorr4   r2   r2   r3   __iter__      zCutSampler.__iter__c                 C   rW   )Nz>Sub-classes of CutSampler have to implement self._next_batch()rX   r4   r2   r2   r3   _next_batch   r[   zCutSampler._next_batchc                 C   rW   )z
        Remaining duration of data left in the sampler (may be inexact due to float arithmetic).
        Not available when the CutSet is read in lazy mode (returns None).
        zCSub-classes of CutSampler have to implement self.remaining_durationrX   r4   r2   r2   r3   remaining_duration      zCutSampler.remaining_durationc                 C   rW   )z
        Remaining number of cuts in the sampler.
        Not available when the CutSet is read in lazy mode (returns None).
        z?Sub-classes of CutSampler have to implement self.remaining_cutsrX   r4   r2   r2   r3   remaining_cuts   r^   zCutSampler.remaining_cutsc                 C   rW   )z
        Total number of cuts in the sampler.
        Not available when the CutSet is read in lazy mode (returns None).
        z9Sub-classes of CutSampler have to implement self.num_cutsrX   r4   r2   r2   r3   num_cuts  r^   zCutSampler.num_cutsc                 C   s
   d| _ dS )a8  
        Enables re-setting to the start of an epoch when iter() is called.
        This is only needed in one specific scenario: when we restored previous
        sampler state via ``sampler.load_state_dict()`` but want to discard
        the progress in the current epoch and start from the beginning.
        FN)r*   r4   r2   r2   r3   rA     s   
z$CutSampler.allow_iter_to_reset_statec           	   	   C   s$  |    g }t| jD ]}z|  }|| W q ty*   | jdks'| jr( Y qw t|dkr4t t|| jkrstdd |D }d}| jt|  }dkrm||j	|d
t| }|d7 }| jt|  }dksQ|| j}|| j }| | | jD ]}||}qt|| j| jd |S )Nr6   r   c                 S   s   g | ]}|d ur|qS Nr2   ).0br2   r2   r3   
<listcomp>>  s    z'CutSampler.__next__.<locals>.<listcomp>)first)r    r   )rA   ranger   r\   rK   StopIterationr   rT   r   subset
modify_idsmark_as_duplicatesplitr    _log_diagnosticsr.   attach_dataloading_info)	r/   batches_batchcombinedchunkdiffselectedtfnr2   r2   r3   __next__  s:   	



zCutSampler.__next__rp   .c                 C   sT   t |tr| j| d S t |tr#t |d tr#| j|d  d S td| )Nr   zObject with unexpected type: )rE   r   r5   keeptuple
ValueError)r/   rp   r2   r2   r3   rl   N  s
   
zCutSampler._log_diagnosticsc                 C   s
   | j  S )JReturns a string describing the statistics of the sampling process so far.)r5   
get_reportr4   r2   r2   r3   r{   V     
zCutSampler.get_report)FFNNr   )$__name__
__module____qualname____doc__boolr   r;   r   r   r&   propertyr5   r+   rB   r
   r   rG   r   rL   r   strr	   rM   rS   rZ   r\   floatr]   r_   r`   rA   rv   r   rl   r{   __classcell__r2   r2   r0   r3   r      sP    &#
.				"7r   	iterationr$   c                    s   dt dt f fdd}|S )Ncut_idr$   c                    s   |  d  S )N_dupr2   )r   r   r2   r3   inner\  s   z mark_as_duplicate.<locals>.inner)r   )r   r   r2   r   r3   rj   [  s   rj   cutsr    r   c                 C   s@   t jj }|du rd}n|j}|||d}| D ]}||_qdS )a=  
    Attaches diagnostic info about dataloading to each cut under ``dataloading_info`` custom field.
    This information contains the rank, world_size, and worker_id.
    If the training is not distributed, rank and world_size are 0 and 1.
    If the num_workers argument in DataLoader was 0, worker_id is None.
    N)r    r   	worker_id)torchutilsdataget_worker_infoiddataloading_info)r   r    r   worker_infor   infocutr2   r2   r3   rm   b  s   rm   c                	   @   s   e Zd ZdZededdfddZedefddZedefd	d
Z	edddZ
ededefddZ	ddedededefddZdddZdS )SamplingConstraintz
    Defines the interface for sampling constraints. A sampling constraint
    keeps track of the sampled examples and lets the sampler know when it
    should yield a mini-batch.
    exampler$   Nc                 C      dS )z
        Update the sampling constraint with the information about the sampled example
        (e.g. current batch size, total duration).
        Nr2   r/   r   r2   r2   r3   addz     zSamplingConstraint.addc                 C   r   )z4Inform if the sampling constraint has been exceeded.Nr2   r4   r2   r2   r3   exceeded     zSamplingConstraint.exceededc                 C   r   )zVInform if we're going to exceed the sampling constraint after adding one more example.Nr2   r4   r2   r2   r3   close_to_exceeding  r   z%SamplingConstraint.close_to_exceedingc                 C   r   )z?Resets the internal state (called after yielding a mini-batch).Nr2   r4   r2   r2   r3   reset  r   zSamplingConstraint.resetc                 C   r   )z
        Returns the "size" of an example, used to create bucket distribution for bucketing samplers
        (e.g., for audio it may be duration; for text it may be number of tokens; etc.).
        Nr2   r   r2   r2   r3   measure_length  r   z!SamplingConstraint.measure_lengthbucketsexample_lenc                 C   s<   t ||sJ d|d|d|du r| |}t||S )a  
        Given a list of buckets and an example, assign the example to the correct bucket.
        This is leveraged by bucketing samplers.

        Default implementation assumes that buckets are expressed in the same units as
        the output of :meth:`SamplingConstraint.measure_length` and returns the index
        of the first bucket that has a larger length than the example.
        zYselect_bucket requires either example= or example_len= as the input (we received example=z and example_len=rN   N)r   r   r   )r/   r   r   r   r2   r2   r3   select_bucket  s   

z SamplingConstraint.select_bucketc                 C   s
   t  | S )z)Return a shallow copy of this constraint.)copyr4   r2   r2   r3   r     r|   zSamplingConstraint.copyr$   N)NN)r$   r   )r}   r~   r   r   r   r	   r   r   r   r   r   r   r   r;   r   r   r2   r2   r2   r3   r   s  s.    
r   )	metaclassc                   @   s6  e Zd ZU dZdZee ed< dZee	 ed< dZ
ee	ef ed< dZe	ed< dZee	ef ed< dZee ed	< d'ddZd
efddZded
dfddZded
efddZd
efddZd
efddZd'ddZded
efddZd
eeef fddZdeeef d
dfd d!Zd(d#d$Zd"d d
efd%d&Z dS ))TimeConstraintal  
    Represents a time-based constraint for sampler classes.
    It is defined as maximum total batch duration (in seconds) and/or the total number of cuts.

    :class:`TimeConstraint` can be used for tracking whether the criterion has been exceeded
    via the `add(cut)`, `exceeded()` and `reset()` methods.
    It will automatically track the right criterion (i.e. select duration from the cut).
    It can also be a null constraint (never exceeded).

    When ``quadratic_duration`` is set, we will try to compensate for models that have a
    quadratic complexity w.r.t. the input sequence length. We use the following formula
    to determine the effective duration for each cut::

        effective_duration = duration + (duration ** 2) / quadratic_duration

    We recomend setting quadratic_duration to something between 15 and 40 for transformer architectures.
    Nmax_durationmax_cutsr   currentr`   longest_seenquadratic_durationr$   c                 C   4   t | jdsJ t | jdsJ t | jdsJ d S Nr   )r   r   r   r   r4   r2   r2   r3   __post_init__     zTimeConstraint.__post_init__c                 C   s   | j dup	| jduS )zAIs it an actual constraint, or a dummy one (i.e. never exceeded).N)r   r   r4   r2   r2   r3   	is_active  s   zTimeConstraint.is_activer   c                 C   sD   | j dur| |j}|  j|7  _t| j|| _|  jd7  _dS )z
        Increment the internal counter for the time constraint,
        selecting the right property from the input ``cut`` object.
        Nr6   )r   !_maybe_apply_quadratic_correctiondurationr   maxr   r`   )r/   r   r   r2   r2   r3   r     s
   
zTimeConstraint.addr   c                 C       | j d u r|S ||d | j   S N   )r   )r/   r   r2   r2   r3   r        
z0TimeConstraint._maybe_apply_quadratic_correctionc                 C   >   | j dur| j| j krdS | jdu rdS | j| j }|| jkS z"Is the constraint exceeded or not.NTFr   r`   r   r   r/   effective_durationr2   r2   r3   r        

zTimeConstraint.exceededc                 C   B   | j dur| j| j krdS | jdur| jd | j }|| jkS dS a(  
        Check if the batch is close to satisfying the constraints.
        We define "closeness" as: if we added one more cut that has
        duration/num_frames/num_samples equal to the longest seen cut
        in the current batch, then the batch would have exceeded the constraints.
        NTr6   Fr   r   r2   r2   r3   r        

z!TimeConstraint.close_to_exceedingc                 C      d| _ d| _d| _dS z{
        Reset the internal counter (to be used after a batch was created,
        to start collecting a new one).
        r   N)r   r`   r   r4   r2   r2   r3   r        
zTimeConstraint.resetc                 C      |j S ra   )r   r   r2   r2   r3   r        zTimeConstraint.measure_lengthc                 C      t | S ra   r   r4   r2   r2   r3   rM        zTimeConstraint.state_dictrM   c                 C   s   | d| _| d| _| d| _| d| _| dd| _| dd | _| dd  | d	d  | d
d  t|dksIJ dd|	  d S )Nr   r   r   r`   r   r   r   strictmax_samples
max_framesz>Error in TimeConstraint.load_state_dict(): Unexpected keys:
- rO   )
rP   r   r   r   r`   r   r   rT   rU   rV   r/   rM   r2   r2   r3   rS     s   zTimeConstraint.load_state_dictotherc                 C   s   dD ],}t | |}t ||}|d u o|d u }|s.t||s.J d| d| d| d| d	qt| j| j| j|j | j|j t| j|j| j	dS )Nr   r   r   zXTo add two TimeConstraint objects, they need to represent the same constraint (got self.=z
 != other.rN   )r   r   r   r`   r   r   )
getattrr   r   r   r   r   r`   r   r   r   )r/   r   key	self_attr
other_attris_noner2   r2   r3   __add__  s0   



zTimeConstraint.__add__c                 C   s$   | j |j ko| j|jko| j|jkS ra   r   r/   r   r2   r2   r3   __eq__0  s
   

zTimeConstraint.__eq__r   )r   r   r$   r   )!r}   r~   r   r   r   r   r   __annotations__r   r;   r   r   r`   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	   rM   rS   r   r   r2   r2   r2   r3   r     s(   
 
	
	
r   c                   @   s   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZee ed	< dddZded
dfddZded
efddZd
efddZd
efddZdddZded
efddZdS )TokenConstrainta:  
    Represents a token-based constraint for sampler classes that sample text data.
    It is defined as maximum total number of tokens in a mini-batch and/or max batch size.

    Similarly to :class:`TimeConstraint`, we support ``quadratic_length`` for quadratic
    token penalty when sampling longer texts.
    N
max_tokensmax_examplesr   r   num_examplesr   quadratic_lengthr$   c                 C   r   r   )r   r   r   r   r4   r2   r2   r3   r   I  r   zTokenConstraint.__post_init__r   c                 C   sH   | j dur| | |}|  j|7  _t| j|| _|  jd7  _dS )z
        Increment the internal token counter for the constraint,
        selecting the right property from the input object.
        Nr6   )r   r   r   r   r   r   r   )r/   r   sizer2   r2   r3   r   N  s
   
zTokenConstraint.addr   c                 C   r   r   )r   )r/   r   r2   r2   r3   r   Y  r   z1TokenConstraint._maybe_apply_quadratic_correctionc                 C   r   r   r   r   r   r   r   r2   r2   r3   r   a  r   zTokenConstraint.exceededc                 C   r   r   r   )r/   effective_sizer2   r2   r3   r   j  r   z"TokenConstraint.close_to_exceedingc                 C   r   r   )r   r   r   r4   r2   r2   r3   r   y  r   zTokenConstraint.resetc                 C   r   ra   )
num_tokensr   r2   r2   r3   r     r   zTokenConstraint.measure_lengthr   )r}   r~   r   r   r   r;   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r2   r2   r2   r3   r   8  s   
 
	
	r   c                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	eed< e
defdd	Ze
defd
dZdefddZdeeef fddZdeeef dd fddZdddZdS )EpochDiagnosticsr   r'   	kept_cutsdiscarded_cutskept_batchesdiscarded_batchesr$   c                 C      | j | j S ra   )r   r   r4   r2   r2   r3   
total_cuts     zEpochDiagnostics.total_cutsc                 C   r   ra   )r   r   r4   r2   r2   r3   total_batches  r   zEpochDiagnostics.total_batchesc                 C   s   | j dks
| jdkr	 dS d| jdd| jdd| jdd| j| j d	d
| jdd| jdd| j dd| j| j  d	d| jddS )rz   r   zSampling statistics unavailable: EpochDiagnostics received no cuts or batches. If this is unexpected, and you're using a custom sampler, ensure that the sampler is registering the batches in SamplerDiagnostics/EpochDiagnostics.z| ep z>3dz | cuts kept d/ (.2%) | cuts discarded  | batches kept )| batches discarded  |)r   r   r'   r   r   r   r   r4   r2   r2   r3   r{     s"    zEpochDiagnostics.get_reportc                 C   r   ra   r   r4   r2   r2   r3   rM     r   zEpochDiagnostics.state_dictrM   c                 C   sb   | d| _| d| _| d| _| d| _| d| _t|dks/J dd|  | S )	Nr'   r   r   r   r   r   z@Error in EpochDiagnostics.load_state_dict(): Unexpected keys:
- rO   )	rP   r'   r   r   r   r   rT   rU   rV   r   r2   r2   r3   rS     s   z EpochDiagnostics.load_state_dictr   c                 C   sD   | j |j ksJ t| j | j|j | j|j | j|j | j|j dS )N)r'   r   r   r   r   )r'   r   r   r   r   r   r   r2   r2   r3   r     s   



zEpochDiagnostics.__add__N)r   r   r$   r   )r}   r~   r   r'   r;   r   r   r   r   r   r   r   r   r   r{   r   r	   rM   rS   r   r2   r2   r2   r3   r     s   
 r   c                   @   sj  e Zd ZU dZdZeed< dZeee	f ed< dd Z
d2d	d
ZdeddfddZd2ddZede	fddZdee ddfddZdee ddfddZdeddfddZedefddZedefddZedefddZedefd d!Zedefd"d#Zedefd$d%Zd3d'edefd(d)Zdeeef fd*d+Zd,eeef dd fd-d.Z d4d0d1Z!dS )5r(   zr
    Utility for collecting diagnostics about the sampling process:
    how many cuts/batches were discarded.
    r   current_epochNstats_per_epochc                 C   s$   | j d u ri | _ | | j d S d S ra   )r   rB   r   r4   r2   r2   r3   r     s   
z!SamplingDiagnostics.__post_init__r$   c                 C   s   t | j| j| j< d S ra   )r   r   r   r4   r2   r2   r3   reset_current_epoch  s   z'SamplingDiagnostics.reset_current_epochr'   c                 C   s(   || _ || jvrt|d| j|< d S d S )N)r'   )r   r   r   rC   r2   r2   r3   rB     s   
zSamplingDiagnostics.set_epochc                 C   s   |  | jd  d S Nr6   )rB   r   r4   r2   r2   r3   advance_epoch     z!SamplingDiagnostics.advance_epochc                 C   s   | j | j S ra   )r   r   r4   r2   r2   r3   current_epoch_stats  r   z'SamplingDiagnostics.current_epoch_statsr   c                 C   sH   d}|D ]}| j  jd7  _|d7 }q|std | j  jd7  _d S )Nr   r6   z@Found and accepted batch with zero cuts. This could be an error.)r   r   rQ   rR   r   r/   r   cntrr   r2   r2   r3   rw     s   
zSamplingDiagnostics.keepc                 C   sB   d}|D ]}| j  jd7  _|d7 }q|r| j  jd7  _d S d S )Nr   r6   )r   r   r   r   r2   r2   r3   discard  s   
zSamplingDiagnostics.discardr   c                 C   s   | j  jd7  _d S r   )r   r   r/   r   r2   r2   r3   discard_single  r   z"SamplingDiagnostics.discard_singlec                 C      t dd | j D S )Nc                 s       | ]}|j V  qd S ra   )r   rb   sr2   r2   r3   	<genexpr>      z0SamplingDiagnostics.kept_cuts.<locals>.<genexpr>sumr   valuesr4   r2   r2   r3   r        zSamplingDiagnostics.kept_cutsc                 C   r   )Nc                 s   r  ra   )r   r  r2   r2   r3   r    r  z5SamplingDiagnostics.discarded_cuts.<locals>.<genexpr>r  r4   r2   r2   r3   r     r	  z"SamplingDiagnostics.discarded_cutsc                 C   r   )Nc                 s   r  ra   )r   r  r2   r2   r3   r    r  z3SamplingDiagnostics.kept_batches.<locals>.<genexpr>r  r4   r2   r2   r3   r     r	  z SamplingDiagnostics.kept_batchesc                 C   r   )Nc                 s   r  ra   )r   r  r2   r2   r3   r    r  z8SamplingDiagnostics.discarded_batches.<locals>.<genexpr>r  r4   r2   r2   r3   r     r	  z%SamplingDiagnostics.discarded_batchesc                 C   r   )Nc                 s   r  ra   )r   r  r2   r2   r3   r    r  z1SamplingDiagnostics.total_cuts.<locals>.<genexpr>r  r4   r2   r2   r3   r   	  r	  zSamplingDiagnostics.total_cutsc                 C   r   )Nc                 s   r  ra   )r   r  r2   r2   r3   r    r  z4SamplingDiagnostics.total_batches.<locals>.<genexpr>r  r4   r2   r2   r3   r     r	  z!SamplingDiagnostics.total_batchesF	per_epochc                 C   s   | j dks
| jdkr	 dS g }|r#t| jD ]}|| j|   q|d| jdd| jdd| j| j dd| jdd	| jdd| j dd| j| j  dd
| j	dd d
|S )rz   r   zSampling statistics unavailable: the SamplerDiagnostics received no cuts or batches. If this is unexpected, and you're using a custom sampler, ensure that the sampler is registering the batches in SamplerDiagnostics.z|  total  | cuts kept r   r   r   r   r   r   r   r   
)r   r   sortedr   rK   r{   r   r   r   r   rU   )r/   r
  retr'   r2   r2   r3   r{     s.   
	zSamplingDiagnostics.get_reportc                 C   r   ra   r   r4   r2   r2   r3   rM   +  r   zSamplingDiagnostics.state_dictrM   c                 C   s*   | d| _dd | d D | _| S )Nr   c                 S   s   i | ]\}}|t  |qS r2   )r   rS   )rb   r'   sdr2   r2   r3   
<dictcomp>0  s    z7SamplingDiagnostics.load_state_dict.<locals>.<dictcomp>r   )rP   r   itemsr   r   r2   r2   r3   rS   .  s
   z#SamplingDiagnostics.load_state_dictr   c                 C   sN   t | j}|j D ]\}}||v r|| | ||< q
|||< q
t| j|dS )N)r   r   )r   r   r  r(   r   )r/   r   r   r'   statsr2   r2   r3   r   6  s   

zSamplingDiagnostics.__add__r   )F)r   r(   r$   r(   )"r}   r~   r   r   r   r;   r   r   r   r   r   r   rB   r   r   r   r   r   rw   r   r   r   r   r   r   r   r   r   r   r{   r	   rM   rS   r   r2   r2   r2   r3   r(     s:   
 

	r(   c                   @   s   e Zd ZdedefddZdS )r,   r   r$   c                 C   r   )NTr2   r   r2   r2   r3   __call__D  s   z_filter_nothing.__call__N)r}   r~   r   r   r   r  r2   r2   r2   r3   r,   C  s    r,   fn1fn2c                    s   dt dtf fdd}|S )Nr   r$   c                    s    | o| S ra   r2   )r   r  r  r2   r3   _and_wrapperK  s   z_and.<locals>._and_wrapper)r   r   )r  r  r  r2   r  r3   rF   H  s   rF   )7r   r9   rQ   abcr   r   bisectr   r   dataclassesr   r   mathr   typingr	   r
   r   r   r   r   r   r   r   r   r<   torch.utils.datar   
lhotse.cutr   r   lhotse.cut.textr   lhotse.lazyr   lhotse.manipulationr   lhotse.utilsr   r   r   r   r   r;   r   rj   rm   r   r   r   r   r(   r,   r   rF   r2   r2   r2   r3   <module>   sP    (  G= M=~