o
    Si                     @   s   d dl Z d dlmZ d dlmZmZ d dlZd dlm	Z	 d dl
mZ 	 ddededeeee	f eeef f fd	d
ZddededefddZdS )    N)mean)DictTuple)CutSet)
CutSamplersamplerbatch_tuple_indexreturnc           	         s   dd dd dd dd t dd d}zt| } t|  t tr% |  W n ty8   td i i f Y S w  fd	d
|D } fdd
| D }| D ]&}t|trZ|| }| D ]\}}||}||| krt|||< |||< q^qO||fS )a  
    Function for finding 'pessimistic' batches, i.e. batches that have the highest potential
    to blow up the GPU memory during training. We will fully iterate the sampler and record
    the most risky batches under several criteria:
    - single longest cut
    - single longest supervision
    - largest batch cuts duration
    - largest batch supervisions duration
    - max num cuts
    - max num supervisions

    .. note: It is up to the users to convert the sampled CutSets into actual batches and test them
        by running forward and backward passes with their model.

    Example of how this function can be used with a PyTorch model
    and a :class:`~lhotse.dataset.K2SpeechRecognitionDataset`::

        sampler = SimpleCutSampler(cuts, max_duration=300)
        dataset = K2SpeechRecognitionDataset()
        batches, scores = find_pessimistic_batches(sampler)
        for reason, cuts in batches.items():
            try:
                batch = dset[cuts]
                outputs = model(batch)
                loss = loss_fn(outputs)
                loss.backward()
            except:
                print(f"Exception caught when evaluating pessimistic batch for: {reason}={scores[reason]}")
                raise


    :param sampler: An instance of a Lhotse :class:`.CutSampler`.
    :param batch_tuple_index: Applicable to samplers that return tuples of :class:`~lhotse.cut.CutSet`.
        Indicates which position in the tuple we should look up for the CutSet.
    :return: A tuple of dicts: the first with batches (as CutSets) and the other with criteria values, i.e.:
        ``({"<criterion>": <CutSet>, ...}, {"<criterion>": <value>, ...})``
    c                 S      t dd | D S )Nc                 s       | ]}|j V  qd S Nduration.0c r   Q/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/dataset/sampling/utils.py	<genexpr>4       =find_pessimistic_batches.<locals>.<lambda>.<locals>.<genexpr>maxcutsr   r   r   <lambda>4       z*find_pessimistic_batches.<locals>.<lambda>c                 S   r
   )Nc                 s   s$    | ]}t d d |jD V  qdS )c                 s   r   r   r   r   sr   r   r   r   6   r   zGfind_pessimistic_batches.<locals>.<lambda>.<locals>.<genexpr>.<genexpr>N)sumsupervisionsr   r   r   r   r   5   s    
r   r   r   r   r   r   r   5       c                 S   r
   )Nc                 s   r   r   r   r   r   r   r   r   8   r   r   r   r   r   r   r   r   8   r   c                 S   r
   )Nc                 s   s"    | ]}|j D ]}|jV  qqd S r   )r    r   )r   r   r   r   r   r   r   9   s    r   r"   r   r   r   r   r   9   r!   c                 S   r
   )Nc                 s   s     | ]}|j D ]}d V  qqdS )   N)r    )r   r   _r   r   r   r   =   s    r   r"   r   r   r   r   r   =   r!   )single_longest_cutsingle_longest_supervisionlargest_batch_cuts_duration#largest_batch_supervisions_durationmax_num_cutsmax_num_supervisionsz7Empty sampler encountered in find_pessimistic_batches()c                    s   i | ]}| qS r   r   )r   kfirst_batchr   r   
<dictcomp>J   r   z,find_pessimistic_batches.<locals>.<dictcomp>c                    s   i | ]	\}}|| qS r   r   )r   r+   fnr,   r   r   r.   K   s    )	leniternext
isinstancetupleStopIterationwarningswarnitems)	r   r   criteriatop_batches
top_valuesbatchcritr/   valr   r,   r   find_pessimistic_batches   s<   )


r?     	n_samplesc              	      s   g }g }g }g }g }g }g }g }	g }
t | } t|D ]}zt|  W n
 ty,   Y  nw t ts;td  d   jdd t	 dkr|
 d j d j  d j  |

 d j t	 d  j  d j  |	
t fddtdt	 D  d j     d}d}d} D ]2}|
|j |
tdd	 |jD  |
|d
 |d
   ||d
 7 }||d
 7 }||d
 7 }q|
| |
| |
| qt|}t|}t|}t|}t|}t|}dg d|ddt|dd|ddt|dd|ddt|dd|| dd|ddt|dd|ddt|dd|ddt|dd|| ddt|	ddt|ddt|
ddS )z
    Returns a human-readable string message about amount of padding diagnostics.
    Assumes that padding corresponds to segments without any supervision within cuts.
    zThe sampler returned a mini-batch with multiple CutSets: we will only report the padding estimate for the first CutSet in each mini-batch.r   F)	ascendingr#   c                    s    g | ]} d  j  | j  qS r   r   )r   ir<   r   r   
<listcomp>   s    z1report_padding_ratio_estimate.<locals>.<listcomp>c                 s   r   r   r   r   r   r   r   r      r   z0report_padding_ratio_estimate.<locals>.<genexpr> zAn average CUT has z.1fzs (std=zs) of supervisions vs. z)s) of total duration. Average padding is z	s), i.e. z.1%z.
An average BATCH has z's) of combined supervised duration vs. z2s) of combined total duration. Average padding is zD.
Expected variability of cut durations within a single batch is +/-z (two closest cuts: z, two most distant cuts: z).
    )r1   ranger2   r5   r3   r   r6   r7   sort_by_durationr0   appendr   r   padr   r    npjoinstd)r   rA   
supervisedtotalgapsbatch_supervisedbatch_total
batch_gapsmin_dur_diffsmean_dur_diffsmax_dur_diffsrD   	batch_sup	batch_tot	batch_gapcutm_supervisedm_totalm_gapsm_batch_supervisedm_batch_totalm_batch_gapsr   rE   r   report_padding_ratio_estimateY   s   
$









t








rc   rC   )r@   )r6   
statisticsr   typingr   r   numpyrM   lhotser   lhotse.dataset.sampling.baser   intstrfloatr?   rc   r   r   r   r   <module>   s    
N