o
    wiE                     @   s`  d dl Z d dlmZ d dlmZ d dlmZmZ d dlZ	d dl
mZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZ eG d
d deZeG dd deZ		d'de	jdeee B dedee dB dedB f
ddZeG dd deZG dd dZG dd dZG dd dZ G dd dZ!G d d! d!Z"d"edefd#d$Z#d"edefd%d&Z$dS )(    N)bisect_left)	dataclass)AnySequence)Cut)SamplingConstraintTokenConstraint)FixedBucketBatchSizeConstraint)ifnone)FormattableNeMoMultimodalConversationc                   @   s   e Zd ZU dZdZedB ed< dZedB ed< dZ	edB ed< dZ
edB ed< dZeed< dZd	d
 ZdeddfddZdefddZdefddZdddZdedefddZdS )MultimodalSamplingConstraintz
    Sampling strategy that customizes Lhotse samplers to measure sequence lengths as token counts.
    It provides a unified interface for audio and text examples - audio duration is converted to
    an equivalent token count.
    Ntoken_equivalent_duration
batch_sizebatch_tokensquadratic_factorFmeasure_total_lengthc                 C   s   t | j| j| jd| _d S )N)
max_tokensmax_examplesquadratic_length)r   r   r   r   	_internalself r   i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/common/data/lhotse/sampling.py__post_init__>   s
   z*MultimodalSamplingConstraint.__post_init__examplereturnc                 C   s    |  |}||_| j| d S N)measure_length
num_tokensr   add)r   r   r    r   r   r   r!   E   s   
z MultimodalSamplingConstraint.addc                 C   
   | j  S r   )r   exceededr   r   r   r   r#   J      
z%MultimodalSamplingConstraint.exceededc                 C   r"   r   )r   close_to_exceedingr   r   r   r   r%   M   r$   z/MultimodalSamplingConstraint.close_to_exceedingc                 C   s   | j   d S r   )r   resetr   r   r   r   r&   P   s   z"MultimodalSamplingConstraint.resetc              
   C   s   t |tr+t|j| j }| jr)d}|jD ]}|dr$|t	|j
7 }q|| S |S t |trOz| jr8|jW S |jW S  ttfyN } ztd|d }~ww tdt| )Nr   tokenszCouldn't determine the length of a text example; have you provided both prompt_format and tokenizer when instantiating the dataloader?Unsupported example type: )
isinstancer   mathceildurationr   r   supervisions
has_customlenr'   r   total_lengthinput_lengthAttributeErrorAssertionErrorRuntimeErrortype)r   r   audio_len_in_tokenstext_tokensser   r   r   r   S   s,   



z+MultimodalSamplingConstraint.measure_length)r   N)__name__
__module____qualname____doc__r   float__annotations__r   intr   r   r   boolr   r   r   r!   r#   r%   r&   r   r   r   r   r   r      s   
 
r   c                	   @   s   e Zd ZU dZdZeed< dZee	 dB ed< dd Z
edefd	d
Zdedee	e	f e	B fddZddedededefddZdS ) FixedBucketBatchSizeConstraint2Da  
    Sampling strategy that customizes Lhotse samplers to support 2D bucket selection (it also supports 1D).
    It is intended only for audio examples (i.e., Lhotse Cut objects).

    When ``strict_2d`` is set, we only consider sub-buckets for a single bucket that is the best match.
    When set to ``False``, we'll promote an example to buckets with larger 1st dim if they can accommodate the 2nd dim.

    When ``max_ratio`` is set, it discards the examples that exceed a specific output-to-input length ratio.
    ``max_ratio`` must be a list with the same length as the number of buckets.
    ``max_ratio`` is only applied when ``strict_2d`` is set to ``True``.
    T	strict_2dN	max_ratioc                 C   s   t | jd trt| j| _| jd ur<t | jts"J d| j t| jt| jks>J dt| jdt| jd S d S )Nr   z+self.max_ratio must be a list, but we got: zlen(self.max_ratio)=z" != len(self.max_seq_len_buckets)=)r)   max_seq_len_bucketsr   npasarrayrD   r/   r   r   r   r   r   }   s   

z.FixedBucketBatchSizeConstraint2D.__post_init__r   c                 C   s   t | jtjS r   )r)   rE   rF   ndarrayr   r   r   r   bucketing_2d_enabled   s   z5FixedBucketBatchSizeConstraint2D.bucketing_2d_enabledr   c                 C   s   | j r
|jt|fS |jS r   )rI   r,   _measure_tokensr   r   r   r   r   r      s   z/FixedBucketBatchSizeConstraint2D.measure_lengthbucketsexample_lenc                 C   s(   |d u r	|  |}t| j|| j| jdS )N)strictrD   )r   find_smallest_bucketrE   rC   rD   )r   rL   r   rM   r   r   r   select_bucket   s
   
z.FixedBucketBatchSizeConstraint2D.select_bucket)NN)r:   r;   r<   r=   rC   rA   r?   rD   listr>   r   propertyrI   r   tupler   r   r@   rP   r   r   r   r   rB   l   s   
 	 rB   TrL   example_lensrN   rD   r   c                 C   sH  t |ttfrt| |}|t| krdS |S |rt| dddf |d }|| jd kr.dS |}|| jd k rZ| |df | |df krZ|d7 }|| jd k rZ| |df | |df ksCt| ||df |d }||| krodS || }|dur|d |d  || krdS |S tjt|| kdd}	t	|	}
|
s|	|
 r|

 S dS )a[  
    Find the smallest bucket that fits a given example.
    Each bucket and ``example_lens`` are floats (1-D bucketing)
    or tuples of (dim0, dim1, dim2, ...) (N-D bucketing, typically 2-D).
    Assumes the buckets have been sorted ascendingly.
    Returns a tuple of (smallest_bin, bin_idx), or (None, None) if no bucket fits the example.
    Nr      )axis)r)   r>   r@   r   r/   shaperF   allrG   argmaxitem)rL   rT   rN   rD   idx
dim0_begindim0_end
dim1_beginfit_idxdoes_fitmin_fit_idxr   r   r   rO      s2   
&& 
rO   c                   @   sL   e Zd ZU dZdZedB ed< dZeed< de	dee
eef B fdd	ZdS )
*MultimodalFixedBucketBatchSizeConstraint2Dz
    Sampling strategy that customizes Lhotse samplers to support both multimodal sampling and 2D bucket selection.
    It combines the capabilities of :class:`FixedBucketBatchSizeConstraint2D` and :class:`MultimodalSamplingConstraint`
    Nr   Fr   r   r   c                 C   s   t |tr"t|j| j }t|}| jr||fS | jr || S |S t |t	r9| jr0|j
|jfS | jr6|jS |j
S tdt| )Nr(   )r)   r   r*   r+   r,   r   rJ   rI   r   r   r1   output_lengthr0   r4   r5   )r   r   r6   r7   r   r   r   r      s   

z9MultimodalFixedBucketBatchSizeConstraint2D.measure_length)r:   r;   r<   r=   r   r>   r?   r   rA   r   rS   r   r   r   r   r   rb      s
   
 "rb   c                   @   <   e Zd ZdZdedB dedB ddfddZdefdd	ZdS )
DurationFilterz
    Callable, returns ``True`` if a cut's duration is in range [d_min, d_max] and ``False`` otherwise.
    Acts as a pass-through for objects of other type than Cut.
    d_minNd_maxr   c                 C   s    t |d| _t |td| _d S )Ninf)r
   rf   r>   rg   )r   rf   rg   r   r   r   __init__   s   zDurationFilter.__init__c                 C   sr   t |tr| j|j  ko| jkS   S t |tr7|jrdS tdd | D }| j|  ko4| jkS   S dS )NTc                 s   s    | ]}|j V  qd S r   )r,   ).0cr   r   r   	<genexpr>	  s    z*DurationFilter.__call__.<locals>.<genexpr>)	r)   r   rf   r,   rg   r   is_text_onlysum	list_cuts)r   r   tot_durr   r   r   __call__  s   

zDurationFilter.__call__r:   r;   r<   r=   r>   rj   rA   rr   r   r   r   r   re      s    re   c                   @   s@   e Zd ZdZdedB dedB deddfddZdefd	d
ZdS )TokenCountFiltera  
    Callable, returns ``True`` if an example's number of tokens is in range [t_min, t_max] and ``False`` otherwise.

    It is only applicable to data types that derive from class ``Formattable`` and lhotse ``Cut`` objects.
    Acts as a passthrough for Cuts.
    Raises exception if a non-Formattable and non-Cut data are provided.

    The ``measure_total_length`` option allows to select whether we should filter on context_ids length (=False)
    or input_ids length (=True).
    The difference is that for decoder-only models, we collapse input and output into a single sequence,
    so we should measure the example length using input_ids (measure_total_length=True).
    However, for models which have separate inputs and outputs such as encoder-decoder models,
    we want to measure the input lengths only here (measure_total_length=False),
    and enable ``TokenPerTokenFilter`` for additional filtering on the output sequence length.
    t_minNt_maxr   r   c                 C   s@   t |d| _t |td| _|| _| jdkp| jtdk | _d S )Nrh   ri   r   )r
   ru   r>   rv   r   enabled)r   ru   rv   r   r   r   r   rj      s   zTokenCountFilter.__init__c              
   C   s   | j rt|tr
dS t|tsJ d| z| jr|jn|j}W n ttfy8 } z	t	d| d|d }~ww | j
|  koD| jkS   S )NTzTokenCountFilter can only be applied to data examples that derive Formattable class. Formattable objects define properties input_length, output_length, and total_length that allow us to select the right sequence length for filtering. We got: z(Cannot measure token count for example: z -- did you forget to apply prompt formatting? If instantiating Lhotse dataloader, make sure you provided 'prompt_format' option and passed the tokenizer.)rw   r)   r   r   r   r0   r1   r2   r3   r4   ru   rv   )r   r   lengthr9   r   r   r   rr   &  s$   
zTokenCountFilter.__call__)r:   r;   r<   r=   r>   rA   rj   rr   r   r   r   r   rt     s    "rt   c                   @   rd   )
TokenPerSecondFilterz
    Callable, returns ``True`` if a cut's num_tokens (sum of len(tokens) for each supervision)
    is in range [tps_min, tps_max] and ``False`` otherwise.
    Acts as a pass-through for objects of other type than Cut.
    tps_minNtps_maxr   c                 C   d   t |d| _t|trtd}t |td| _||ks%J d|d||dkp.|tdk | _d S )Nrh   ri   ztps_min=z	 tps_max=r   )r
   rz   r)   r   r>   r{   rw   )r   rz   r{   r   r   r   rj   @     
zTokenPerSecondFilter.__init__c                 C   s8   t |tr| js
dS t|}| j|  ko| jkS   S NT)r)   r   rw   _measure_tpsrz   r{   )r   r   tpsr   r   r   rr   H  s   zTokenPerSecondFilter.__call__rs   r   r   r   r   ry   9      ry   c                   @   rd   )
TokenPerTokenFilterz
    Callable, returns ``True`` if a cut's num_tokens (sum of len(tokens) for each supervision)
    is in range [tps_min, tps_max] and ``False`` otherwise.
    Acts as a pass-through for audio examples (Cuts).
    tpt_minNtpt_maxr   c                 C   r|   )Nrh   ri   ztpt_min=z	 tpt_max=r   )r
   r   r)   r   r>   r   rw   )r   r   r   r   r   r   rj   V  r}   zTokenPerTokenFilter.__init__c                 C   sH   t |ts| js
dS |jjd |jjd  }| j|  ko!| jkS   S )NTr   )r)   r   rw   
answer_idsrW   context_idsr   r   )r   r   tptr   r   r   rr   ^  s   zTokenPerTokenFilter.__call__rs   r   r   r   r   r   O  r   r   c                   @   s0   e Zd ZdZdeddfddZdefddZdS )	BucketingFilterz
    Filters out examples that did not fit into any of the buckets.
    Intended mainly for 2D bucketing. This filter is only active when
    the constraint passed to it is of type ``FixedBucketBatchSizeConstraint2D``,
    and is otherwise disabled.
    sampling_constraintr   Nc                 C   s   || _ t| j t| _d S r   )
constraintr)   rB   rw   )r   r   r   r   r   rj   m  s   zBucketingFilter.__init__c                 C   s    | j sdS | j| jj|d uS r~   )rw   r   rP   rE   rK   r   r   r   rr   q  s   zBucketingFilter.__call__)r:   r;   r<   r=   r   rj   rA   rr   r   r   r   r   r   e  s    r   cutc                 C   sJ   t | dr
t| jS dd | jD }t|dksJ dtdd |D S )N	input_idsc                 S   s   g | ]	}t |d r|qS )r'   )hasattrrk   r8   r   r   r   
<listcomp>z  s    z#_measure_tokens.<locals>.<listcomp>r   zCannot measure the number of tokens with untokenized supervisions. Did you forget to provide the tokenizer argument to get_lhotse_dataloader_from_config() method?c                 s   s    | ]}t |jV  qd S r   )r/   r'   r   r   r   r   rm     s    z"_measure_tokens.<locals>.<genexpr>)r   r/   r   r-   ro   )r   supervisions_with_tokensr   r   r   rJ   w  s   

rJ   c                 C   s   t | }|| j S r   )rJ   r,   )r   r    r   r   r   r     s   
r   )TN)%r*   bisectr   dataclassesr   typingr   r   numpyrF   
lhotse.cutr   lhotse.datasetr   r   )lhotse.dataset.sampling.dynamic_bucketingr	   lhotse.utilsr
   1nemo.collections.common.data.lhotse.text_adaptersr   r   r   rB   rH   r>   rA   r@   rO   rb   re   rt   ry   r   r   rJ   r   r   r   r   r   <module>   sH   N.


7)*