o
    }oi]'                     @   s   d dl Z d dlmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZmZ d dlmZ d dlmZ G dd de
Zd	ed
eeef dedefddZdS )    N)IteratorListOptionalUnion)DistributedSampler)AudioToBPEDatasetAudioToCharDataset)ASRModel)loggingc                   @   s   e Zd Z				ddededee ded	ed
edee deddfddZdefddZ	dee
j fddZdeee  fddZdefddZdS )SemiSortBatchSamplerTFN*   global_rank
world_size	durations
batch_sizebatch_shuffle	drop_lastrandomization_factorseedreturnc	           	      C   s   |du rd}t d n	t d| d |dk r!td| d|| _|| _tj|tjd	| _|| _	|| _
|| _d
| _|| _|| _|  | _t d dS )a   
        Semi Sorted Batching, as proposed in _SSB ("Speed up training with variable
        length inputs by efficient batching strategies.", Zhenhao Ge et al. (2021).).

        The Semi Sorted Batch Sampler (SSB) samples the indices by their duration
        with the addition of pseudo noise that is sampled from the uniform
        distribution \mathbb{U}\left[ -delta * r, delta * r \right], where delta is
        defined as the difference between the maximum and minimum duration and r is
        the randomization factor that controls the strength of the noise (when r = 0,
        there will be a strong sorting). The heuristic value of the r according to
        the experiments from paper is 0.2.

        The torch calls the set_epoch method from the distributed data loader sampler
        at the end of each epoch to shuffle the samples according to the seed and
        epoch number. So the SSB is passed to the dataloader as a sampler with the
        dataloader's batch size options and the batch_sampler option set to None to
        disable automatical batching. In this case, the sampler has become an iterator
        that returns a list of batch indices.

        Args:
            global_rank: Rank among all GPUs.
            world_size: The number of GPUs used.
            durations: Sample durations parsed from `dataset.manifest_processor`.
            batch_size: Micro batch size or batch size per singe gpu.
            batch_shuffle: Batch sort before each epoch.
            drop_last: Drop the last batch if the number of samples is less than batch
                size. Defaults to False.
            randomization_factor: The strength of noise that will be added to the sample
                duration. If no value is passed, the value 0.2 will be used.
            seed: Seed for batch shuffleling. Defaults to 42.

        Raises:
            ValueError: Wrong randomization factor value.
            RuntimeError: Unexpected behavior.

        .. SSB_:
            https://www.isca-archive.org/interspeech_2021/ge21_interspeech.pdf
        Ng?zHRandomization factor not found in config, default value 0.1 will be set.zA randomization factor z will be used.g        z4Randomization factor must be non-negative but found .)dtyper   z&Semi Sorted Batch Sampler will be used)r
   info
ValueErrorranknum_replicasnparrayfloat32r   shufflemicro_batch_sizer   epochr   r   _calculate_local_num_batcheslocal_num_batches)	selfr   r   r   r   r   r   r   r    r%   a/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/asr_batching.py__init__   s"   1
zSemiSortBatchSampler.__init__c                 C   sZ   t | j}| jr||| j 8 }t|| j }| j|| j  | j }||7 }|| j }|S N)lenr   r   r    mathceilr   )r$   init_num_samplesglobal_num_batchesnum_batches_padr#   r%   r%   r&   r"   f   s   

z1SemiSortBatchSampler._calculate_local_num_batchesc              	   C   s  t | j}t | j}|| | j d }t jj| |t| jd}t | j| }d}| j	rRt|| j
 }t jjt||dd}t ||}tdt| d tt|| j
 }|dkrktd| d	 g S | j|| j  | j }	|| jk rtd| d
| j d|	 d |	dkrt jjdt||	| j
 d}
t j|||
 fdd}|| jd | j }t| j
t|| j
}t j||dd}t|| jkrtdt| d| j d|S )N   )lowhighsizer   F)replacezDrop last is set to True, so z samples will be dropped.zThe number of all batches is zr, than dataloader will be empty. To avoid this try to decrease batch size or world size or set drop_last to False.z', which is less than the world size of z. SSB Sampler will add zA batches. To avoid this try to decrease batch size or world size.)axiszNumber of calculated indices z4 is not equal to calculated number of local batches r   )r   maxr   minr   randomuniformr)   argsortr   r    choicedeleter
   warningr*   r+   r   randintconcatenater   rangesplitr#   RuntimeError)r$   max_durationmin_durationboundnoisesorted_indicestailexcluder-   pad_batches_numbatch_indeces_padlocal_indices	size_masklocal_batchesr%   r%   r&   _make_batchesy   s\   


z"SemiSortBatchSampler._make_batchesc                 c   sn    |   }| jr t }|| j| j d  tj| j|d}nt	d| j}t
|D ]	\}}|| V  q+d S )N   )	generatorr   )rN   r   torch	Generatormanual_seedr   r!   randpermr#   arange	enumerate)r$   rM   gindices_indexr%   r%   r&   __iter__   s   zSemiSortBatchSampler.__iter__c                 C   s   | j S r(   )r#   )r$   r%   r%   r&   __len__   s   zSemiSortBatchSampler.__len__)TFNr   )__name__
__module____qualname__intr   boolr   floatr'   r"   r   r   rN   r   r[   r\   r%   r%   r%   r&   r      s8    	

JBr   modeldatasetconfigr   c                 C   s   t |tst |tstdt| ddd |jjjD }t| j	| j
||d |dd|dd	|d
d|ddd}|S )ag  
    Instantiates a Semi Sorted (Batch) Sampler.

    Args:
        model: ASR Model.
        dataset: Dataset which allow iterate over all object and parse durations.
        config: Train, Vaidation or Test dataset config.

    Raises:
        ValueError: Wrong dataset type.

    Returns:
        SemiSortBatchSampler: Semi Sorted Batch Sampler class.
    z\Only AudioToCharDataset or AudioToBPEDataset supported with semi sorted batching, but found r   c                 S   s   g | ]}|j qS r%   )duration).0sampler%   r%   r&   
<listcomp>   s    z1get_semi_sorted_batch_sampler.<locals>.<listcomp>r   r   Tr   Fr   Nsemi_sort_sampler_seedr   )r   r   r   r   r   r   r   r   )
isinstancer   r   r   typemanifest_processor
collectiondatar   r   r   get)rc   rd   re   r   samplerr%   r%   r&   get_semi_sorted_batch_sampler   s$   



rr   )r*   typingr   r   r   r   numpyr   rQ   torch.utils.data.distributedr   'nemo.collections.asr.data.audio_to_textr   r   %nemo.collections.asr.models.asr_modelr	   
nemo.utilsr
   r   dictrr   r%   r%   r%   r&   <module>   s&    2
