o
    }oi'w                     @   s   d dl Z d dlZd dlmZmZmZmZmZ d dlZ	d dl
Z
d dlm  mZ d dlmZmZ g dZG dd deZG dd deZG d	d
 d
eZdS )    N)AnyListOptionalTupleUnion)DatasetIterableDataset)ConcatDatasetConcatMapDatasetCodeSwitchedDatasetc                       s   e Zd ZdZ								ddee d	ed
edededee	 de
e dedef fddZdd Zdd Zdd Zedd Zedd Zedd Z  ZS ) r	   a  
    A dataset that accepts as argument multiple datasets and then samples from them based on the specified
    sampling technique.

    Args:
        datasets (list): A list of datasets to sample from.
        shuffle (bool): Whether to shuffle individual datasets. Only works with non-iterable datasets.
            Defaults to True.
        sampling_technique (str): Sampling technique to choose which dataset to draw a sample from.
            Defaults to 'temperature'. Currently supports 'temperature', 'random' and 'round-robin'.
        sampling_temperature (int): Temperature value for sampling. Only used when sampling_technique = 'temperature'.
            Defaults to 5.
        sampling_scale: Gives you the ability to upsample / downsample the dataset. Defaults to 1.
        sampling_probabilities (list): Probability values for sampling. Only used when sampling_technique = 'random'.
        seed: Optional value to seed the numpy RNG.
        global_rank (int): Worker rank, used for partitioning map style datasets. Defaults to 0.
        world_size (int): Total number of processes, used for partitioning map style datasets. Defaults to 1.
    Ttemperature      Nr   datasetsshufflesampling_techniquesampling_temperaturesampling_scalesampling_probabilitiesseedglobal_rank
world_sizec
                    s  t    g d}
|| _d gt| | _|| _|| _|	| _i | _|| _	|dkr6t
j| _|| jd< || jd< n1|dkrVt
j| _|rB|n
dt| gt| | jd< || jd< n|dkr_t
j| _ntd|
 d	d
| _t|d
 trud| _nd| _t|D ]5\}}t|t}|r| jdkr|s| jdkrtd| jdkr|  jt||	 7  _q||  jt|7  _q|| j	dkrt| j| j	 | _td| d| j  d S d S )N)r   randomround-robinr   r   r   r   pr   z1Currently we only support sampling techniques in .r   iterablemapzIAll datasets in ConcatDataset must be of the same kind (Iterable or Map).z	applying z  sampling scale, concat ds len: )super__init__r   len	iterablesr   r   r   sampling_kwargsr   r	   temperature_generatorindex_generatorrandom_generatorround_robin_generator
ValueErrorlength
isinstancer   kind	enumerateintlogginginfo)selfr   r   r   r   r   r   r   r   r   supported_sampling_techniquesidxdataset
isiterable	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/data/dataset.pyr   /   sH   





zConcatDataset.__init__c                 C   s:   t |tr	| S tt|}| jrtj| t|S N)	r)   r   __iter__nparanger    r   r   iter)r/   r2   indicesr6   r6   r7   get_iterablei   s   
zConcatDataset.get_iterablec                 c   s   t  }|d u r| j}d}d}n|j}|j}tt|| j|}| jdkrktt| jD ]>}t| j| | j	 | j
 }|t| j| | j	  }| j
| j	d krVt| j| }t|| ||}t | j| || j|< q,t| jD ]\}}	| |	}
|
| j|< qpd}| j| jfi | j}||k r|d7 }zt|}W n
 ty   Y d S w zt| j| }| jdkr| j| | }|V  W n ty   | | j| | j|< |d8 }Y nw ||k sd S d S Nr   r   r   )pt_dataget_worker_infor(   idnum_workersr    ranger*   r   r   r   Subsetr+   r>   r!   r$   r"   nextStopIteration)r/   worker_infomax_elementswidwnumr1   	start_idxend_idxr=   r2   r   nind_genindvalr6   r6   r7   r9   r   sN   



zConcatDataset.__iter__c                 C      | j S r8   r(   r/   r6   r6   r7   __len__      zConcatDataset.__len__c           
      k   s    | d}|std| dd }tj|}g }t| }| D ]	}|t| q t|t| }t	|d| }|t| }	 |j
t||d}	|	V  qD)Nr   z?Temperature generator expects a 'temperature' keyword argument.r   r   Tr   )getr'   r:   r   RandomStater    appendarraysumpowerchoicer;   )
r   kwargstempr   np_rnglengthsnumr2   r   rP   r6   r6   r7   r#      s"   
z#ConcatDataset.temperature_generatorc                 k   s"    t | }	 t|D ]}|V  q
qr8   )r    rD   )r   r_   rc   ir6   r6   r7   r&      s   z#ConcatDataset.round_robin_generatorc                 k   sj    | d}|std| dd }tj|}t| }t||kr&td	 |jt||d}|V  q')Nr   zKRandom generator expects a 'p' keyowrd argument for sampling probabilities.r   zELength of probabilities list must be equal to the number of datasets.TrW   )rX   r'   r:   r   rY   r    r^   r;   )r   r_   r   r   ra   rc   rP   r6   r6   r7   r%      s   
zConcatDataset.random_generator)Tr   r   r   NNr   r   )__name__
__module____qualname____doc__r   r   boolstrr,   floatr   r   r>   r9   rU   staticmethodr#   r&   r%   __classcell__r6   r6   r4   r7   r	      sL    	
:	)

r	   c                       s`   e Zd ZdZ				ddee dededeee	  d	ee f
 fd
dZ
dd Zdd Z  ZS )r
   a  
    A dataset that accepts as argument multiple datasets and then samples from them based on the specified
    sampling technique.

    Args:
        datasets (list): A list of datasets to sample from.
        sampling_technique (str): Sampling technique to choose which dataset to draw a sample from.
            Defaults to 'temperature'. Currently supports 'temperature', 'random' and 'round-robin'.
        sampling_temperature (int): Temperature value for sampling. Only used when sampling_technique = 'temperature'.
            Defaults to 5.
        sampling_probabilities (list): Probability values for sampling. Only used when sampling_technique = 'random'.
        seed: Optional value to seed the numpy RNG.
    r   r   Nr   r   r   r   r   c                    s  t    | _dd  jD  _| _| _| _tj	| _
g  _dgt j } fdd jD } jdkrt jt j }t|t j }	|	D ]7}
||
 }||
 | } j|
|f ||
  d7  < ||
 t||
 krd||
<  j
t j|
 ||
< qQd S  jdkr|d u st|t jkrtdt j d	|d urt| d
 t j}n! jdkrtdd  jD }t|d j }ntd| |t| }tt j}t }t|t jk rG j
j||d}
 j|
 }||
 }||
 | } j|
|f ||
  d7  < ||
 t|kr< j
t|||
< d||
< ||
 t|t jk sd S d S )Nc                 S      g | ]}t |qS r6   r    .0xr6   r6   r7   
<listcomp>       z-ConcatMapDataset.__init__.<locals>.<listcomp>r   c                    s   g | ]
} j t|qS r6   )ra   permutationr    rp   rT   r6   r7   rs          r   r   r   zNeed z probabilities; got Noner   c                 S   rn   r6   ro   rp   r6   r6   r7   rs     rt   z'Couldn't interpret sampling technique: )ar   )r   r   r   rb   r   r   r   r:   r   rY   ra   r=   r    maxr;   rZ   ru   r'   r[   r]   r\   setr^   add)r/   r   r   r   r   r   dataset_positionsshuffled_indicestotal_lengthdataset_ids
dataset_idpositionindexr   choicesexhausted_datasetsr2   r4   rT   r7   r      sj   






zConcatMapDataset.__init__c                 C   s
   t | jS r8   )r    r=   rT   r6   r6   r7   rU   !  s   
zConcatMapDataset.__len__c                 C   s   | j | \}}| j| | S r8   )r=   r   )r/   r1   r   dataset_indexr6   r6   r7   __getitem__$  s   zConcatMapDataset.__getitem__)r   r   NN)re   rf   rg   rh   r   r   rj   r,   r   rk   r   rU   r   rm   r6   r6   r4   r7   r
      s&    
Gr
   c                (       s   e Zd ZdZ															
		
		d.dee deee  dede	de	dedede	de	de	dee
eee f  dee	 de	de	dededede	ded f& fd d!Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Z  ZS )/r   a	  
    A dataset that accepts as argument multiple sub-datasets (usually from different languages, but that's not required) and then
    samples from them in order to create synthetic code-switched samples of up to N different sub-datasets

    Args:
        datasets (list): A list of datasets
        lang_probs (list): A list of probabilities (which must sum to 1) corresponding to the sampling probability for each dataset
        shuffle (bool): Whether to shuffle individual datasets. Only works with non-iterable datasets.
            Defaults to True.
        min_duration (int): the minimum duration (secs) of each synthetic code-switched sample. Will draw randomly until this is hit.
            Defaults to 4
        max_duration (int): the maximum duration (secs) of each synthetic code-switched sample.
            Defaults to 20
        min_monolingual (float): this percentage of the dataset will be original monolingual samples
            Defaults to 0.3 - means 30%
        db_norm (float): will normalise the composite CS sample to this DB level
            Defaults to -25.0
        pause_start (int): inserts silence equal to this value (msecs) at the start of each CS sample
            Defaults to 0
        pause_join (int): inserts silence equal to this value (msecs) between all language changes in the CS sample
            Defaults to 0
        pause_end (int): terminates all CS samples with silence equal to this value (msecs)
            Defaults to 0
        sampling_scales (list or float): gives you the ability to upsample/downsample each individual dataset
        seed: Optional value to seed the numpy RNG.
        global_rank (int): Worker rank, used for partitioning map style datasets. Defaults to 0.
        world_size (int): Total number of processes, used for partitioning map style datasets. Defaults to 1.
        pure_random (bool): If true, then always draw random sample from lang_probs. If false, you only draw from those datasets
            which you haven't sampled from yet for the composite sample
        force_monochannel (bool): If true, then all output audio will be mono-channel
        infinity_mode (bool): If true, then the dataset iterable will generate an infinite amount of samples
        sample_rate (int): the sample rate of all audio being sent to this Dataset
        augmentor (AudioAugmentor): The any perturbations you wish to have applied on the CS samples
    NT      333333?      9r   r   F>  r   
lang_probsr   min_durationmax_durationmin_monolingualdb_normpause_start
pause_join	pause_endsampling_scalesr   r   r   pure_randomforce_monochannelinfinity_modesample_rate	augmentorAudioAugmentorc                    s  t    t|dkrtd|_ttt|_tj_	dd jD _
dd jD _|_|_|_|_|_|_|	_|
_|_|_|_|_|_|_|_d_ d u rsfddjD _ntjt ksJ d fddjD _ttj _ d urt!tsfd	djD _"n(d urt!trttjkrd
d t#jD _"n	dd jD _"t$jD ]8\}}t!|t%}|rdj|<  jt&t|j"|  7  _qdj|<  jt&t|| j"|  7  _q|d urtj'(| t)jjd  dr4jjd  j*_*d S t)jjd  dryt!jjd  jtrytjjd  jdkryt)jjd  jd dryjjd  jd j*_*d S t)jjd  drt!jjd  jtrtjjd  jdkrt)jjd  jd drt!jjd  jd jtrtjjd  jd jdkrt)jjd  jd jd drjjd  jd jd j*_*d S t+d)Nr   zGCodeSwitchedDataset must receive a non-zero length datasets dict objectc                 S      i | ]}|d qS r8   r6   rq   kr6   r6   r7   
<dictcomp>k      z0CodeSwitchedDataset.__init__.<locals>.<dictcomp>c                 S   r   r8   r6   r   r6   r6   r7   r   l  r   c                    s   i | ]
}|d t  j qS )g      ?)r    langsrq   lrT   r6   r7   r   ~  rv   zKSize mismatch between languages and respective probs in CodeSwitchedDatasetc                    s   i | ]}| | qS r6   r6   r   )r   r6   r7   r         c                    s   i | ]}| qS r6   r6   r   )r   r6   r7   r     r   c                 S   s   i | ]\}}||qS r6   r6   )rq   r   vr6   r6   r7   r     r   c                 S   r   )r   r6   r   r6   r6   r7   r     r   r   r   
collate_fnr   zJCodeSwitchedDataset could not locate a valid dataset collate_fn to bind to),r   r   r    r'   r   listrD   r   rz   	langs_setlang_iterables	lang_kindr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r(   	prob_dictr:   r[   valuesr   r)   r   zipr+   r   r,   r   r   hasattrr   RuntimeError)r/   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   langr2   r3   r4   )r   r   r/   r7   r   M  s   



"
&
" $zCodeSwitchedDataset.__init__c                 C   sD   | j | }t|tr| S tt|}| jrtj| t	|S r8   )
r   r)   r   r9   r:   r;   r    r   r   r<   )r/   r   r2   r=   r6   r6   r7   get_iterable_by_lang  s   

z(CodeSwitchedDataset.get_iterable_by_langc              	   C   s  t g }d}g }g }tj | jk}|| jk r| jr|r0tt	|dks0tt	|t| j
kr;tjj| j
| jd}n(ttt| jjt| jt	| }||  }tjjt| jt	| |d}| |^}}	}
}}|  dkrvqt|| j }|| | jkrq|j|
jkr||
j}|jdkr| jr|jdd}||7 }|| ||    t j!||
gdd}|rn|| jk stt	dd |D }t|dkrt"d|d dk}|rtj#t$| j%| j d	 |d j&d f|d j'd
}ntj#t$| j%| j d	 f|d j'd
}t(|D ]m\}}|st)|}|d| j*d  t+d|d jddd   }|t|d k rt|r]tj|tj#t$| j,| j d	 |d j&d f|j'd
dd}ntj|tj#t$| j,| j d	 f|j'd
dd}tj||dd}q|rtj|tj#t$| j-| j d	 |d j&d f|j'd
dd}ntj|tj#t$| j-| j d	 f|j'd
dd}| j.d urdd l/}ddl0m1} t23 }|j4||| jdd |5d |j6|| jd}| j.7| |j8}t j9||j'|jdt j9t||	jd: |t j9t||jd: fS )Nr   rW   r   )dimc                 S   s   g | ]}|j qS r6   )ndim)rq   sr6   r6   r7   rs     r   z>CodeSwitchedDataset.build_single_CS_sample.<locals>.<listcomp>zxMixture of audios with different number of channels in CodeSwitchedDataset. All sources must be same number of channels.g     @@)shapedtypeg      $@g      4@g{Gz?   )axisg      ?)AudioSegmentWAV)format)	target_sr)r   device)r   );torch
LongTensorr:   r   randr   r   r   r    rz   r   r^   r   r[   r   r   r   rX   r   r\   get_sample_from_languagecount_nonzeroitemr   r   r   tor   r   meanrZ   cpunumpycatr   zerosr,   r   r   r   r+   
trim_zerosr   maximumr   r   r   	soundfile(nemo.collections.asr.parts.preprocessingr   ioBytesIOwriteseek	from_fileperturbsamplestensorlong)r/   	comp_textcreated_sample_duration_seccreated_sample_langscreated_sample_audios	pure_monolang_idr   audio	audio_lenlabels
labels_len_sample_durationsample_channelsmultichannel
comp_audior1   wavwav_normsfr   mbcomp_audio_asr6   r6   r7   build_single_CS_sample  s   


&&

- 
.  	 

z*CodeSwitchedDataset.build_single_CS_samplec           	      C   s   t  }|d u r| j}d}d}n|j}|j}tt|| j|}| jD ]M}| j| dkrgt| j	| | j
 | j }|t| j	| | j
  }| j| j
d krSt| j	| }t|| ||}t | j	| || j	|< | || j|< q"|S r?   )r@   rA   r(   rB   rC   r    rD   r   r   r   r   r   rE   r   r   )	r/   rH   rI   rJ   rK   r   rL   rM   r=   r6   r6   r7   prep_underlying_datasetsc  s$   
z,CodeSwitchedDataset.prep_underlying_datasetsc                 C   sX   	 zt | j| }| j| dkr| j| | }|W S  ty*   | || j|< Y nw q)NTr   )rF   r   r   r   rG   r   )r/   r   rQ   r6   r6   r7   r     s   z,CodeSwitchedDataset.get_sample_from_languagec                 c   sL    |   }| jr	 |  V  q	d}||k r$|  V  |d7 }||k sd S d S )NTr   r   )r   r   r   )r/   rI   rN   r6   r6   r7   r9     s   

zCodeSwitchedDataset.__iter__c                 C   rR   r8   rS   rT   r6   r6   r7   rU     rV   zCodeSwitchedDataset.__len__)NTr   r   r   r   r   r   r   NNr   r   FTFr   N)re   rf   rg   rh   r   r   r   rk   ri   r,   r   r   r   r   r   r   r9   rU   rm   r6   r6   r4   r7   r   )  s    &
	
m 
r   )r   r-   typingr   r   r   r   r   r   r:   r   torch.utils.datautilsdatar@   r   r   __all__r	   r
   r   r6   r6   r6   r7   <module>   s    1^