o
    ϯiz                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	Z	d dl
Zd dlZd dlZd dlm  mZ d dlmZ d dlZd dlZd dlmZ d dlmZmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%Z%d dl&Z&d dl'Z'd dl(m)Z) d d	l*m+Z+m,Z, d d
l*m-Z-m.Z. d dl/m0Z1 d dl2m3Z3 d dl2m4Z4 d dl2m5Z5 zd dl6mZ7 W n e8y   dZ7Y nw zd dl9Z9W n e8y   dZ9Y nw e3:dZ;e4:dZ<e5:dZ=dKddZ>ej?@e$eAjBdZCejDeCddZEdd ZFdd ZGdd ZHdd  ZIG d!d" d"eZJeG d#d$ d$ZKdLd%d&ZLd'd( ZMd)d* ZNd+ZOd,ZPd-ZQd.ZRdMd/d0ZSd1d2 ZTdNd4d5ZUd6d7 ZVd8d9 ZWd:d; ZX	<	=	>	?		dOd@dAZYg dBfdCdDZZdEdF Z[dGdH Z\dIdJ Z]dS )P    N)	dataclass)Image)Dataset
DataLoaderSubsetRandomSampler)DistributedSampler)partial)Path)suppress)get_tar_path_from_dataset_namedataset_split)load_pload_class_label)tokenize)BertTokenizer)RobertaTokenizer)BartTokenizerzbert-base-uncasedzroberta-basezfacebook/bart-baserobertaM   c                 C   s   |dkrt | dS |dkr!t| dd|dd}dd	 | D S |d
kr7t| dd|dd}dd	 | D S |dkrMt| dd|dd}dd	 | D S dS )ztokenizer for different models
    tmodel is default to roberta as it is the best model for our task
    max_length is default to 77 from the OpenAI CLIP parameters
    We assume text to be a single string, but it can also be a list of strings
    transformerr   bert
max_lengthTpt)padding
truncationr   return_tensorsc                 S      i | ]
\}}|| d qS r   squeeze.0kv r$   L/home/ubuntu/.local/lib/python3.10/site-packages/laion_clap/training/data.py
<dictcomp>A       ztokenizer.<locals>.<dictcomp>r   c                 S   r   r   r   r    r$   r$   r%   r&   K   r'   bartc                 S   r   r   r   r    r$   r$   r%   r&   U   r'   N)clip_tokenizerr   bert_tokenizeritemsroberta_tokenizerbart_tokenizer)texttmodelr   resultr$   r$   r%   	tokenizer0   s<   r1   zaudioset_textmap.npyTallow_picklec                 C   s   | d  dS )N    @float32)astypexr$   r$   r%   int16_to_float32]   s   r9   c                 C   s   t j| ddd} | d dS )N            ?)a_mina_maxr4   int16)npclipr6   r7   r$   r$   r%   float32_to_int16a   s   rA   c                 C   s   | d  tjS )Nr4   )typetorchr5   r7   r$   r$   r%   int16_to_float32_torchf   s   rD   c                 C   s    t j| ddd} | d t jS )Nr:   r;   )minmaxr4   )rC   clamprB   r>   r7   r$   r$   r%   float32_to_int16_torchj   s   rH   c                   @   sF   e Zd ZdddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dS )
ToyDatasetFc                 C   s   |d | _ |d | _t|d| _tj|dd| _t| jd | _	| j d | _
|| _|s1|   n%g | _t| j	D ]}| jd | }t|d	krO| j| q9t| j| _	td
| j	  td| j
  dS )a  Toy Dataset for testing the audioset input with text labels
        Parameters
        ----------
            index_path: str
                the link to the h5 file of each audio
            idc: str
                the link to the npy file, the number of samples in each class
            config: dict
                the audio cfg file
           eval_model (bool): to indicate if the dataset is a testing dataset
        	audio_cfgtext_cfgrTr2   
audio_name	class_numtargetr   ztotal dataset size: %dzclass num: %dN)rJ   rK   h5pyFilefpr?   loadipclen
total_sizeclasses_num	eval_modegenerate_queuequeuerangesumappendlogginginfo)self
index_pathrT   configrX   irO   r$   r$   r%   __init__q   s$   


zToyDataset.__init__c                 C   s>   t |}td|d }tj||d  |d | gdd}|S )Nr      )axis)rU   randomrandintr?   concatenate)r`   r8   	frame_num	shift_len
new_sampler$   r$   r%   time_shifting   s   "zToyDataset.time_shiftingc                    s   g  _ t j  jk r-g t j}t|   j  fdd|D 7  _ t j  jk s j d  j  _ td j dd    d S )Nc              	      s0   g | ]} j | td t j | d  qS )r   re   )rT   rg   rh   rU   )r!   dr`   r$   r%   
<listcomp>   s    $z-ToyDataset.generate_queue.<locals>.<listcomp>zqueue regenerated:%s)	rZ   rU   rV   r[   rW   rg   shuffler^   r_   )r`   	class_setr$   ro   r%   rY      s   

zToyDataset.generate_queuec                 C   s2   | j d }tdt|| d }||||  S )N	crop_sizer   re   )rJ   rg   rh   rU   )r`   r8   rt   crop_posr$   r$   r%   crop_wav   s   
zToyDataset.crop_wavc                 C   sD   t t|dk }dd|d d  d |d  }t|d }|S )Nr   zThe sounds of z, z and )_AUDIOSET_MAPr?   wherejoinr1   )r`   rO   events
event_textr.   r$   r$   r%   prompt_text   s   "zToyDataset.prompt_textc              	   C   s^  | j | }| jd |  }| jd |  dd}| jd | }| jd | d}| |}t|d}t|d	 | d
| j	d  }	W d
   n1 sQw   Y  t
|	| j	d kscJ dtt|	| j	d
d
d
d
d
f }
tj|
|
 |
 |
 gdd  }
tddg}|dkrd|
dd
d
d
d
d
f< ||||	||||
d}|S )a  Load waveform, text, and target of an audio clip

        Parameters
        ----------
            index: int
                the index number
        Return
        ------
            output: dict {
                "hdf5_path": str,
                "index_in_hdf5": int,
                "audio_name": str,
                "waveform": list (audio_length,),
                "target": list (class_num, ),
                "text": torch.tensor (context_length,)
            }
                the output dictionary
        rM   	hdf5_pathz../workspacez+/home/la/kechen/Research/ke_zsasp/workspaceindex_in_hdf5rO   r5   rL   waveformNclip_sampleszThe sample length is not matchr   dimTFg        re   )r~   r   rM   r   class_labelr.   longer
mel_fusion)rZ   rR   decodereplacer6   r}   rP   rQ   r9   rJ   rU   get_melrC   
from_numpycatclonecpunumpyrg   choice)r`   indexs_indexrM   r~   r_idxrO   r.   fr   mel_specr   	data_dictr$   r$   r%   __getitem__   sF   

$*
zToyDataset.__getitem__c                 C   s   | j S )N)rV   ro   r$   r$   r%   __len__   s   zToyDataset.__len__NF)
__name__
__module____qualname__rd   rm   rY   rv   r}   r   r   r$   r$   r$   r%   rI   p   s    
 KrI   c                   @   s   e Zd ZU eed< eed< dS )DataInfo
dataloadersamplerN)r   r   r   r   __annotations__r   r$   r$   r$   r%   r      s   
 r   c                    s~  t | trg }| D ]}|t|||dd  q	n|sDt D ]}|| dv r* nqt| D ]}|| dv r: nq/d| d| d}tt| }tj	
| }|d uritt|d t fdd|D }nAtj	|d	}	tj	|d
}
tj	|	rtt|	d t fdd|D }ntj	|
rtt|
d }ntd|  dt|}t | trt|t| fS ||fS )Nsizefilepath_is_localr   /z./json_files/z/sizes.jsonrL   c              
      s(   g | ]}t  tj|d d qS )z.tar -z.tar)intospathbasenamer   r!   shardsizesr$   r%   rp     s    z$get_dataset_size.<locals>.<listcomp>
sizes.jsonr   c                    s    g | ]}t  tj| qS r$   )r   r   r   r   r   r   r$   r%   rp   "  s     z#Cannot find sizes file for dataset z&. Please specify the path to the file.)
isinstancelistr]   get_dataset_sizer   keyssplitbraceexpandr   r   dirnamejsonrS   openr\   rz   existsastliteral_evalread	ExceptionrU   )shardsr   r   	size_listsnshards_listdir_pathrV   sizes_filenamelen_filename
num_shardsr$   r   r%   r     sT   



r   c                 C   sP   dt jd< d\}}| D ]\}}|d7 }|t|7 }t|t|ks#J q||fS )N0	WDS_EPOCH)r   r   re   )r   environrU   )r   
n_elements	n_batchesimagestextsr$   r$   r%   count_samples7  s   
r   c                 C   s   t dt|  d dS )zUCall in an exception handler to ignore any exception, isssue a warning, and continue.zHandling webdataset error (z). Ignoring.T)r^   warningrepr)exnr$   r$   r%   log_and_continueA  s   r   i  i  i  i  c                    s    fddt t D }i }i }|s&tjdrtd t| d d} t| ddd}t	
|}W d   n1 s=w   Y  tt|| }	t| |	}
|
D ]}|| ||< || ||< qTt| |	dd	 | D |fS )
z*
    Sample a proportion of the data.
    c                    s2   i | ]}t j | d  t j | d qS )re   r   )r   r   r   r!   rc   inputsr$   r%   r&   Q  s    $zsample_prop.<locals>.<dictcomp>r   rL   zUTF-8)encodingNc                 S   s   g | ]\}}t j||qS r$   )r   r   rz   r    r$   r$   r%   rp   f  s    zsample_prop.<locals>.<listcomp>)r[   rU   r   r   r   removewgetdownloadr   r   rS   r   rg   sampler   r\   valuesr+   )sizefiler   
proportionr   file_path_dictsampled_filepath_dictsampled_size_dictr   	load_dictLsubkeysr"   r$   r   r%   sample_propM  s.   



r   c                 C   sh   t jj|d |d |d |d dddd d|d |d |d	 d
| j}|| }t jjd d|}|jS )Nsample_ratewindow_sizehop_sizeTreflectg       @mel_binsfminfmax)r   n_fft
win_length
hop_lengthcenterpad_modepowernormonesidedn_melsf_minf_max)top_db)
torchaudio
transformsMelSpectrogramtodeviceAmplitudeToDBT)
audio_datarJ   mel_tfmelr$   r$   r%   r   k  s&   r   Fc                 C   s"  |rt ntj}| s t||kr|dkrtdg}n|dkrt||}	||d  d }
|	jd }|
|krMtj|	|	|	|	gdd}|| d< td	g}nt	t
td||
 d d
}t|d dkridg|d< t|d dkrvdg|d< tj|d }tj|d }tj|d }|	|||
 ddf }|	|||
 ddf }|	|||
 ddf }tjj|
|d gd|	d d }tj||||gdd}|| d< tdg}ntd| dt|| }tjd|d }||||  }n~t||k rX|dkr"t|t| }||}tj|d|t| fddd}n6|dkr7tj|d|t| fddd}n!|dkrPt|t| }||d d| }ntd| d|dkrqt||}	tj|	|	|	|	gdd}|| d< td	g}W d   n	1 sw   Y  || d< || d< | S )a)  
    Calculate and add audio features to sample.
    Sample: a dict containing all the data of current sample.
    audio_data: a tensor of shape (T) containing audio data.
    max_len: the maximum length of audio data.
    data_truncating: the method of truncating data.
    data_filling: the method of filling data.
    audio_cfg: a dict containing audio configuration. Comes from model_cfg['audio_cfg'].
    require_grad: whether to require gradient for audio data.
        This is useful when we want to apply gradient-based classifier-guidance.
    
rand_truncTfusionr   re   r   r   r   F      Nr   )sizezdata_truncating  not implemented	repeatpadconstant)modevaluepadrepeatzdata_filling r   r   )r
   rC   no_gradrU   tensorr   shapestackr?   array_splitr   r[   rg   r   torchvisionr   ResizeNotImplementedErrorrh   r   r
  Fr	  )r   r   max_lendata_truncatingdata_fillingrJ   require_gradgrad_fnr   r   chunk_framestotal_framesr   ranges	idx_front
idx_middleidx_backmel_chunk_frontmel_chunk_middlemel_chunk_back
mel_shrinkoverflowidxn_repeatr$   r$   r%   get_audio_features  s   




"







Xr&  c                 C   s   |d u s|dkr| d }|S |dkr$d|   v r| d }|S | d }|S |dkrFd|   v r@| d d u r:| d }|S | d }|S | d }|S td| d)	Nnoner.   alltext_augment_allaugment_onlytext_augment_t5ztext_augment_selection r  )r   r  )json_dict_rawtext_augment_selectionr   r$   r$   r%   select_text  s*   	
r.  c
                 C   s&  | | \}
}t t|
d }
t| |
||||} | |= | | }t||	}|| d< t|tr>t|d tr>t|dkr>t	|}|| d< t
||d| d< |durltt|}d|tt| |d < t| | d	< | |= | d
 dd d | | d< | d
 dd d | | d< || d< | S )z7
    Preprocess a single sample for wdsdataloader.
    r   	full_textre   raw_text)r/   r.   Ntagr   __key__r   rw   .rM   	text_nameaudio_orig_sr)rD   rH   r&  r.  r   r   strrU   rg   r   r1   r?   zerosin1dr   rC   r  floatr   )r   	audio_exttext_extr  rJ   r/   class_index_dictr  r  r-  r   orig_srr,  r   class_labelsr$   r$   r%   preprocess_single  s(   
$

r?  c                    s^  t |j}|j}|j}|j}	|j}
g }| D ]}|t||||||
||||	
 qi }|d 	 D ]z t
|d   trki | < |d   	 D ]!}g }tt|D ]}|||   |  qRt||  |< qHq1t
|d   tjrt fdd|D | < q1t
|d   tjrtt fdd|D | < q1 fdd|D | < q1~|S )z^
    Collate function for wdsdataloader.
    batch: a list of dict, each dict is a sample
    r   c                       g | ]}|  qS r$   r$   r!   r   r"   r$   r%   rp   l      z.collate_fn_with_preprocess.<locals>.<listcomp>c                    r@  r$   r$   rA  rB  r$   r%   rp   n  rC  c                    r@  r$   r$   rA  rB  r$   r%   rp   p  rC  )copydeepcopyr<  r  r  r-  r/   r]   r?  r   r   dictr[   rU   rC   vstackTensorr  r?   ndarrayr  )batchr:  r;  r  rJ   argsr<  r  r  r-  r/   data_preprocessedr   
batch_dictkktmprc   r$   rB  r%   collate_fn_with_preprocessH  s:   $rP  flacr    S r;   c	                 C   sZ  |du r| j dur| j  }|r| jn| j}	|	dusJ |dur"|}
ntjtj|	d d}
|dkr@t|
|	||d\}}}	}n	t|	||d\}}|s\|rW| j	}|sVt
dn| jp[d}t|	g}|sg| jr|tjtt| jdtjtjtjtd	tjttt| jd
g n|tjtjtd	g |ttj |tj | j!|p| j t"t#||||d | dd tj$| }|s| jr| j!| j% }t&'|| }t(d| j)}t&'|| }|| }|| }|*|}nt&'|| j! }i }| j+rd|d< |r| j,r| j,}nt(d| j!| j) }nd}tj-|fdd| j)d|d|}||_.||_/t0|dS )z*
    Get a dataset for wdsdataloader.
    Nr   r   r;   )r   r   zCurrently, number of dataset samples must be specified for training dataset. Please specify via `--train-num-samples` if no dataset length info present.)bufsizeinitialseed)handler)rS  rT  rngrJ   )r:  r;  r  rJ   rK  )r   collation_fnre   
forkservermultiprocessing_contextr  FT)
batch_sizerr   num_workers
pin_memoryprefetch_factor)1
remotedata
train_dataval_datar   r   rz   r   r   r   train_num_samplesRuntimeErrorval_num_sampleswdsSimpleShardListparallel_evalextend
detshuffle_SHARD_SHUFFLE_SIZE_SHARD_SHUFFLE_INITIALrU  split_by_nodesplit_by_workertarfile_to_samplesr   rr   _SAMPLE_SHUFFLE_SIZE_SAMPLE_SHUFFLE_INITIALrg   Randomr]   r   torch_audiobatchedr[  r   rP  DataPipeline
world_sizemathceilrF   workers
with_epochhorovodr^  	WebLoadernum_batchesnum_samplesr   )rK  	model_cfgis_trainr:  r;  r  r   r   r   input_shardssizefilepathr}  r   _pipelinedatasetglobal_batch_sizer|  r\  num_worker_batcheskwargsr^  r   r$   r$   r%   get_wds_datasetu  s   









r  )__url__r2  r   r.   r0  rM   r4  r5  c                    s4   t t  ksJ d fddtt  D S )zQ
    Return a dictionary of the batch, with keys as the names of the fields.
    z4batch must have same number of keys as keys argumentc                    s   i | ]	}|  | qS r$   r$   r   rJ  r   r$   r%   r&   )  s    z'wds_batch_list2dict.<locals>.<dictcomp>)rU   r[   r  r$   r  r%   wds_batch_list2dict  s   r  c           
      C   s   |r| j n| j}|r| jn| j}|r|sJ | }t||||d}t|}| jr0|r0t|ddnd }t|| j	d| j
||d}	||	_t|	|	_t|	|S )N)rX   F)rr   )r[  rr   r\  r   	drop_last)r`  ra  	train_ipcval_ipcrI   rU   distributedr   r   r[  rx  r}  r|  r   )
rK  r~  r  ra   ipc_pathrX   r  r}  r   r   r$   r$   r%   get_toy_dataset-  s.   

r  c                 C   s&   | dkrt S | dkrtS td|  )N
webdatasettoyzUnsupported dataset type: )r  r  
ValueError)dataset_typer$   r$   r%   get_dataset_fnI  s
   r  c                    s  i }t | j| _| jd u rg d| _| jdkrat| j| j| j | j| j	| j
d| _| j
d u r1g | _
| jd u r9g | _| j
| j   rK fdd| jD n| j}|| _t|g d| j d| j	d d| _| jrpt| j| |dd	|d
< | jrt| j| |dd	|d< |S )N)trainunbalanced_trainbalanced_trainr  )islocalr   dataset_pathfull_datasetc                    s   g | ]}| vr|qS r$   r$   )r!   r   excluded_eval_datasetsr$   r%   rp   i  s    zget_data.<locals>.<listcomp>)validtestevalre   T)r  r  Fval)r   class_label_pathr<  datasetinfosr  r   datasetnamesr_  dataset_proportiondatasetpathfull_train_datasetr`  exclude_eval_datasetval_dataset_namesra  r  )rK  r~  datar  r$   r  r%   get_dataR  sP   



	
	

r  )r   r   )NT)Tr   )rQ  r   rR  r;   NN)^r   r   r^   rv  r   rg   rP   dataclassesr   r   r   r?   pandaspdrC   torch.nn.functionalnn
functionalr  torchvision.datasetsdatasetstorchvision.transformsr  r  re  PILr   torch.utils.datar   r   r   torch.utils.data.distributedr   	functoolsr   pathlibr	   r   tempfilerD  
contextlibr
   clap_module.utilsr   r   r   r   clap_moduler   r)   transformersr   r   r   horovod.torchhvdImportErrorr   from_pretrainedr*   r,   r-   r1   r   rz   __file__parent_AUDIOSET_MAP_PATHrS   rx   r9   rA   rD   rH   rI   r   r   r   r   rj  rk  ro  rp  r   r   r&  r.  r?  rP  r  r  r  r  r  r$   r$   r$   r%   <module>   s    



) 
5


'k31
 $
	