o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddl	m
  mZ ddlmZ dZdZdZd	Zd
ZdZg dZdd Zdd ZG dd dejjZdS )zCREMA-D dataset.    )absolute_import)division)print_functionNab  
@article{cao2014crema,
  title={{CREMA-D}: Crowd-sourced emotional multimodal actors dataset},
  author={Cao, Houwei and Cooper, David G and Keutmann, Michael K and Gur, Ruben C and Nenkova, Ani and Verma, Ragini},
  journal={IEEE transactions on affective computing},
  volume={5},
  number={4},
  pages={377--390},
  year={2014},
  publisher={IEEE}
}
a  
CREMA-D is an audio-visual data set for emotion recognition. The data set
consists of facial and vocal emotional expressions in sentences spoken in a
range of basic emotional states (happy, sad, anger, fear, disgust, and neutral).
7,442 clips of 91 actors with diverse ethnic backgrounds were collected.
This release contains only the audio stream from the original audio-visual
recording.
The samples are splitted between train, validation and testing so that samples 
from each speaker belongs to exactly one split.
z1https://github.com/CheyneyComputerScience/CREMA-DzEhttps://storage.googleapis.com/tfds-data/manual_checksums/crema_d.txtzihttps://raw.githubusercontent.com/CheyneyComputerScience/CREMA-D/master/processedResults/summaryTable.csvzYhttps://media.githubusercontent.com/media/CheyneyComputerScience/CREMA-D/master/AudioWAV/)NEUHAPSADANGFEADISc                 C   s   t | |krtdjt | |dtdd | D }td| dkr)td| g }d}| D ]\}}|}||7 }||t|| t|| f q/|d	 d
 |d	 d |f|d	< |S )a  Computes boundary indices for each of the splits in split_probs.

  Args:
    split_probs: List of (split_name, prob), e.g. [('train', 0.6), ('dev', 0.2),
      ('test', 0.2)]
    n_items: Number of items we want to split.

  Returns:
    The item indices of boundaries between different splits. For the above
    example and n_items=100, these will be
    [('train', 0, 60), ('dev', 60, 80), ('test', 80, 100)].
  z]Not enough items for the splits. There are {splits} splits while there are only {items} items)splitsitemsc                 s       | ]\}}|V  qd S N ).0namepr   r   U/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/audio/crema_d.py	<genexpr>N       z,_compute_split_boundaries.<locals>.<genexpr>   g:0yE>z"Probs should sum up to 1. probs={}g        r   )len
ValueErrorformatsumabsappendint)split_probsn_itemstotal_probssplit_boundariessum_pr   r   prevr   r   r   _compute_split_boundaries=   s$   "r%   c                 C   s   t tdd | D }tj|}|| t|t|}i }|D ]\}}}	t||	D ]}
||||
 < q+q!t	
t}| D ]\}}|| }|| | q<|S )aD  Split items to train/dev/test, so all items in group go into same split.

  Each group contains all the samples from the same speaker ID. The samples are
  splitted between train, validation and testing so that samples from each
  speaker belongs to exactly one split.

  Args:
    items_and_groups: Sequence of (item_id, group_id) pairs.
    split_probs: List of (split_name, prob), e.g. [('train', 0.6), ('dev', 0.2),
      ('test', 0.2)]
    split_number: Generated splits should change with split_number.

  Returns:
    Dictionary that looks like {split name -> set(ids)}.
  c                 s   r   r   r   )r   item_idgroup_idr   r   r   r   o   r   z-_get_inter_splits_by_group.<locals>.<genexpr>)sortedsetnprandomRandomStateshuffler%   r   rangecollectionsdefaultdictadd)items_and_groupsr   split_numbergroupsrngr"   group_id_to_split
split_namei_starti_endisplit_to_idsr&   r'   splitr   r   r   _get_inter_splits_by_group_   s   

r=   c                   @   s4   e Zd ZdZejdZdd Zdd Z	dd Z
d	S )
CremaDz:The audio part of CREMA-D dataset for emotion recognition.z1.0.0c              	   C   sD   t jj| tt jt jjdddt jjtt	dt
jddttdS )Nwavi>  )file_formatsample_rate)namesaudiolabel
speaker_id)rD   rE   )builderdescriptionfeaturessupervised_keyshomepagecitation)tfdscoreDatasetInfo_DESCRIPTIONrI   FeaturesDictAudio
ClassLabellistLABELStfstring	_HOMEPAGE	_CITATION)selfr   r   r   _info   s   zCremaD._infoc                 C   sb  | t |dti}g }g }g }tg d}tjj|d >}|D ]3}|	 
dd dd}	|	r9|	|v r:q$tjtd|	 }
||
 ||	
dd	  ||	 q$W d
   n1 sbw   Y  |d|i}tt|d |}tt||}g d}t||d	}tjjtjjd|d idtjjtjjd|d idtjjtjjd|d idgS )zReturns SplitGenerators.summary_table)FileName1040_ITH_SAD_XX1006_TIE_NEU_XX1013_WSI_DIS_XX1017_IWW_FEA_XX,r   " z%s.wav_r   N	all_files))traingffffff?)
validationg?)testg?file_paths_and_namesrg   )r   
gen_kwargsrh   ri   )download_checksums_CHECKSUMS_URLdownloadSUMMARY_TABLE_URLr)   rV   iogfileGFilestripr<   replaceospathjoinWAV_DATA_URLr   rT   zipr=   rM   rN   SplitGeneratorSplitTRAIN
VALIDATIONTEST)rZ   
dl_managercsv_pathall_wav_filesspeaker_ids	wav_names	bad_filesflinewav_namewav_pathwav_and_speaker_idsr   r   r   r   r   _split_generators   sF   

	


zCremaD._split_generatorsc                 c   sF    |D ]\}}| dd }| dd }|||d}||fV  qdS )zYields examples.re   r      rC   N)r<   )rZ   rj   	file_path	file_namerF   rE   exampler   r   r   _generate_examples   s   zCremaD._generate_examplesN)__name__
__module____qualname____doc__rM   rN   VersionVERSIONr[   r   r   r   r   r   r   r>      s    'r>   )r   
__future__r   r   r   r/   ru   numpyr*   tensorflow.compat.v2compatv2rV   tensorflow_datasets.public_api
public_apirM   rY   rP   rX   rm   ro   rx   rU   r%   r=   rN   GeneratorBasedBuilderr>   r   r   r   r   <module>   s&   ""