o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlZ	ddl
m  mZ ddlmZ dZdZdZd	Zd
d Zdd ZG dd dejjZdS )zDementiaBank dataset.    )absolute_import)division)print_functionNz
@article{boller2005dementiabank,
  title={Dementiabank database guide},
  author={Boller, Francois and Becker, James},
  journal={University of Pittsburgh},
  year={2005}
}
a  
DementiaBank is a medical domain task. It contains 117 people diagnosed with
Alzheimer Disease, and 93 healthy people, reading a description of an image, and
the task is to classify these groups.
This release contains only the audio part of this dataset, without the text
features.
z$dementia/English/Pitt/Control/cookiez%dementia/English/Pitt/Dementia/cookiec                 C   s   t | |krtdjt | |dtdd | D }td| dkr)td| g }d}| D ]\}}|}||7 }||t|| t|| f q/|d	 d
 |d	 d |f|d	< |S )a  Computes boundary indices for each of the splits in split_probs.

  Args:
    split_probs: List of (split_name, prob), e.g. [('train', 0.6), ('dev', 0.2),
      ('test', 0.2)]
    n_items: Number of items we want to split.

  Returns:
    The item indices of boundaries between different splits. For the above
    example and n_items=100, these will be
    [('train', 0, 60), ('dev', 60, 80), ('test', 80, 100)].
  z]Not enough items for the splits. There are {splits} splits while there are only {items} items)splitsitemsc                 s       | ]\}}|V  qd S N ).0namepr	   r	   Z/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/audio/dementiabank.py	<genexpr>D       z,_compute_split_boundaries.<locals>.<genexpr>   g:0yE>z"Probs should sum up to 1. probs={}g        r   )len
ValueErrorformatsumabsappendint)split_probsn_itemstotal_probssplit_boundariessum_pr   r   prevr	   r	   r   _compute_split_boundaries3   s$   "r   c                 C   s   t tdd | D }tj|}|| t|t|}i }|D ]\}}}	t||	D ]}
||||
 < q+q!t	
t}| D ]\}}|| }|| | q<|S )a  Split items to train/dev/test, so all items in group go into same split.

  Each group contains all the samples from the same speaker ID. The samples are
  splitted so that all each speaker belongs to exactly one split.

  Args:
    items_and_groups: Sequence of (item_id, group_id) pairs.
    split_probs: List of (split_name, prob), e.g. [('train', 0.6), ('dev', 0.2),
      ('test', 0.2)]
    split_number: Generated splits should change with split_number.

  Returns:
    Dictionary that looks like {split name -> set(ids)}.
  c                 s   r   r   r	   )r
   item_idgroup_idr	   r	   r   r   d   r   z-_get_inter_splits_by_group.<locals>.<genexpr>)sortedsetnprandomRandomStateshuffler   r   rangecollectionsdefaultdictlistr   )items_and_groupsr   split_numbergroupsrngr   group_id_to_split
split_namei_starti_endisplit_to_idsr    r!   splitr	   r	   r   _get_inter_splits_by_groupU   s   

r7   c                   @   sF   e Zd ZdZejdZe	d
eeZdd Zdd Zdd	 Zd
S )Dementiabankz>The DementiaBank dataset for voice classification of Dementia.z1.0.0z
  manual dir should contain 2 folders with mp3 files:

  * {}
  * {}

  Which were downloaded from https://media.talkbank.org/dementia/English/Pitt/
  This dataset requires registration for downloading.
  c              	   C   sD   t jj| tt jt jjdddt jjddgdtj	ddd	t
d
S )Nmp3iD  )file_formatsample_ratedementiacontrol)namesaudiolabel
speaker_id)r@   rA   zhttps://dementia.talkbank.org/)builderdescriptionfeaturessupervised_keyshomepagecitation)tfdscoreDatasetInfo_DESCRIPTIONrE   FeaturesDictAudio
ClassLabeltfstring	_CITATION)selfr	   r	   r   _info   s   zDementiabank._infoc                 C   s.  t j|jt}t j|jt}g }tjj	d
|D ]}t j|\}}|d\}}|d|d}	||	|f qtjj	d
|D ]}t j|\}}|d\}}|d|d}	||	|f qEg d}
t||
d}tjjtjjd|d	 id
tjjtjjd|d id
tjjtjjd|d id
gS )zReturns SplitGenerators.z{}/*.mp3-r=   r?   r<   ))traingffffff?)
validationg?)testg?r   examplesrV   )r   
gen_kwargsrW   rX   )ospathjoin
manual_dir_CONTROL_FOLDER_DEMENTIA_FOLDERrP   iogfileglobr   r6   r   r7   rI   rJ   SplitGeneratorSplitTRAIN
VALIDATIONTEST)rS   
dl_managercontrol_folderdementia_folderexamples_and_speaker_idsfname_
short_namerB   exampler   r   r	   r	   r   _split_generators   s<   


zDementiabank._split_generatorsc                 c   s.    |D ]}t j|d \}}||fV  qdS )zYields examples.r@   N)r[   r\   r6   )rS   rY   rp   rn   keyr	   r	   r   _generate_examples   s
   zDementiabank._generate_examplesN)__name__
__module____qualname____doc__rI   rJ   VersionVERSIONtextwrapdedentr   r_   r`   MANUAL_DOWNLOAD_INSTRUCTIONSrT   rq   rs   r	   r	   r	   r   r8   v   s    
#r8   )rw   
__future__r   r   r   r)   r[   rz   numpyr$   tensorflow.compat.v2compatv2rP   tensorflow_datasets.public_api
public_apirI   rR   rL   r_   r`   r   r7   rJ   GeneratorBasedBuilderr8   r	   r	   r	   r   <module>   s"   	"!