o
    Siy                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
mZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlmZmZ dd	lmZmZmZ d
Z			ddedeee
e f de	e dedeeeeeeef f f f
ddZdS )a  
This recipe supports Chinese TTS corpora: WenetSpeech4TTS.

Paper: https://arxiv.org/abs/2406.05763v3
HuggingFace Dataset: https://huggingface.co/datasets/Wenetspeech4TTS/WenetSpeech4TTS

Download using huggingface-cli:
huggingface-cli login
huggingface-cli download --repo-type dataset --local-dir $DATA_DIR Wenetspeech4TTS/WenetSpeech4TTS

Extract the downloaded data:
for folder in Standard Premium Basic; do
  for file in "$folder"/*.tar.gz; do
    tar -xzvf "$file" -C "$folder"
  done
done
    N)Path)DictOptionalSequenceUnion)tqdm)SupervisionSegmentSupervisionSetfix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)manifests_existread_manifests_if_cached)Pathlikeresumable_downloadsafe_extract)BasicPremiumStandardr      
corpus_dirdataset_parts
output_dirnum_jobsreturnc                 C   s\  t | } |  sJ d|  |dks|d dkrt}nt|tr/|tv s,J d| |g}i }|durGt |}|jddd t||dd	}i }i }i }t| d
 d 7}|D ],}	|	 	 }	|	d ||	d < d|	d vrv|	d ||	d < d|	d v r|	d ||	d < qXW d   n1 sw   Y  i }
i }i }t| d d }|D ]}	|	 	 }	t
|	d |
|	d < qW d   n1 sw   Y  t| d d }|D ]}	|	 	 }	t
|	d ||	d < qW d   n1 sw   Y  t| d d }|D ]}	|	 	 }	t
|	d ||	d < qW d   n	1 sw   Y  |D ]}t||ddr:td| d q$g }g }|dkrH|}|}n|dkrR|}|}n|}|
}t| d| dD ]\}}|dsmJ | |dd  }| std|  qat|}|| |jjd |jdddd }| std|  qat|d }| }|d  	d!d }|d  }W d   n	1 sw   Y  |t||d"|jdd#||||dd$d% qat|}t |}t!||\}}t"|| |dur#|#|d&| d'  |#|d(| d'  ||d)||< q$|S )*a  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'Basic', 'Premium'.
        By default we will prepare all parts.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    zNo such directory: allr   zUnsupported dataset part: NT)parentsexist_okwenetspeech4tts)r   r   prefix	filelistszBasic_filelist.lstr   r   r   DNSMOS_P808ScoreszBasic_DNSMOS.lstzPremium_DNSMOS.lstzStandard_DNSMOS.lst)partr   r    zWenetSpeech4TTS subset: z already prepared - skipping.r   zPreparing WenetSpeech4TTS )descz../   zNo such file: txtswavsz.wavz.txtr	g        Chinese)	timestampdns_mos)idrecording_idstartdurationchannellanguagetextcustomwenetspeech4tts_supervisions_z	.jsonl.gzwenetspeech4tts_recordings_)
recordingssupervisions)$r   is_dirWENETSPEECH4TTS
isinstancestrmkdirr   openstripsplitfloatr   logginginfor   items
startswithis_filewarningr   	from_fileappendparentnamereplace	readlinesr   r0   getr   from_recordingsr	   from_segmentsr
   r   to_file)r   r   r   r   	manifestsbasic_wav_scp_dictpremium_wav_scp_dictstandard_wav_scp_dictflinebasic_dns_mos_dictpremium_dns_mos_dictstandard_dns_mos_dictr#   r7   r8   wav_scp_dictdns_mos_dictwav_namewav_path	recordingtxt_pathlinesr3   r+    rb   R/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/wenetspeech4tts.pyprepare_wenetspeech4tts,   s   

	











rd   )r   Nr   ) __doc__rB   reshutiltarfilepathlibr   typingr   r   r   r   r   lhotser   r	   r
   r   lhotse.audior   r   lhotse.recipes.utilsr   r   lhotse.utilsr   r   r   r:   r<   intrd   rb   rb   rb   rc   <module>   s6    	