o
    SiyR                  /   @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	Z	d dl
m
Z
 d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ e dd Ze e	jde	jddde	jdde	jdddddde	jdde	jddddde	jdde	jdddddde	jdd d!d"d#e	jd$d%d&d'd#e	jd(d)d*d+e	jd,d-d.d/d#e	jd0d1d2d3d#e	jd4dd5d#ded6ee d7ee d8ee d9ed:ed;ee d<ed=ed>efd?d@Z e e	jdAe	jddddde	jde	jddde	jd$dBdCdDd#e	jd,d-d.d/d#e	jdEddFd#e	jd0dGd2dHd#e	jdIddJd#	2	ddAededKed<edLedMedNefdOdPZ!e e	jdAe	jddddde	jde	jddde	jdQe	"dRdSgdRdTdUe	jdVe#dWdXddYe	jdZe#d[d\ddYe	jd]e#d^d_ddYe	jd`e#d^daddYe	jdbe#d[dcddYe	jdde#dedfddYe	jdgdhe	jdddiddjdUe	jdkddld#e	jdmdne	jdddiddodUe	jdpd$eddqdUe	jdrded2dsdUe	jdtduedvdwdxe	jdydzedd{dxe	jd|d,e#d}d~dUe	jddedddUe	jdddd#e	jdedddUe	jdGd0ed2dHdUdAededede#de#de#de#de#de#dee dedee fddZ$e e	jdde	jdddddde	jdde	jdddddde	jdd%ddd#e	jd,d-d.d/d#e	jd0d1d2d3d#e	jddddd6edee d:ed<ed=edefddZ%e e	jde	jddde	jdde	jdddddde	jdde	jddddde	jdde	jdddddde	jdd d!d"d#e	jdzdddd#e	jd0d1d2d3d#ded6ee d7ee d8ee d9eded=efddZ&dS )    N)partial)chain)Path)Optional)tqdm)CutSetRecordingSetSupervisionSet)cli)ParallelExecutorload_manifest_lazy_or_eager)PythonLiteralOptionexactly_one_not_nullc                   C   s   dS )z&Workflows using corpus creation tools.N r   r   r   N/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/bin/modes/workflows.py	workflows   s   r   out_cutsT)
allow_dash)typez-mz--recordings-manifestF)existsdir_okayr   z'Path to an existing recording manifest.)r   helpz-rz--recordings-dir)r   	file_okayzNDirectory with recordings. We will create a RecordingSet for it automatically.z-cz--cuts-manifestz"Path to an existing cuts manifest.z-ez--extensionwavz=Audio file extension to search for. Used with RECORDINGS_DIR.)defaultr   z-nz--model-namebasez3One of Whisper variants (base, medium, large, etc.)z-lz
--languagez2Language spoken in the audio. Inferred by default.)r   z-dz--devicecpuz%Device on which to run the inference.z-jz--jobs   z"Number of jobs for audio scanning.z)--force-nonoverlapping/--keep-overlappingzaIf True, the Whisper segment time-stamps will be processed to make sure they are non-overlapping.recordings_manifestrecordings_dircuts_manifest	extension
model_namelanguagedevicejobsforce_nonoverlappingc
              	   C   s   ddl m}
 t|||sJ d|durt|}n|dur*tj|d| |d}nt| }t| #}t	|
|||||	dt
|dd	D ]	}|j|d
d qGW d   dS 1 s\w   Y  dS )a  
    Use OpenAI Whisper model to annotate either RECORDINGS_MANIFEST, RECORDINGS_DIR, or CUTS_MANIFEST.
    It will perform automatic segmentation, transcription, and language identification.

    RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive. If CUTS_MANIFEST
    is provided, its supervisions will be overwritten with the results of the inference.

    Note: this is an experimental feature of Lhotse, and is not guaranteed to yield
    high quality of data.
    r   )annotate_with_whispersOptions RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive and at least one is required.N*.patternnum_jobs)r$   r#   r%   r'   zAnnotating with WhispertotaldescTflush)lhotser(   r   r   	from_filefrom_dirr   to_eageropen_writerr   lenwrite)r   r   r    r!   r"   r#   r$   r%   r&   r'   annotate_with_whisper_manifestwritercutr   r   r   r(      s4   D
"r(   in_cutsz--bundle-nameWAV2VEC2_ASR_BASE_960HzeOne of torchaudio pretrained 'bundle' variants (see: https://pytorch.org/audio/stable/pipelines.html)z&--normalize-text/--dont-normalize-textzBy default, we'll try to normalize the text by making it uppercase and discarding symbols outside of model's character level vocabulary. If this causes issues, turn the option off and normalize the text yourself.z
--num-jobszNumber of parallel jobs to run.z&--check-language/--dont-check-languagezYIf `False`, warnings about non-existent language tags in supervisions will be suppressed.bundle_namenormalize_textr-   check_languagec                 C   s|   ddl m} t| }t|%}	t||||||d|dt|ddD ]	}
|	j|
dd q"W d	   d	S 1 s7w   Y  d	S )
a  
    Use a pretrained ASR model from torchaudio to force align IN_CUTS (a Lhotse CutSet)
    and write the results to OUT_CUTS.
    It will attach word-level alignment information (start, end, and score) to the
    supervisions in each cut.

    This is based on a tutorial from torchaudio:
    https://pytorch.org/audio/stable/tutorials/forced_alignment_tutorial.html

    In order to use a multilingual alignment model, use `--bundle_name MMS_FA`.
    (based on the multilingual tutorial: https://pytorch.org/audio/main/tutorials/forced_alignment_for_multilingual_data_tutorial.html)

    Note: this is an experimental feature of Lhotse, and is not guaranteed to yield
    high quality of data.
    r   )align_with_torchaudioF)r@   r%   rA   r-   verboserB   Aligningr.   Tr1   N)r3   rC   r   r   r7   r   r8   r9   )r>   r   r@   r%   rA   r-   rB   align_with_torchaudio_cutsr<   r=   r   r   r   rC   {   s&   8	
"rC   z--methodindependentconversationalzThe simulation method to use: independent - each speaker is simulated independently, conversational - the speakers are simulated as a group, using overall silence/overlap statistics.)r   r   r   z--locg        zVThe minimum silence duration between two consecutive utterances from the same speaker.)r   r   r   show_defaultz--scaleg       @zThe scale parameter of the exponential distribution used to sample the silence duration between two consecutive utterances from a speaker.z--same-spk-pauseg      ?z>The mean pause duration between utterances of the same speakerz--diff-spk-pausez@The mean pause duration between utterances of different speakersz--diff-spk-overlapzBThe mean overlap duration between utterances of different speakersz--prob-diff-spk-overlapg      ?zCThe probability of overlap between utterances of different speakersz--fit-to-supervisionsz-f)r   r   zDPath to a supervision set to learn the distributions for simulation.z --reverberate/--dont-reverberatez5If True, the simulated meetings will be reverberated.z--rir-recordingsz--rira  Path to a recording set containing RIRs. If provided, the simulated meetings will be reverberated using the RIRs from this set. A directory containing recording sets can also be provided, in which case each meeting will use a recording set sampled from this directory.z--num-meetingszNNumber of meetings to simulate. Either this of `num_repeats` must be provided.z--num-repeatszNumber of times to repeat each input cut. The resulting cuts will be used as a finite set of utterances to use for simulation. Either this of `num_meetings` must be provided.z--num-speakers-per-meetingz-s2zNumber of speakers per meeting. One or more integers can be provided (comma-separated). In this case, the number of speakers will be sampled uniformly from the provided list, or using the distribution provided in `speaker-count-probs`.)clsr   r   z--speaker-count-probsz-pzA list of probabilities for each speaker count. The length of the list must be equal to the number of elements in `num-speakers-per-meeting`.z--max-duration-per-speakerg      4@z2Maximum duration of a single speaker in a meeting.z--max-utterances-per-speakerz-u   z6Maximum number of utterances per speaker in a meeting.z(--allow-3fold-overlap/--no-3fold-overlapzIf True, the simulated meetings will allow more than 2 speakers to overlap. This is only relevant for the `conversational` method.z--seedi  z Random seed for reproducibility.methodlocscalesame_spk_pausediff_spk_pausediff_spk_overlapprob_diff_spk_overlapfit_to_supervisionsreverberaterir_recordingsc                 K   s  |dkrddl m} |||d}|d n|dkr)ddl m} |||||d}ntd	| |	d
urCtd t|	td}|| t| }td |j	|fi |}|
rtd |r||
 rht|tdg}n
dd |dD }|j|g|R  }n||}td || d
S )aK  
    Simulate meeting-style mixtures using a provided CutSet containing single-channel
    cuts. Different simulation techniques can be selected using the `--method` option.
    Currently, the following methods are supported:

    - independent: each speaker is simulated independently, using the provided cuts as a finite
        set of utterances.

    - conversational: the speakers are simulated as a group, using overall silence/overlap
        statistics.

    The number of speakers per meeting is sampled uniformly from the range provided in
    `--num-speakers-per-meeting`.

    The number of meetings to simulate is controlled by either `--num-meetings` or
    `--num-repeats`. If the former is provided, the same number of meetings will be
    simulated. If the latter is provided, the provided cuts will be repeated `num_repeats`
    times, and the resulting cuts will be used as a finite set of utterances to use for simulation.

    The simulated meetings can be optionally reverberated using the RIRs from a provided
    recording set. If no RIRs are provided, we will use a fast random approximation technique
    to simulate the reverberation. The RIRs can be provided as a single recording set, or as
    a directory containing multiple recording sets. In the latter case, the RIRs will be sampled
    from the provided directory.

    rH   r   )"SpeakerIndependentMeetingSimulator)rO   rP   allow_3fold_overlaprI   )ConversationalMeetingSimulator)rQ   rR   rS   rT   z#Unknown meeting simulation method: Nz=Fitting the meeting simulator to the provided supervisions...)manifest_clszSimulating meetings...z'Reverberating the simulated meetings...c                 S   s   g | ]}t |qS r   r   ).0pr   r   r   
<listcomp>  s    z%simulate_meetings.<locals>.<listcomp>z
*.jsonl.gzz Saving the simulated meetings...)#lhotse.workflows.meeting_simulationrX   poprZ   
ValueErrorprintr   r	   fitsimulateis_filer   globrV   to_file)r>   r   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   kwargsrX   	simulatorrZ   supsrG   
mixed_cutsrirsr   r   r   simulate_meetings   sT    >

rm   z-oz--output-supervisions-manifestzOPath to the output supervisions manifest or a directory where it will be saved.zsilero-vad-16kz8One of activity detector: silero_vad_16k, silero_vad_8k.z--force_downloadz+Forced cache clearing and model downloading)is_flagr   output_supervisions_manifestforce_downloadc                 C   s  ddl }ddlm}m} |d ||d}	|	|}
|
du r/td| dt|	  t	  t
|   }| r?| sLtdt|  t	  |du rS|jnt
|  }| rt
|j}d	D ]}||rv|dt|  }qf|d
| d7 }|| }|j stdt|  t	  tdt|  d tt| }|rtd |
  ntd |
d td|d t|
|d}t||ddd}td|d tt||}td|d |t| tdt|dd dS )a  
    Use activity detection methods (e.g., Silero VAD) to detect and annotate
    the segmentation of Lhotse RecordingSets and save the results in the
    SupervisionSet manifest. The output manifest will be saved in the path
    specified by OUTPUT_SUPERVISIONS_MANIFEST. If OUTPUT_SUPERVISIONS_MANIFEST
    is not provided, the output manifest will be saved in the same directory
    as RECORDINGS_MANIFEST.

    Note: this is an experimental feature and it does not guarantee
    high-quality performance and data annotation.
    r   N)SileroVAD8kSileroVAD16kignore)silero_vad_8ksilero_vad_16kzUnknown activity detector: z. Supported detectors: zRecordings manifest not found: )z.gzz.jsonlz.jsonz.yaml_supervisions_z	.jsonl.gzz5Parent directory for output manifest does not exist: zLoading recordings from z...z"Removing model state from cache...z Checking model state in cache...r   z(Making activity detection processor for )r%   TzRunning VAD)init_fnr-   rD   descriptionz!Running activity detection using zSaving z results ...zResults saved to:
)sep)warnings#lhotse.workflows.activity_detectionrq   rr   filterwarningsgetrb   listsysexitr   
expanduserabsoluter   re   strparentis_dirnameendswithr8   r   r4   rp   r   r   r	   from_segmentsr   from_iterablerg   )r   ro   r#   r%   r&   rp   r{   rq   rr   	detectorsdetector_kls	recs_path	sups_pathr   ext
recordingsdetector_init_fn	processorsupervisionsr   r   r   activity_detection  sn   8





r   z--is-personalized-mosz@Flag to indicate if personalized MOS score is needed or regular.is_personalized_mosc                 C   s   ddl m} t|||sJ d|durt|}n|dur*tj|d| |d}nt| }t|  }	t	|||dt
|dd	D ]	}
|	j|
d
d qDW d   dS 1 sYw   Y  dS )a  
    Use Microsoft DNSMOS P.835 prediction model to annotate either RECORDINGS_MANIFEST, RECORDINGS_DIR, or CUTS_MANIFEST.
    It will predict DNSMOS P.835 score including SIG, NAK, and OVRL.

    See the original repo for more details: https://github.com/microsoft/DNS-Challenge/tree/master/DNSMOS

    RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive. If CUTS_MANIFEST
    is provided, its supervisions will be overwritten with the results of the inference.
    r   )annotate_dnsmosr)   Nr*   r+   )r   z-Annotating with DNSMOS P.835 prediction modelr.   Tr1   )r3   r   r   r   r4   r5   r   r6   r7   r   r8   r9   )r   r   r    r!   r"   r   r&   annotate_dnsmos_r;   r<   r=   r   r   r   r   >  s.   3
"r   )r   T)'r   	functoolsr   	itertoolsr   pathlibr   typingr   clickr   r3   r   r   r	   lhotse.bin.modes.cli_baser
   lhotse.parallelr   lhotse.serializationr   lhotse.utilsr   r   groupr   commandargumentoptionr   intboolr(   rC   Choicefloatrm   r   r   r   r   r   r   <module>   s  
	
 5-
			
6b\