o
    Si                     @   s  d dl Z d dlmZmZmZmZ d dlZd dlmZm	Z	m
Z
mZmZ d dlmZ d dlmZmZ 				dd	ee
ef d
edededee dee	ddf fddZ	dde
d
edededee f
ddZ	dded
edededee f
ddZdee fddZdS )    N)	GeneratorListOptionalUnion)CutSetMonoCutRecordingSetSupervisionSegmentadd_durations)trim_supervisions_to_recordings)fastcopyis_module_availablebasecpuFmanifest
model_namedeviceforce_nonoverlappingdownload_rootreturnc                 k   sr    t ds	J dt| trt| ||||fi |E dH  dS t| tr5t| ||||fi |E dH  dS td)a  
    Use OpenAI Whisper model to annotate either RECORDINGS_MANIFEST, RECORDINGS_DIR, or CUTS_MANIFEST.
    It will perform automatic segmentation, transcription, and language identification. If
    the first argument is a CutSet, it will overwrite the supervisions with the results of the inference.

    Note: this is an experimental feature of Lhotse, and is not guaranteed to yield
    high quality of data.

    See the original repo for more details: https://github.com/openai/whisper

    :param manifest: a ``RecordingSet`` or ``CutSet`` object.
    :param language: specify the language if known upfront, otherwise it will be auto-detected.
    :param model_name: one of available Whisper variants (base, medium, large, etc.).
    :param device: Where to run the inference (cpu, cuda, etc.).
    :param force_nonoverlapping: if True, the Whisper segment time-stamps will be processed to make
        sure they are non-overlapping.
    :param download_root: if specified, the model will be downloaded to this directory. Otherwise,
        it will be downloaded to the default location specfied by whisper.
    :param decode_options: additional options to pass to the ``whisper.transcribe`` function.
    :return: a generator of cuts (use ``CutSet.open_writer()`` to write them).
    whisperzThis function expects OpenAI Whisper to be installed. You can install it via 'pip install git+https://github.com/openai/whisper.git' (see https://github.com/openai/whisper for details).Nz;The ``manifest`` must be either a RecordingSet or a CutSet.)r   
isinstancer   _annotate_recordingsr   _annotate_cuts
ValueError)r   r   r   r   r   decode_options r   L/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/workflows/whisper.pyannotate_with_whisper   s2   


	r   
recordingsc                 +   s    ddl }|j|||d}| D ]U  jdkr%td j d j d qt d	 
d}|jd||d	| fd
dd D }	  }
|	ra|rUt|	n|	}	tt |	dd|
_|
V  qdS )zE
    Helper function that annotates a RecordingSet with Whisper.
    r   Nr   r      zSkipping recording '
'. It has 4 channels, but we currently only support mono input.>  modelaudioc                    st   g | ]6}|d  |d  dkrt  j d|d d jt|d ddt|d  |d  dd	|d
  d dqS endstartr   -id06d   )ndigitsr$   )sampling_ratetextlanguage)r,   recording_idr*   durationr1   r2   )r	   r,   roundr
   strip.0segment	recordingresultr   r   
<listcomp>]   s    
z(_annotate_recordings.<locals>.<listcomp>segmentsF)r   supervisionsverboser   )r   
load_modelnum_channelsloggingwarningr,   torch
from_numpyresample
load_audiosqueeze
transcribeto_cut_postprocess_timestampslistr   r?   )r   r   r   r   r   r   r   r&   r'   r?   cutr   r:   r   r   D   s6   

r   cutsc                 +   s    ddl }|j|||d}| D ]I  jdkr%td j d j d qt d	 
d}|jd||d	| fd
dd D }	t |rQt|	n|	d}
|
V  qdS )z?
    Helper function that annotates a CutSet with Whisper.
    r   Nr    r!   zSkipping cut 'r"   r#   r$   r%   c                    s|   g | ]:}|d  |d  dkrt  j d|d d jt|d ddtt|d   j|d  dd	|d
  d dqS r(   )r	   r,   r3   r5   r
   minr4   r6   r7   rN   r<   r   r   r=      s     
z"_annotate_cuts.<locals>.<listcomp>r>   )r?   r   )r   rA   rB   rC   rD   r,   rE   rF   rG   rH   rI   rJ   r   rL   )rO   r   r   r   r   r   r   r&   r'   r?   new_cutr   rQ   r   r   z   s,   

r   r?   c                 C   sv   ddl m} t| dd d} t| dk r| S g }|d| D ]\}}|j|jkr.|j|jd}|| q|| |S )z
    Whisper tends to have a lot of overlapping segments due to inaccurate end timestamps.
    Under a strong assumption that the input speech is non-overlapping, we can fix that
    by always truncating to the start timestamp of the next segment.
    r   )sliding_windowc                 S   s   | j S N)r*   )sr   r   r   <lambda>   s    z)_postprocess_timestamps.<locals>.<lambda>)key   )r)   )cytoolzrS   sortedlenr)   r*   trimappend)r?   rS   outcurnxtr   r   r   rL      s   
rL   )r   r   FNrT   )rC   typingr   r   r   r   rE   lhotser   r   r   r	   r
   	lhotse.qar   lhotse.utilsr   r   strboolr   r   r   rL   r   r   r   r   <module>   s`    

>
;
2