o
    9wiC[                     @   s  d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z
ddlZddlZddlZddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZmZ dd	lmZmZ g d
Z ddgZ!ddddddZ"i dddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2i d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTZ#d}dUe$dVe$dWee$ fdXdYZ%	Z	[	[	[d~d\ee d]ej&j'd^e(d_ee$e
j)ej*f dVe$d`e$dae+dbe+dce+ddefdedfZ,	 ddgdhZ-didj Z.eG dkdl dlZ/ddmdnZ0eG dodp dpZ1eG dqdr drZ2ddtduZ3eG dvdw dwZ4dxdy Z5dd{d|Z6dS )z+
Forced Alignment with Whisper
C. Max Bain
    N)	dataclass)IterableOptionalUnionList)Wav2Vec2ForCTCWav2Vec2Processor)SAMPLE_RATE
load_audio)interpolate_nans)AlignedTranscriptionResultSingleSegmentSingleAlignedSegmentSingleWordSegmentSegmentData)PunktSentenceTokenizerPunktParameters)drvsmrmrsprofjazhWAV2VEC2_ASR_BASE_960HVOXPOPULI_ASR_BASE_10K_FRVOXPOPULI_ASR_BASE_10K_DEVOXPOPULI_ASR_BASE_10K_ESVOXPOPULI_ASR_BASE_10K_IT)enfrdeesitz.jonatasgrosman/wav2vec2-large-xlsr-53-japanesez3jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cnnlz+jonatasgrosman/wav2vec2-large-xlsr-53-dutchukz*Yehor/wav2vec2-xls-r-300m-uk-with-small-lmptz0jonatasgrosman/wav2vec2-large-xlsr-53-portuguesearz,jonatasgrosman/wav2vec2-large-xlsr-53-arabiccsz#comodoro/wav2vec2-xls-r-300m-cs-250ruz-jonatasgrosman/wav2vec2-large-xlsr-53-russianplz,jonatasgrosman/wav2vec2-large-xlsr-53-polishhuz/jonatasgrosman/wav2vec2-large-xlsr-53-hungarianfiz-jonatasgrosman/wav2vec2-large-xlsr-53-finnishfaz-jonatasgrosman/wav2vec2-large-xlsr-53-persianelz+jonatasgrosman/wav2vec2-large-xlsr-53-greektrz'mpoyraz/wav2vec2-xls-r-300m-cv7-turkishdaz(saattrupdan/wav2vec2-xls-r-300m-ftspeechhez%imvladikon/wav2vec2-xls-r-300m-hebrewviznguyenvulebinh/wav2vec2-base-vikoz"kresnik/wav2vec2-large-xlsr-koreanurz)kingabzpro/wav2vec2-large-xls-r-300m-Urdutez(anuragshas/wav2vec2-large-xlsr-53-teluguhiz#theainerd/Wav2Vec2-large-xlsr-hindicaz%softcatala/wav2vec2-large-xlsr-catalamlz!gvs/wav2vec2-large-xlsr-malayalamnoz!NbAiLab/nb-wav2vec2-1b-bokmaal-v2nnzNbAiLab/nb-wav2vec2-1b-nynorskskz#comodoro/wav2vec2-xls-r-300m-sk-cv8slz(anton-l/wav2vec2-large-xlsr-53-slovenianhrz%classla/wav2vec2-xls-r-parlaspeech-hrrozgigant/romanian-wav2vec2euz'stefan-it/wav2vec2-large-xlsr-53-basqueglz!ifrz/wav2vec2-large-xlsr-galiciankaz"xsway/wav2vec2-large-xlsr-georgianlvz'jimregan/wav2vec2-large-xlsr-latvian-cvtlz/Khalsuu/filipino-wav2vec2-l-xls-r-300m-officiallanguage_codedevice
model_namec              
   C   s6  |d u r%| t v rt |  }n| tv rt|  }ntd|  d td|  |tjjv rLd}tjj| }|jd|id	|}|
 }dd t|D }nEztj||d	}	tj||d	}W n tyx }
 zt|
 td
 td| dd }
~
ww d}|	|}|	j }dd |	j  D }| ||d}||fS )Nz;There is no default alignment model set for this language (z).                Please find a wav2vec2.0 model finetuned on this language in https://huggingface.co/models, then pass the model name in --align_model [MODEL_NAME]z%No default align-model for language: 
torchaudio	model_dir)	dl_kwargsc                 S   s   i | ]	\}}|  |qS  lower).0icrJ   rJ   O/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/whisperx/alignment.py
<dictcomp>^       z$load_align_model.<locals>.<dictcomp>)	cache_dirziError loading model from huggingface, check https://huggingface.co/models for finetuned wav2vec2.0 modelszThe chosen align_model "z" could not be found in huggingface (https://huggingface.co/models) or torchaudio (https://pytorch.org/audio/stable/pipelines.html#id14)huggingfacec                 S   s   i | ]	\}}|  |qS rJ   rK   )rM   charcoderJ   rJ   rP   rQ   j   rR   )language
dictionarytype)DEFAULT_ALIGN_MODELS_TORCHDEFAULT_ALIGN_MODELS_HFprint
ValueErrorrG   	pipelines__all____dict__	get_modelto
get_labels	enumerater   from_pretrainedr   	Exception	tokenizer	get_vocabitems)rD   rE   rF   rH   pipeline_typebundlealign_modellabelsalign_dictionary	processorealign_metadatarJ   rJ   rP   load_align_modelM   s8   



rr   nearestF
transcriptmodelalign_model_metadataaudiointerpolate_methodreturn_char_alignmentsprint_progresscombined_progressreturnc	           M   
      s  t |st|trt|}t |}t|jdkr|d}|jd t	 }	|d  |d }
|d }t| }i }t
| D ]\}}|r]|d | d }|rRd|d  n|}td	|d
d t|d t|d   }t|d t|d   }|d }|
tvr|d}n|}g g }}t
|D ]=\}}| }|
tvr|dd}||k rq|t|| d krq|  v r|| || q|d || qg }t
|D ]\}}t fdd| D r|| q|| qt }tt|_t|}t||} |||| d||< q<g }!t
| D ]\}}|d }"|d }#|d }|"|#|g dd}$|r6g |$d< t|| d dkrRtd|d  d |!|$ q|"|	krhtd|d  d |!|$ qd|| d }% fdd|%D }&t|"t	 }'t|#t	 }(|dd|'|(f })|)jd dk rt |)jd g|}*t j j!"|)dd|)jd  f})nd}*t # 6 |d kr||)||*d!\}+},n|d"kr||)|j$}+nt%d#| d$t j&|+dd%}+W d   n	1 sw   Y  |+d ' ( }-d}. ) D ]\}}/|d&ks|d'kr|/}.qt*|-|&|.}0t+|0|-|&|.dd(}1|1du r>td|d  d) |!|$ qt,|1|%}2|#|" }3|3|)-d |0-dd  }4g }5d}6t
|D ]d\}}d*\}7}8}9||| d+ v r|2|| d+ .| }:t/|:j0|4 |" d,}7t/|:j1|4 |" d,}8t/|:j2d,}9|5||7|8|9|6d- |
tv r|6d7 }6q]|t|d ks||d  dkr|6d7 }6q]t34|5}5g };d|5d.< t
|| d/ D ]\}<\}=}>|5j5|5j.|=k|5j.|>k@  }?|<|5j5|5j.|=k|5j.|>k@ d.f< ||=|> }@|?d 6 }A|?|?d0 dk }B|Bd 7 }Cg }D|?d1 8 D ]d}6|?j5|?d1 |6k }Ed|Ed0 9 : }Ft|Fdkr8q|E|Ed0 dk }E|Ed 6 }G|Ed 7 }Ht/|Ed2 ; d,}Id3|Fi}Jt<=|Gsc|G|Jd< t<=|Hsm|H|Jd< t<=|Isw|I|Jd2< |D|J q|;|@|A|C|Dd4 |r|?g d5 }?|?j>dd6d7 |??d8}?d9d |?D }?|?|;d d< qt34|;};t@|;d |d:|;d< t@|;d |d:|;d< djd;d<}K|
tv rdj|Kd< |rd;|Kd< |;jAddgd=d>B|K};|;?d8};|!|;7 }!qg }L|!D ]	}|L|d? 7 }Lq|!|Ld@S )AzG
    Align phoneme recognition predictions to known transcription.
       r   rX   rW   rY   d   2      z
Progress: z.2fz%...text |*c                    s   g | ]}|   v qS rJ   )keysrM   rO   model_dictionaryrJ   rP   
<listcomp>       zalign.<locals>.<listcomp>)
clean_char	clean_cdx	clean_wdxsentence_spansstartendN)r   r   r   wordscharsr   r   zFailed to align segment ("zU"): no characters in this segment found in model dictionary, resorting to original...z?"): original start time longer than audio duration, skipping... c                    s   g | ]}  |d qS )getr   r   rJ   rP   r      r   r   i  rG   )lengthsrT   zAlign model of type z not supported.)dimz[pad]z<pad>)
beam_widthz."): backtrack failed, resorting to original...)NNNr      )rU   r   r   scoreword-idxzsentence-idxr   rU   r   r   word)r   r   r   r   )rU   r   r   r   T)inplacerecordsc                 S   s   g | ]}d d |  D qS )c                 S   s   i | ]\}}|d kr||qS r   rJ   )rM   keyvalrJ   rJ   rP   rQ   g      z$align.<locals>.<listcomp>.<dictcomp>)ri   )rM   rU   rJ   rJ   rP   r   g  r   )methodsum)r   r   F)as_indexr   )segmentsword_segments)Ctorch	is_tensor
isinstancestrr
   
from_numpylenshape	unsqueezer	   rd   r\   lstriprstripLANGUAGES_WITHOUT_SPACESsplitrL   replacer   appendanyr   setPUNKT_ABBREVIATIONSabbrev_typesr   listspan_tokenizejoinint	as_tensorrb   r:   
functionalpadinference_modelogitsNotImplementedErrorlog_softmaxcpudetachri   get_trellisbacktrack_beammerge_repeatssizeindexroundr   r   r   pd	DataFramelocminmaxuniquetoliststripmeannpisnanfillnato_dictr   groupbyagg)Mrt   ru   rv   rw   rE   rx   ry   rz   r{   MAX_DURATION
model_lang
model_typetotal_segmentssegment_datasdxsegmentbase_progresspercent_completenum_leadingnum_trailingr   per_wordr   r   cdxrU   char_r   wdxwrdpunkt_paramsentence_splitterr   aligned_segmentst1t2aligned_seg
text_cleantokensf1f2waveform_segmentr   	emissions_emissionblank_idrV   trellispathchar_segmentsdurationratiochar_segments_arrword_idxr   r   r   char_segaligned_subsegmentssdx2sstartsend
curr_charssentence_textsentence_start	end_charssentence_endsentence_words
word_chars	word_text
word_startword_end
word_scoreword_segmentagg_dictr   rJ   r   rP   alignq   sp  













	




$






r  c              
   C   s   |  d}t|}t||f}t| dd |f d|dd df< td |ddd f< td|| d d df< t|d D ].}t||dd f | ||f  ||d df t| | |dd  | ||d dd f< q@|S )Nr   r}   infr   )	r   r   r   zeroscumsumfloatrangemaximumget_wildcard_emission)r   r   r   	num_frame
num_tokensr   trJ   rJ   rP   r     s   
$&r   c                 C   s   d|  krt | k sJ  J t|tjst|n|}|dk}| |jdd  }|  }td||< |	 }t
|||}|S )aB  Processing token emission scores containing wildcards (vectorized version)

    Args:
        frame_emission: Emission probability vector for the current frame
        tokens: List of token indices
        blank_id: ID of the blank token

    Returns:
        tensor: Maximum probability score for each token position
    r   r   )r   -inf)r   r   r   Tensortensorclamplongcloner  r   where)frame_emissionr   r   wildcard_maskregular_scoresmax_valid_scoreresultrJ   rJ   rP   r    s    r  c                   @   s&   e Zd ZU eed< eed< eed< dS )Pointtoken_index
time_indexr   N)__name__
__module____qualname__r   __annotations__r  rJ   rJ   rJ   rP   r+    s   
 r+  c                 C   sH  |  dd |  dd }}t|||||f   g}|dkrz|dks(J ||d |f }t||d  || g|d }| |d |f | }	| |d |d f | }
|d8 }|
|	kra|d8 }|
|	krg|n|  }|t||| |dks"|dkr||d |f   }|t||d | |d8 }|dks~|d d d S )Nr   r}   r   )r   r+  expitemr  r   )r   r   r   r   r  jr   p_stayp_changestayedchangedprobrJ   rJ   rP   	backtrack  s(   r:  c                   @   s"   e Zd ZU ee ed< eed< dS )Pathpointsr   N)r.  r/  r0  r   r+  r1  r  rJ   rJ   rJ   rP   r;    s   
 r;  c                   @   s6   e Zd ZU dZeed< eed< eed< ee ed< dS )	BeamStatezState in beam search.r,  r-  r   r   N)	r.  r/  r0  __doc__r   r1  r  r   r+  rJ   rJ   rJ   rP   r=    s   
 r=     c              
   C   s@  |  dd |  dd }}t||| ||f t|||||f   gd}|g}|r|d jdkrg }	|D ]}
|
j|
j}}|dkrFq8||d |f }t||d  || g|d }| |d |f }|dkrs| |d |d f ntd}t	
|s|
j }|t||d |   |	t||d ||d |dkrt	
|s|
j }|t|d |d |   |	t|d |d ||d q8t|	dd ddd	| }|sn	|r|d jdks4|sd	S |d }|j}|j}|dkr||d |f   }|jt||d | |d8 }|dks|jd	d	d
 S )a  Standard CTC beam search backtracking implementation.

    Args:
        trellis (torch.Tensor): The trellis (or lattice) of shape (T, N), where T is the number of time steps
                                and N is the number of tokens (including the blank token).
        emission (torch.Tensor): The emission probabilities of shape (T, N).
        tokens (List[int]): List of token indices (excluding the blank token).
        blank_id (int, optional): The ID of the blank token. Defaults to 0.
        beam_width (int, optional): The number of top paths to keep during beam search. Defaults to 5.

    Returns:
        List[Point]: the best path
    r   r}   )r,  r-  r   r   r  c                 S   s   | j S Nr   )xrJ   rJ   rP   <lambda>3  s    z backtrack_beam.<locals>.<lambda>T)r   reverseNr   )r   r=  r+  r2  r3  r,  r-  r  r  mathisinfr   copyr   sorted)r   r   r   r   r   TJ
init_statebeams
next_beamsbeamr  r4  r5  r6  
stay_scorechange_scorenew_path	best_beamr9  rJ   rJ   rP   r     sh   
$


"+
r   c                   @   sB   e Zd ZU eed< eed< eed< eed< dd Zedd Z	d	S )
Segmentlabelr   r   r   c                 C   s*   | j  d| jdd| jdd| jddS )Nz	(z4.2fz): [5dz, ))rT  r   r   r   selfrJ   rJ   rP   __repr__N  s   *zSegment.__repr__c                 C   s   | j | j S r@  )r   r   rW  rJ   rJ   rP   lengthQ  s   zSegment.lengthN)
r.  r/  r0  r   r1  r   r  rY  propertyrZ  rJ   rJ   rJ   rP   rS  G  s   
 rS  c                    s   d\}}g }|t  k rc|t  k r0 | j | jkr0|d7 }|t  k r0 | j | jkst fddt||D ||  }|t| | j  | j |d  jd | |}|t  k s|S )Nr   r   r}   c                 3   s    | ]} | j V  qd S r@  rA  )rM   kr   rJ   rP   	<genexpr>[  s    z merge_repeats.<locals>.<genexpr>)r   r,  r   r  r   rS  r-  )r   rt   i1i2r   r   rJ   r^  rP   r   U  s$     $r   r   c                 C   s   g }d\}}|t | k rb|t | ks| | j|krX||krQ| || }ddd |D }tdd |D tdd |D  }|t|| | j| |d  j| |d }|}n|d7 }|t | k s|S )	Nr\  r   c                 S   s   g | ]}|j qS rJ   )rT  rM   segrJ   rJ   rP   r   n  s    zmerge_words.<locals>.<listcomp>c                 s   s    | ]	}|j |j V  qd S r@  )r   rZ  rb  rJ   rJ   rP   r_  o  s    zmerge_words.<locals>.<genexpr>c                 s   s    | ]}|j V  qd S r@  )rZ  rb  rJ   rJ   rP   r_  o  s    r}   )r   rT  r   r   r   rS  r   r   )r   	separatorr   r`  ra  segsr   r   rJ   rJ   rP   merge_wordsg  s   $$rf  )NN)rs   FFF)r   )r   r?  )r   )7r>  rE  dataclassesr   typingr   r   r   r   numpyr   pandasr   r   rG   transformersr   r   whisperx.audior	   r
   whisperx.utilsr   whisperx.typesr   r   r   r   r   nltk.tokenize.punktr   r   r   r   rZ   r[   r   rr   r:   Moduledictndarrayr   boolr  r   r  r+  r:  r;  r=  r   rS  r   rf  rJ   rJ   rJ   rP   <module>   s
   	
 !"&*	

  
!
&
S