o
    i/                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlZd dl	Z
d dlZd dlm  mZ ddlmZmZmZ ddlmZ erGddlmZ dejd	efd
dZejddde
jfddZejdddde
jfddZd7ddZdejde
jfddZ eG dd dZ!ddddd d!ed"ee d#ejd$ed%ed&e"dee! fd'd(Z#d)ee! d*e$d+e$fd,d-Z%d.d/d0d1ee& dd d!ed#ejd$ed2e$d3e$d4e"fd5d6Z'dS )8    N)	dataclass)TYPE_CHECKINGList   )
HOP_LENGTHSAMPLE_RATETOKENS_PER_SECOND)	Tokenizer)Whisperxfilter_widthc              	   C   s   |d }| j d |kr| S | j }dkr| ddddf } |dkr'|d dks+J dd}tj| |d |d ddfdd} | jr_zdd	lm} || |}W n ttj	fy^   t
d
 Y nw |du rt| d|d d d|d f }|dkr||d }|S )zMApply a median filter of width `filter_width` along the last dimension of `x`   Nr   r   z&`filter_width` should be an odd numberreflect)mode)median_filter_cudaz}Failed to launch Triton kernels, likely due to missing CUDA toolkit; falling back to a slower median kernel implementation....r   r   )shapendimFpadis_cuda
triton_opsr   RuntimeError
subprocessCalledProcessErrorwarningswarnunfoldsort)r   r   	pad_widthr   resultr    r"   `/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sense_voice/whisper_lib/timing.pymedian_filter   s,    "r$   T)nopythontracec                 C   s   | j d d }| j d d }d| dd d f< d| d d df< g }|dks(|dkrj||d |d f | ||f dkrD|d8 }|d8 }n| ||f dkrQ|d8 }n| ||f dkr^|d8 }ntd|dks(|dks(t|}|d d dd d f jS )Nr   r   r   zUnexpected trace[i, j]r   )r   append
ValueErrornparrayT)r&   ijr!   r"   r"   r#   	backtrace7   s$   



r.   )r%   parallelc                 C   s$  | j \}}tj|d |d ftjdtj }tj|d |d ftjd }d|d< td|d D ]]}td|d D ]S}||d |d f }||d |f }|||d f }	||k rc||	k rc|d}
}n||k rq||	k rq|d}
}n|	d}
}| |d |d f |
 |||f< ||||f< q9q0t|S )Nr   dtyper   r   r   )r   r)   onesfloat32infranger.   )r   NMcostr&   r-   r,   c0c1c2ctr"   r"   r#   dtw_cpuP   s$   
"
 r>      c                 C   s8  ddl m} | j\}}||k sJ d|tj| d|d ftjd d |||   ||| }|j	
 }t|| d |d tj }d|d< | }tj|tjd}|d	 ||||d|d|d|||d
	 |j	 d |d || d   |d || d d d d |d f }t|  S )Nr   )
dtw_kernelz$M should be smaller than BLOCK_SIZE=r   )valuer   r   r0   )r   )
BLOCK_SIZE   )r   r@   r   r   r   r)   r4   flattenreshaper+   
contiguoustorchr2   cuda
zeros_likeint32strider.   cpunumpy)r   rB   r@   r7   r6   x_skewr8   r&   r"   r"   r#   dtw_cudaj   s,   
:
JrO   returnc              	   C   sH   | j rzt| W S  ttjfy   td Y nw t|  	 
 S )NzsFailed to launch Triton kernels, likely due to missing CUDA toolkit; falling back to a slower DTW implementation...)r   rO   r   r   r   r   r   r>   doublerL   rM   )r   r"   r"   r#   dtw   s   
rR   c                   @   s:   e Zd ZU eed< ee ed< eed< eed< eed< dS )
WordTimingwordtokensstartendprobabilityN)__name__
__module____qualname__str__annotations__r   intfloatr"   r"   r"   r#   rS      s   
 rS      g      ?)medfilt_widthqk_scalemodelr
   	tokenizertext_tokensmel
num_framesra   rb   c                   st  t |dkrg S tg |j|j||j| j}d g| jj	   fddt
| jjD }t 8 | |d|dd }	|	t |jd d |jf }
|
jdd}|tt ||f  W d    n1 sow   Y  |D ]}|  qvt fdd| j jD }|d d d d d |d f }|| jdd}tj|dd	d
d\}}|| | }t||}|jdd}|t |jd }t| \}}|||jg \}}t |dkrg S ttdd |d d D d}tjt|ddd t!}|| t" }||d d  }||dd   }fddt#|d d |dd  D }dd t#|||||D S )Nr   c                    s(   g | ]\}}|j |f fd d	qS )c                    s     ||d d S )Nr   r   )__setitem__)_insoutsindexQKsr"   r#   <lambda>       z+find_alignment.<locals>.<listcomp>.<lambda>)
cross_attnregister_forward_hook).0r,   blockrm   r"   r#   
<listcomp>   s    z"find_alignment.<locals>.<listcomp>r   )dimc                    s   g | ]
\}} | | qS r"   r"   )rs   _l_hrm   r"   r#   ru      s    r   TF)rv   keepdimunbiased)axisr   c                 S   s   g | ]}t |qS r"   )lenrs   r=   r"   r"   r#   ru      rp   )r   r   )constant_valuesc                    s"   g | ]\}}t  || qS r"   )r)   mean)rs   r,   r-   )text_token_probsr"   r#   ru          c                 S   s&   g | ]\}}}}}t |||||qS r"   )rS   )rs   rT   rU   rV   rW   rX   r"   r"   r#   ru      s    )$r}   rG   tensorsot_sequenceno_timestampseottodevicedimsn_text_layer	enumeratedecoderblocksno_grad	unsqueezesoftmaxr)   arangetolistremovestackalignment_headsindicesr+   std_meanr$   r   rR   split_to_word_tokensr   cumsumdiffastypeboolr   zip)rc   rd   re   rf   rg   ra   rb   rU   hookslogitssampled_logitstoken_probshookweightsstdr   matrixtext_indicestime_indiceswordsword_tokensword_boundariesjumps
jump_timesstart_times	end_timesword_probabilitiesr"   )rn   r   r#   find_alignment   sj   






 
$

r   	alignment	prependedappendedc                 C   s  t | d }t | d }|dkrD| | }| | }|jdr:|j |v r:|j|j |_|j|j |_d|_g |_n|}|d8 }|dksd}d}|t | k r| | }| | }|jdsv|j|v rv|j|j |_|j|j |_d|_g |_n|}|d7 }|t | k sNd S d S )Nr   r   r     )r}   rT   
startswithstriprU   endswith)r   r   r   r,   r-   previous	followingr"   r"   r#   merge_punctuations   s4   r   u   "'“¿([{-u   "'.。,，!！?？:：”)]}、)prepend_punctuationsappend_punctuationssegmentsr   r   last_speech_timestampc              	      s  t | dkrd S  fdd| D }	ttj|	}
t| |
||fi |}tdd |D }||  }t |dkr@t	|nd}t
dt|}|d }t |dkrd}td	t |D ]3}|| j|| j |kr|| j|v r||| j| || _q\||d	  j|v r|| j| || _q\t||| | d d
 t t }d}t| |	D ]7\}}
d}g }|t |k r|t |
k r|| }|jr|t|jt||j dt||j d|jd |t |j7 }|d	7 }|t |k r|t |
k st |dkr|d d | |d krp|d d |d d  |ks.t |d	krp|d	 d |d d  |d krpt |d	kra|d	 d |d	 d  |krat|d	 d d |d	 d | }| |d d< |d	 d< td|d d | |d d< |d |d d k r|d d |d d krtdt
|d d | |d |d d< n|d d |d< |d |d d kr|d d |d d k rt|d d | |d |d d< n|d d |d< |d }||d< qd S )Nr   c                    s"   g | ]} fd d|d D qS )c                    s   g | ]	}| j k r|qS r"   )r   )rs   tokenrd   r"   r#   ru     s    z2add_word_timestamps.<locals>.<listcomp>.<listcomp>rU   r"   )rs   segmentr   r"   r#   ru     r   z'add_word_timestamps.<locals>.<listcomp>c                 S   s   g | ]}|j |j qS r"   )rW   rV   r~   r"   r"   r#   ru   #  s    g        gffffff?r   u   .。!！?？r   seek)rT   rV   rW   rX   rW      rV   g      ?r   r   )r}   list	itertoolschainfrom_iterabler   r)   r*   nonzeromedianminr_   r5   rW   rV   rT   r   r   r   r   r'   dictroundrX   rU   max)r   rc   rd   rf   rg   r   r   r   kwargstext_tokens_per_segmentre   r   word_durationsmedian_durationmax_durationsentence_end_marksr,   time_offset
word_indexr   saved_tokensr   timingboundaryr"   r   r#   add_word_timestamps  st   
	0,"0*0$
r   )r?   )(r   r   r   dataclassesr   typingr   r   numbarM   r)   rG   torch.nn.functionalnn
functionalr   audior   r   r   rd   r	   rc   r
   Tensorr^   r$   jitndarrayr.   r>   rO   rR   rS   r_   r   r\   r   r   r   r"   r"   r"   r#   <module>   sz    
$
	
O)	