o
    wi0                     @   sb   d dl Zd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZ G dd dZdS )	    N)cached_property)Any)ASRModel)
Hypothesis)get_segment_offsetsget_words_offsetsc                   @   s  e Zd ZdZedd Zedd Zedd Zedd	 Z	ed
d Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd ZededefddZededefd d!Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2S )3BaseTimestampsTestz
    Base class for testing timestamps in decoders (CTC and RNNT).
    This class defines common test methods that can be inherited by both
    test_ctc_decoding.py and test_rnnt_decoding.py.
    c                 C   s8   d}t j|rtj|dd}|jS tjddd}|jS )Nz9/home/TestData/asr/stt_en_conformer_transducer_small.nemocpu)map_location!stt_en_conformer_transducer_small)ospathexistsr   restore_fromfrom_pretrained	tokenizer)self
model_pathmodel r   k/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/tests/collections/asr/decoding/test_timestamps.pybpe_tokenizer   s   z BaseTimestampsTest.bpe_tokenizerc                 C   sl   dddddddddddddd	d	dd
dddddddddddddddddddddddg
}|S )Ner      charstart_offset
end_offset             .         	   
            ?            r   r   char_offsetsr   r   r   char_offsets_chars(   s   









z%BaseTimestampsTest.char_offsets_charsc                 C   s"   dddddddddd	d
dgS )Nr   r   r   wordr   r   ze.r    r%   e?r(   r.   r   r   r   r   r   "word_offsets_chars_expected_output8   s   


z5BaseTimestampsTest.word_offsets_chars_expected_outputc                 C      ddddddddgS )Nze e r   r"   r4   z e? r&   r0   r   r7   r   r   r   2word_offsets_chars_expected_output_other_delimiter@      

zEBaseTimestampsTest.word_offsets_chars_expected_output_other_delimiterc                 C   r9   )Nze e.r   r%   segmentr   r   r6   r(   r.   r   r7   r   r   r   segment_offsets_expected_outputG   r;   z2BaseTimestampsTest.segment_offsets_expected_outputc                 C   s   ddddgS )Nze e. e?r   r.   r<   r   r7   r   r   r   #segment_offsets_expected_output_gapN   s   
z6BaseTimestampsTest.segment_offsets_expected_output_gapc              	   C   sD   ddddddddddd	dd
dddddddddddg}|S )Nnineteenr   r   r   z##thr   rer    r!   sevenr"   r$   z##tyr%   eightyr&   r'   r   r1   r   r   r   char_offsets_wpeT      





	z#BaseTimestampsTest.char_offsets_wpec                 C   s,   dddddddddd	d
dddddgS )N
nineteenthr   r   r4   rA   r    r!   seventyr"   r%   rC   r&   r'   r   r7   r   r   r    word_offsets_wpe_expected_outputa   
   



z3BaseTimestampsTest.word_offsets_wpe_expected_outputc                 C   r9   )NrF   r   r   r4   zseventy eightyr"   r'   r   r7   r   r   r   0word_offsets_wpe_expected_output_other_delimiterj   r;   zCBaseTimestampsTest.word_offsets_wpe_expected_output_other_delimiterc              	   C   D   ddddddddddddd	dd
ddd
ddddddg}|S )Ndiscussr   r   r   absoluter!   'r"   reallyr$   friendr%   shipr&   r'   r   r1   r   r   r   char_offsets_bpeq   rE   z#BaseTimestampsTest.char_offsets_bpec              	   C   rK   )Nu
   ▁discussr   r   r   u   ▁absoluter!   rN   r"   u	   ▁reallyr$   u	   ▁friendr%   rQ   r&   r'   r   r1   r   r   r   encoded_char_offsets_bpe~   rE   z+BaseTimestampsTest.encoded_char_offsets_bpec                 C   s,   ddddddddddddd	dd
dgS )NrL   r   r   r4   z	absolute'r"   rO   r$   
friendshipr'   r   r7   r   r   r    word_offsets_bpe_expected_output   rI   z3BaseTimestampsTest.word_offsets_bpe_expected_outputc                 C   r9   )Nzdiscuss absolute'r   r"   r4   rT   r$   r'   r   r7   r   r   r   0word_offsets_bpe_expected_output_other_delimiter   r;   zCBaseTimestampsTest.word_offsets_bpe_expected_output_other_delimiterhypdecodingc           	      C   sd  | j dusJ t| j tsJ d| j v sJ d| j v sJ d| j v s$J d| j v s+J tdd| j }| j|j}t	t
dd	 |}t| j d t|ksRJ d
d | j d D }||j|ksfJ g }g }|D ]}|| |d |jv r|d| g }ql|r|d| t| j d t|ksJ dd | j d D }||j|ksJ dS )z5Test character-level timestamps for both CTC and RNNTNtimestepr   r5   r=   \s+r   c                 S   s   | dkS )N r   xr   r   r   <lambda>       z:BaseTimestampsTest.check_char_timestamps.<locals>.<lambda>c                 S      g | ]}|d  qS r5   r   .0tsr   r   r   
<listcomp>       z<BaseTimestampsTest.check_char_timestamps.<locals>.<listcomp>c                 S   r`   r=   r   rb   r   r   r   re      rf   )	timestamp
isinstancedictrA   subtextstripsplitword_seperatorlistfilterlenjoinappendsegment_seperators)	rW   rX   hypothesis_textwordswords_from_timestampssegmentsr=   r5   segments_from_timestampsr   r   r   check_char_timestamps   s2   
z(BaseTimestampsTest.check_char_timestampsc                    s|  j dusJ tj tsJ dj v sJ dj v sJ dj v s$J dj v s+J tj}ttdd |} fdd	j d D }d
d	 |D }ttdd |}t|t|ks_J tddj	 }dd	 j d D }| j
|ks}J tfdd	 jD }jd  jvr|d7 }j jv rd}tj d |ksJ dd	 j d D }| j
|ksJ dS )z3Test subword-level timestamps for both CTC and RNNTNrY   r   r5   r=   c                 S      | dvS N)r[   r   #r   r\   r   r   r   r^      r_   z=BaseTimestampsTest.check_subword_timestamps.<locals>.<lambda>c                    s    g | ]}t  j|d  qS )r   )rq   r   tokens_to_text)rc   data)rX   r   r   re      s     z?BaseTimestampsTest.check_subword_timestamps.<locals>.<listcomp>c                 S   s   g | ]	}|D ]}|qqS r   r   )rc   subwordr   r   r   r   re      s    c                 S   r}   r~   r   r\   r   r   r   r^      r_   rZ   r   c                 S   r`   ra   r   rb   r   r   r   re      rf   c                    s   g | ]} j |qS r   )rm   count)rc   	seperator)rW   r   r   re      s    rg   r   r   c                 S   r`   rh   r   rb   r   r   r   re      rf   )ri   rj   rk   rq   rm   rr   rs   rA   rl   rn   rp   rt   sumrv   )rW   rX   chars	all_charsrw   ry   segments_countr{   r   )rX   rW   r   check_subword_timestamps   s.   
z+BaseTimestampsTest.check_subword_timestampsc                 C   0   t | jd ddh d| jjd}|| jksJ d S )Nr   r      r#   !r,   r2   encoded_char_offsetsword_delimiter_chartokenizer_typesupported_punctuationdecode_tokens_to_str)r   r3   decoding_charr   r8   r   word_offsetsr   r   r   test_word_offsets_chars      	z*BaseTimestampsTest.test_word_offsets_charsc                 C   r   )Nr   r#   r   )r2   r   r   r   r   r   )r   r3   r   r   r:   r   r   r   r   &test_word_offsets_char_other_delimiter   r   z9BaseTimestampsTest.test_word_offsets_char_other_delimiterc                 C   r   )Nr   wper   r   )r   rD   decoding_subword_wper   rH   r   r   r   r   test_word_offsets_subword_wpe      	z0BaseTimestampsTest.test_word_offsets_subword_wpec                 C   r   )NrA   r   r   r   )r   rD   r   r   rJ   r   r   r   r   -test_word_offsets_subword_wpe_other_delimiter  r   z@BaseTimestampsTest.test_word_offsets_subword_wpe_other_delimiterc                 C   2   t | j| jddh d| jjd}|| jksJ d S )Nr   bper   r   )r   rR   rS   decoding_subword_bper   rU   r   r   r   r   test_word_offsets_subword_bpe  s   	z0BaseTimestampsTest.test_word_offsets_subword_bpec                 C   r   )NrO   r   r   r   )r   rR   rS   r   r   rV   r   r   r   r   -test_word_offsets_subword_bpe_other_delimiter   s   	z@BaseTimestampsTest.test_word_offsets_subword_bpe_other_delimiterc                 C   s*   t | jg dh dd}|| jksJ d S )N)r#   r   r,   r   )r   segment_delimiter_tokensr   )r   r8   r>   r   segment_offsetsr   r   r   test_segment_offsets_delimiter,  s   z1BaseTimestampsTest.test_segment_offsets_delimiterc                 C   s$   t | jg i dd}|| jksJ d S )Nr(   )r   r   r   segment_gap_threshold)r   r8   r?   r   r   r   r   test_segment_offsets_gap5  s   z+BaseTimestampsTest.test_segment_offsets_gapN) __name__
__module____qualname____doc__r   r   propertyr3   r8   r:   r>   r?   rD   rH   rJ   rR   rS   rU   rV   staticmethodr   r   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r      sP    












#!	r   )os.pathr   rA   	functoolsr   typingr   nemo.collections.asr.modelsr   +nemo.collections.asr.parts.utils.rnnt_utilsr   0nemo.collections.asr.parts.utils.timestamp_utilsr   r   r   r   r   r   r   <module>   s   