o
    }oi,                     @   sR   d dl Zd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 G dd dZdS )    N)cached_property)Any)ASRModel)
Hypothesisc                   @   s  e Zd ZdZedd Zedd Zedd Zedd	 Z	ed
d Z
edd Zedd Zedd Zedd Zedd Zedd Zedd ZededefddZededefddZd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0S )1BaseTimestampsTestz
    Base class for testing timestamps in decoders (CTC and RNNT).
    This class defines common test methods that can be inherited by both
    test_ctc_decoding.py and test_rnnt_decoding.py.
    c                 C   s8   d}t j|rtj|dd}|jS tjddd}|jS )Nz9/home/TestData/asr/stt_en_conformer_transducer_small.nemocpu)map_location!stt_en_conformer_transducer_small)ospathexistsr   restore_fromfrom_pretrained	tokenizer)self
model_pathmodel r   b/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/asr/decoding/test_timestamps.pybpe_tokenizer   s   z BaseTimestampsTest.bpe_tokenizerc                 C   sl   dddddddddddddd	d	dd
dddddddddddddddddddddddg
}|S )Ner      charstart_offset
end_offset             .         	   
            ?            r   r   char_offsetsr   r   r   char_offsets_chars'   s   









z%BaseTimestampsTest.char_offsets_charsc                 C   s"   dddddddddd	d
dgS )Nr   r   r   wordr   r   ze.r   r#   e?r&   r,   r   r   r   r   r   "word_offsets_chars_expected_output7   s   


z5BaseTimestampsTest.word_offsets_chars_expected_outputc                 C      ddddddddgS )Nze e r   r    r2   z e? r$   r.   r   r5   r   r   r   2word_offsets_chars_expected_output_other_delimiter?      

zEBaseTimestampsTest.word_offsets_chars_expected_output_other_delimiterc                 C   r7   )Nze e.r   r#   segmentr   r   r4   r&   r,   r   r5   r   r   r   segment_offsets_expected_outputF   r9   z2BaseTimestampsTest.segment_offsets_expected_outputc                 C   s   ddddgS )Nze e. e?r   r,   r:   r   r5   r   r   r   #segment_offsets_expected_output_gapM   s   
z6BaseTimestampsTest.segment_offsets_expected_output_gapc              	   C   sD   ddddddddddd	dd
dddddddddddg}|S )Ni   r   r   r   ~   r   u   r   r   D   r    r"   9   r#   z   r$   r%   r   r/   r   r   r   char_offsets_wpeS      





	z#BaseTimestampsTest.char_offsets_wpec                 C   s,   dddddddddd	d
dddddgS )N
nineteenthr   r   r2   rer   r   seventyr    r#   eightyr$   r%   r   r5   r   r   r    word_offsets_wpe_expected_output`   
   



z3BaseTimestampsTest.word_offsets_wpe_expected_outputc                 C   r7   )NrF   r   r   r2   zseventy eightyr    r%   r   r5   r   r   r   0word_offsets_wpe_expected_output_other_delimiteri   r9   zCBaseTimestampsTest.word_offsets_wpe_expected_output_other_delimiterc              	   C   sD   ddddddddddddd	dddd
dddddddg}|S )Ni  r   r   r   i  r   r"   r       i]  r#   in  r$   r%   r   r/   r   r   r   char_offsets_bpep   rE   z#BaseTimestampsTest.char_offsets_bpec                 C   s,   ddddddddddddd	dd
dgS )Ndiscussr   r   r2   z	absolute'r    reallyr"   
friendshipr%   r   r5   r   r   r    word_offsets_bpe_expected_output}   rK   z3BaseTimestampsTest.word_offsets_bpe_expected_outputc                 C   r7   )Nzdiscuss absolute'r   r    r2   rQ   r"   r%   r   r5   r   r   r   0word_offsets_bpe_expected_output_other_delimiter   r9   zCBaseTimestampsTest.word_offsets_bpe_expected_output_other_delimiterhypdecodingc           	      C   sd  | j dusJ t| j tsJ d| j v sJ d| j v sJ d| j v s$J d| j v s+J tdd| j }| j|j}t	t
dd	 |}t| j d t|ksRJ d
d | j d D }||j|ksfJ g }g }|D ]}|| |d |jv r|d| g }ql|r|d| t| j d t|ksJ dd | j d D }||j|ksJ dS )z5Test character-level timestamps for both CTC and RNNTNtimestepr   r3   r;   \s+r   c                 S   s   | dkS )N r   xr   r   r   <lambda>       z:BaseTimestampsTest.check_char_timestamps.<locals>.<lambda>c                 S      g | ]}|d  qS r3   r   .0tsr   r   r   
<listcomp>       z<BaseTimestampsTest.check_char_timestamps.<locals>.<listcomp>c                 S   r]   r;   r   r_   r   r   r   rb      rc   )	timestamp
isinstancedictrG   subtextstripsplitword_seperatorlistfilterlenjoinappendsegment_seperators)	rT   rU   hypothesis_textwordswords_from_timestampssegmentsr;   r3   segments_from_timestampsr   r   r   check_char_timestamps   s2   
z(BaseTimestampsTest.check_char_timestampsc                    s|  j dusJ tj tsJ dj v sJ dj v sJ dj v s$J dj v s+J tj}ttdd |} fdd	j d D }d
d	 |D }ttdd |}t|t|ks_J tddj	 }dd	 j d D }| j
|ks}J tfdd	 jD }jd  jvr|d7 }j jv rd}tj d |ksJ dd	 j d D }| j
|ksJ dS )z3Test subword-level timestamps for both CTC and RNNTNrV   r   r3   r;   c                 S      | dvS N)rX   r   #r   rY   r   r   r   r[      r\   z=BaseTimestampsTest.check_subword_timestamps.<locals>.<lambda>c                    s    g | ]}t  j|d  qS )r   )rn   r   tokens_to_text)r`   data)rU   r   r   rb      s     z?BaseTimestampsTest.check_subword_timestamps.<locals>.<listcomp>c                 S   s   g | ]	}|D ]}|qqS r   r   )r`   subwordr   r   r   r   rb      s    c                 S   rz   r{   r   rY   r   r   r   r[      r\   rW   r   c                 S   r]   r^   r   r_   r   r   r   rb      rc   c                    s   g | ]} j |qS r   )rj   count)r`   	seperator)rT   r   r   rb      s    rd   r   r   c                 S   r]   re   r   r_   r   r   r   rb      rc   )rf   rg   rh   rn   rj   ro   rp   rG   ri   rk   rm   rq   sumrs   )rT   rU   chars	all_charsrt   rv   segments_countrx   r   )rU   rT   r   check_subword_timestamps   s.   
z+BaseTimestampsTest.check_subword_timestampsc                 C   ,   | j j| jd dh dd}|| jksJ d S Nr      r!   !r*   r0   encoded_char_offsetsword_delimiter_charsupported_punctuation)decoding_charget_words_offsetsr1   r6   r   word_offsetsr   r   r   test_word_offsets_chars      z*BaseTimestampsTest.test_word_offsets_charsc                 C   r   )Nr!   r   r   )r   r   r1   r8   r   r   r   r   &test_word_offsets_char_other_delimiter   r   z9BaseTimestampsTest.test_word_offsets_char_other_delimiterc                 C   ,   | j jd | jdh dd}|| jksJ d S r   )decoding_subword_wper   rD   rJ   r   r   r   r   test_word_offsets_subword_wpe      z0BaseTimestampsTest.test_word_offsets_subword_wpec                 C   r   )NrG   r   r   )r   r   rD   rL   r   r   r   r   -test_word_offsets_subword_wpe_other_delimiter   r   z@BaseTimestampsTest.test_word_offsets_subword_wpe_other_delimiterc                 C   r   r   )decoding_subword_bper   rN   rR   r   r   r   r   test_word_offsets_subword_bpe   r   z0BaseTimestampsTest.test_word_offsets_subword_bpec                 C   r   )NrP   r   r   )r   r   rN   rS   r   r   r   r   -test_word_offsets_subword_bpe_other_delimiter  r   z@BaseTimestampsTest.test_word_offsets_subword_bpe_other_delimiterc                 C   s.   | j j| jg dh dd}|| jksJ d S )N)r!   r   r*   r   )offsetssegment_delimiter_tokensr   )r   _get_segment_offsetsr6   r<   r   segment_offsetsr   r   r   test_segment_offsets_delimiter  s   z1BaseTimestampsTest.test_segment_offsets_delimiterc                 C   s(   | j j| jg i dd}|| jksJ d S )Nr&   )r   r   r   segment_gap_threshold)r   r   r6   r=   r   r   r   r   test_segment_offsets_gap  s   z+BaseTimestampsTest.test_segment_offsets_gapN)__name__
__module____qualname____doc__r   r   propertyr1   r6   r8   r<   r=   rD   rJ   rL   rN   rR   rS   staticmethodr   r   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r      sL    











#!





	r   )os.pathr
   rG   	functoolsr   typingr   nemo.collections.asr.modelsr   +nemo.collections.asr.parts.utils.rnnt_utilsr   r   r   r   r   r   <module>   s   