o
    i(                     @   sf   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 dZ
dd Zdd ZG d	d
 d
ZdS )    N)defaultdict)ListOptionalTupleuV   [’!"#$%&'()*+,-./:;<>=?@，。?★、…【】《》？“”‘’！[\]^_`{|}~\s]+c                 C   sr   g }|   }t|dkr7td|}|d ur|d}n|dd }|| ||ddd}t|dks|S )Nr   z[A-Za-z!?,<>()\']+     )lowerlenrematchgroupappendreplacestrip)	input_strtokenssr   word r   J/home/ubuntu/.local/lib/python3.10/site-packages/funasr/utils/kws_utils.pysplit_mixed_label   s   
r   c                 C   s  t  }t  }| |v r|| f }|||  f }||fS t| }|D ]a}|dks.|dks.|dkr3|d }q |dks;|dkr@|d }q |dksP|dksP|d	ksP|d
krU|d }q ||v r_||f }q ||v rp|| D ]}||f }qgq ttd|}|D ]}||f }qyq |D ]c}||v r||| f }q|dkrd|v r||d f }q||d f }q|dkrd|v r||d f }q||d f }qd|v r||d f }td| d q||d f }td| d q||fS )N!silz(sil)z<sil>)r   <blank>)r   z(noise)znoise)z(noisez<noise>)<unk>r   silr   'z)' is not in token set, replace with <unk>z+' is not in token set, replace with <blank>)tupler   r   sub
symbol_strlogginginfo)txtsymbol_tablelexicon_table
tokens_str
tokens_idxpartspartchr   r   r   query_token_set   sR   


 
r*   c                   @   s   e Zd ZdZdejjdedede	fddZ
				
ddejdejdedededeeee  ejf fddZdd ZdejdejfddZdejfddZdS )KwsCtcPrefixDecoderz.Decoder interface wrapper for CTCPrefixDecode.ctckeywords
token_listseg_dictc                    s   | _ | _i }|D ]	}||||< q
dh _i  _| _ j ddd}|D ].}t	|||\}	}
i  j|< |
 j| d< d
dd |
D  j| d<  fd	d
|
D  q,dS )zInitialize class.

        Args:
            ctc (torch.nn.Module): The CTC implementation.
                For example, :class:`espnet.nets.pytorch_backend.ctc.CTC`

        r   r   r   ,token_idc                 s   s    | ]	}d t | V  qdS )z%s N)str.0ir   r   r   	<genexpr>n   s    z/KwsCtcPrefixDecoder.__init__.<locals>.<genexpr>	token_strc                    s   g | ]} j |qS r   )keywords_idxsetaddr3   selfr   r   
<listcomp>o       z0KwsCtcPrefixDecoder.__init__.<locals>.<listcomp>N)r,   r.   indexr8   keywords_tokenkeywords_strr   r   splitr*   join)r;   r,   r-   r.   r/   token_tabletokenkeywords_listkeywordstrsindexsr   r:   r   __init__Q   s    
zKwsCtcPrefixDecoder.__init__N      logitslogits_lengthskeywords_tokensetscore_beam_sizepath_beam_sizereturnc              
   C   s  | d}|}t ddg ffg}td|D ]R}	||	 }
tdd }|
|\}}g }g }t| | D ])\}}|durQ|dkrP||v rP|| || q6|dkr_|| || q6t|dkrgq|D ]}|
| 	 }|D ]\}\}}}t|dkr|d nd}|dkr|| \}}}|||  ||  }|
 }|||f||< qs||krtj|dd	d
s|| \}}}|||  }|
 }||d d kr||d d< |	|d d< |||f||< tj|dd	d
s||f }|| \}}}|||  }|
 }|t||	|d |||f||< qs||f }|| \}}}|r6||d d kr5||d d< |	|d d< n|
 }|t||	|d |||  ||  }|||f||< qsqit| dd dd}|d| }qdd |D }|S )a   CTC prefix beam search inner implementation

        Args:
            logits (torch.Tensor): (1, max_len, vocab_size)
            logits_lengths (torch.Tensor): (1, )
            keywords_tokenset (set): token set for filtering score
            score_beam_size (int): beam size for score
            path_beam_size (int): beam size for path

        Returns:
            List[List[int]]: nbest results
        r         ?        c                   S   s
   ddg fS )NrS   r   r   r   r   r   <lambda>   s   
 z1KwsCtcPrefixDecoder.beam_search.<locals>.<lambda>Ng?gư>)abs_tolprobframe)rD   rX   rW   c                 S   s   | d d | d d  S )Nr   r   r   )xr   r   r   rT      r=   T)keyreversec                 S   s6   g | ]}|d  |d d  |d d  |d d fqS )r   r      r   )r4   yr   r   r   r<      s   6 z3KwsCtcPrefixDecoder.beam_search.<locals>.<listcomp>)sizer   ranger   topkziptolistr   r
   itemcopymathisclosedictsorteditems)r;   rL   rM   rN   rO   rP   maxlen	ctc_probscur_hypstprobs	next_hypstop_k_probstop_k_indexfilter_probsfilter_indexrW   idxr   psprefixpbpnb	cur_nodeslastn_pbn_pnbnodesn_prefixhypsr   r   r   beam_searchq   s   









*zKwsCtcPrefixDecoder.beam_searchc                 C   s   t |t |k r
dS t |t |kr||krdS dS tt |t | D ]#}|| |d krGtt |D ]}|||  || krB nq4|  S q$dS )NrU   r   )r
   r_   )r;   	main_list
check_listr5   jr   r   r   
is_sublist   s   zKwsCtcPrefixDecoder.is_sublistc                 C   s   |  ||| j}d }d}|D ]P}|d }|d }t|t|ks"J | j D ]+}	| j|	 d }
| ||
}|dkrR|	}t||t|
 D ]
}||| d 9 }qE nq'|d ur^t|} nq|d urhd||fS dS )	NrR   r   r\   r1   rU   rW   T)FNN)	r   r8   r
   r?   keysr   r_   re   sqrt)r;   rL   rM   r   hit_keyword	hit_scoreone_hyp
prefix_idsprefix_nodesr   laboffsetrt   r   r   r   _decode_inside   s.   

z"KwsCtcPrefixDecoder._decode_insiderY   c                 C   s>   | j |d d }t|dg}| 	||S )zGet an initial state for decoding.

        Args:
            x (torch.Tensor): The encoded feature tensor

        Returns: decode result

        r   r   )
r,   softmax	unsqueezedetachsqueezecputorchtensorr^   r   )r;   rY   raw_logpxlenr   r   r   decode  s    
zKwsCtcPrefixDecoder.decode)NrJ   rK   )__name__
__module____qualname____doc__r   nnModuler2   listrg   rI   Tensorsetintr   r   r   r   r   r   r   r   r   r   r+   N   sD    
$
k
!r+   )r   r    r   re   collectionsr   typingr   r   r   r   r   r*   r+   r   r   r   r   <module>   s    3