o
    }oit+                     @   s   d dl Z d dlZd dlmZmZmZmZmZ d dlZ	d dl
Z
d dlmZ d dlmZmZ d dlmZmZmZmZ G dd dZG dd	 d	eZdS )
    N)IterableListOptionalTupleUnion)TokenizerSpec)NeuralModule	typecheck)LengthsTypeLogprobsType
NeuralTypePredictionsTypec                   @   sn   e Zd Zdee defddZedd Zedd Z	ed	d
 Z
edd ZdefddZdefddZdS )_TokensWrapper
vocabulary	tokenizerc                    s   | _ | _|d u r fddtt j D  _t j  _ jd ur7t jdr7 jjd ur7 jj _d S d j v rD 	d _d S d j v rQ 	d _d S d _d S )Nc                    s   i | ]} j | |qS  r   ).0iselfr   c/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/modules/flashlight_decoder.py
<dictcomp>!       z+_TokensWrapper.__init__.<locals>.<dictcomp>unk_id <unk>)
r   r   rangelenreverse_map	vocab_lenhasattrr   
unknown_idtoken_to_id)r   r   r   r   r   r   __init__   s   "


z_TokensWrapper.__init__c                 C      | j S Nr!   r   r   r   r   blank.      z_TokensWrapper.blankc                 C   r&   r'   )r#   r   r   r   r   r   2   r*   z_TokensWrapper.unk_idc                 C   r&   r'   r   r   r   r   r   vocab6   r*   z_TokensWrapper.vocabc                 C   s
   | j d S )N   r(   r   r   r   r   
vocab_size:   s   
z_TokensWrapper.vocab_sizetokenc                 C   s.   || j krdS | jd ur| j|S | j| S )Nr   )r)   r   r$   r    )r   r.   r   r   r   r$   ?   s
   


z_TokensWrapper.token_to_idtextc                 C   s   | j d ur| j |S t|S r'   )r   text_to_tokenslist)r   r/   r   r   r   r0   H   s   
z_TokensWrapper.text_to_tokensN)__name__
__module____qualname__r   strr   r%   propertyr)   r   r+   r-   r$   r0   r   r   r   r   r      s    



	r   c                       s   e Zd ZdZddddddddej df
ded	ee d
ee	 dee dee de
de
dededededef fddZdee
 fddZdee
 fddZe deejejf fddZ  ZS ) FlashLightKenLMBeamSearchDecodera~  
    @property
    def input_types(self):
        """Returns definitions of module input ports.
        """
        return {
            "log_probs": NeuralType(('B', 'T', 'D'), LogprobsType()),
        }

    @property
    def output_types(self):
        """Returns definitions of module output ports.
        """
        return {"hypos": NeuralType(('B'), PredictionsType())}
    N    g      9@g       @g      g        lm_pathr   r   lexicon_path
boost_path	beam_sizebeam_size_tokenbeam_threshold	lm_weight
word_score
unk_weight
sil_weightc           %         sz  zddl m}m}m}m}m}m}m} ddlm	}m
} W n ty'   tdw t   |j _t|| _ jj _ jj _ jj _|d urv|| _| j _ jd _|d urt|ddd}d	d
 |D }dd |D }W d    n1 sw   Y  ni }| D ]}| jvr j| q|| j _| j j _ jd}t  j! D ]K\}\}} j|} j"||\}}|D ]3} fdd
|D }  jj| v rt#d| d| d|  dd qΈ j$| |||vr|nt%||  qq|! D ]B\}}!| jvrH j|} j&|} fdd
|D }  jj| v r>t#d| d| d|  dd q j$| |t%|! q j'|j( ||t)|||	|
||d jd	 _*| j* j j j j jg d _+d S ddl m,}"m-}# dd  jj.d jj.v rg ndg D }$||$ _|| j _|#|t)|||	|d jd _*|" j* j j jg  _+d S )Nr   )LMCriterionTypeKenLMLexiconDecoderLexiconDecoderOptionsSmearingModeTrie)create_word_dict
load_wordszFlashLightKenLMBeamSearchDecoder requires the installation of flashlight python bindings from https://github.com/flashlight/text. Please follow the build instructions there.r   rutf_8)encodingc                 S   s   g | ]	}|  d qS )	)stripsplit)r   liner   r   r   
<listcomp>       z=FlashLightKenLMBeamSearchDecoder.__init__.<locals>.<listcomp>c                 S   s   i | ]	}|d  |d qS )r   r,   r   r   wr   r   r   r      rT   z=FlashLightKenLMBeamSearchDecoder.__init__.<locals>.<dictcomp>Fc                       g | ]} j |qS r   tokenizer_wrapperr$   r   r.   r   r   r   rS      r   z#tokenizer has unknown id for word[ z ] r   T)flushc                    rW   r   rX   rZ   r   r   r   rS      r   )	r<   r=   r>   r?   r@   	unk_score	sil_scorelog_addcriterion_type)LexiconFreeDecoderLexiconFreeDecoderOptionsc                 S   s   i | ]}||ggqS r   r   rU   r   r   r   r      s    )r<   r=   r>   r?   r]   r^   r_   )/flashlight.lib.text.decoderrC   rD   rE   rF   rG   rH   rI   flashlight.lib.text.dictionaryrJ   rK   ModuleNotFoundErrorsuperr%   CTCr_   r   rY   r-   r)   r   silencelexicon	word_dict	get_indexunk_wordopenkeys	add_entrylmtriestart	enumerateitemsscoreprintinsertfloatr0   smearMAXintdecoder_optsdecoderr`   ra   r+   )%r   r9   r   r   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   frboost_wordswordstart_stater   	spellingsword_idx_rt   spellingspelling_idxsboostr`   ra   d	__class__r   r   r%   `   s   $	






	

	z)FlashLightKenLMBeamSearchDecoder.__init__idxsc                    s   dd t |D } jdk rt fdd|}n	t fdd|}t|}|d  jkr3|dd }|d	  jkr@|dd	 }t|S )
z;Normalize tokens by handling CTC blank, ASG replabels, etc.c                 s   s    | ]}|d  V  qdS )r   Nr   )r   gr   r   r   	<genexpr>   s    z?FlashLightKenLMBeamSearchDecoder._get_tokens.<locals>.<genexpr>r   c                    s   |  j ko	|  jkS r'   )r)   rg   xr   r   r   <lambda>   s    z>FlashLightKenLMBeamSearchDecoder._get_tokens.<locals>.<lambda>c                    s
   |  j kS r'   )r)   r   r   r   r   r      s   
 r,   Nr   )	itertoolsgroupbyrg   filterr1   torch
LongTensor)r   r   r   r   r   _get_tokens   s   

z,FlashLightKenLMBeamSearchDecoder._get_tokens
token_idxsc                 C   sH   g }t |D ]\}}|| jkrq|dks|||d  kr!|| q|S )a"  Returns frame numbers corresponding to every non-blank token.
        Parameters
        ----------
        token_idxs : List[int]
            IDs of decoded tokens.
        Returns
        -------
        List[int]
            Frame numbers corresponding to every non-blank token.
        r   r,   )rr   r)   append)r   r   	timestepsr   	token_idxr   r   r   _get_timesteps   s   

z/FlashLightKenLMBeamSearchDecoder._get_timesteps	log_probsc           
         s   t |tjrt| }| dkr|d}| 	 }|
 \}}}g }t|D ]#}| d| |d  } j|||}	| fdd|	D  q+|S )N   r      c                    s<   g | ]}  |j|j |j fd d|jD dqS )c                    s    g | ]}|d kr j |qS )r   )ri   	get_entry)r   r   r   r   r   rS     s     zGFlashLightKenLMBeamSearchDecoder.forward.<locals>.<listcomp>.<listcomp>)tokensrt   r   words)r   r   rt   r   r   )r   resultr   r   r   rS     s    

z<FlashLightKenLMBeamSearchDecoder.forward.<locals>.<listcomp>)
isinstancenpndarrayr   
from_numpyrw   dim	unsqueezecpu
contiguoussizer   data_ptrstrider|   decoder   )
r   r   	emissionsBTNhyposbemissions_ptrresultsr   r   r   forward  s    

z(FlashLightKenLMBeamSearchDecoder.forward)r2   r3   r4   __doc__mathinfr5   r   r   r   rz   rw   r%   r   r   r   no_gradr   r   r   Tensorr   __classcell__r   r   r   r   r7   O   sR    	
}$r7   )r   r   typingr   r   r   r   r   numpyr   r   1nemo.collections.common.tokenizers.tokenizer_specr   nemo.core.classesr   r	   nemo.core.neural_typesr
   r   r   r   r   r7   r   r   r   r   <module>   s   4