o
    wiv                     @   s   d Z ddlZddlmZmZ ddlmZ G dd dZG dd deZG d	d
 d
eZ	eee	dZ
ddeee  dedefddZdS )z
A collection of simple character based parsers. These parser handle cleaning and tokenization by default.
We currently support English.
    N)ListOptional)cleanersc                   @   s   e Zd ZdZdddddddee dededed	ed
efddZdede	ee  fddZ
dede	e fddZdedee fddZdd ZdS )
CharParserzFunctor for parsing raw strings into list of int tokens.

    Examples:
        >>> parser = CharParser(['a', 'b', 'c'])
        >>> parser('abc')
        [0, 1, 2]
    T)unk_idblank_iddo_normalizedo_lowercasedo_tokenizelabelsr   r   r	   r
   r   c                C   sP   || _ || _|| _|| _|| _|| _dd t|D | _tdd |D | _	dS )a,  Creates simple mapping char parser.

        Args:
            labels: List of labels to allocate indexes for. Essentially,
                this is a id to str mapping.
            unk_id: Index to choose for OOV words (default: -1).
            blank_id: Index to filter out from final list of tokens
                (default: -1).
            do_normalize: True if apply normalization step before tokenizing
                (default: True).
            do_lowercase: True if apply lowercasing at normalizing step
                (default: True).
        c                 S   s   i | ]\}}||qS  r   ).0indexlabelr   r   p/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/common/parts/preprocessing/parsers.py
<dictcomp>B   s    z'CharParser.__init__.<locals>.<dictcomp>c                 S   s   g | ]
}t |d kr|qS )   )len)r   r   r   r   r   
<listcomp>C   s    z'CharParser.__init__.<locals>.<listcomp>N)
_labels_unk_id	_blank_id_do_normalize_do_lowercase_do_tokenize	enumerate_labels_mapset_special_labels)selfr   r   r   r	   r
   r   r   r   r   __init__#   s   zCharParser.__init__textreturnc                 C   s4   | j r| |}|d u rd S | js|S | |}|S N)r   
_normalizer   	_tokenize)r    r"   text_tokensr   r   r   __call__E   s   

zCharParser.__call__c                 C   s   |  }| jr| }|S r$   )stripr   lowerr    r"   r   r   r   r%   Q   s   zCharParser._normalizec                    s   g }t |dD ]1\}}|dkr| jd j | jv r*| j|  q	|D ]}| j| j q,q	 fdd|D }|S )N r   c                    s   g | ]	}| j kr|qS r   )r   )r   tokenr    r   r   r   h   s    z(CharParser._tokenize.<locals>.<listcomp>)r   splitappendr   getr   r   )r    r"   tokensword_idwordcharr   r.   r   r&   Y   s   
zCharParser._tokenizec                 C   s   i }| j  D ]\}}|||< qd|t| j < d|t| j d < d|t| j d < g }|D ]}||vr4q-|||   q-d|S )Nz<BOS>z<EOS>r   z<P>    )r   itemsr   r0   itemjoin)r    	str_inputr_mapkvoutir   r   r   decodel   s   

zCharParser.decodeN)__name__
__module____qualname____doc__r   strintboolr!   r   r(   r%   r&   rA   r   r   r   r   r      s0    
"r   c                       sL   e Zd ZdZddddZd fdd		Zd
d Zdedee fddZ	  Z
S )ENCharParserz,Incorporates english-specific parsing logic.plusandpercent)+&%NTc                    s0   t  j|i | d| _|r|  | _|| _dS )a  Creates english-specific mapping char parser.

        This class overrides normalizing implementation.

        Args:
            *args: Positional args to pass to `CharParser` constructor.
            **kwargs: Key-value args to pass to `CharParser` constructor.
        N)superr!   _table_ENCharParser__make_trans_tableabbreviation_version)r    rS   
make_tableargskwargs	__class__r   r   r!      s
   


zENCharParser.__init__c                 C   N   t j}| jD ]}||d}q| jD ]}||d}qt|dt| }|S Nr7   r,   stringpunctuationPUNCTUATION_TO_REPLACEreplacer   rF   	maketransr   r    r]   r5   r   tabler   r   r   __make_trans_table      

zENCharParser.__make_trans_tabler"   r#   c                 C   s4   zt j|| j| j| jd}W |S  ty   Y d S w )N)r\   rb   punctuation_to_replacerS   )r   
clean_textrQ   r^   rS   	Exceptionr+   r   r   r   r%      s   	zENCharParser._normalize)NT)rB   rC   rD   rE   r^   r!   rR   rF   r   r%   __classcell__r   r   rW   r   rI   ~   s    rI   c                       sH   e Zd ZdZdddZ fddZdd Zd	ed
ee fddZ	  Z
S )RUCharParserz,Incorporates russian-specific parsing logic.u   плюсu   е)rM   u   ёc                    s    t  j|i | |  | _dS )a  Creates cyrillic-specific mapping char parser.
        This class overrides normalizing implementation.
        Args:
            *args: Positional args to pass to `CharParser` constructor.
            **kwargs: Key-value args to pass to `CharParser` constructor.
        N)rP   r!   _RUCharParser__make_trans_tablerQ   )r    rU   rV   rW   r   r   r!      s   zRUCharParser.__init__c                 C   rY   rZ   r[   ra   r   r   r   rc      rd   zRUCharParser.__make_trans_tabler"   r#   c                 C   s0   zt j|| j| jd}W |S  ty   Y d S w )N)r\   rb   re   )r   rf   rQ   r^   rg   r+   r   r   r   r%      s   
zRUCharParser._normalize)rB   rC   rD   rE   r^   r!   rj   rF   r   r%   rh   r   r   rW   r   ri      s    
ri   )baseenrurk   r   namer#   c                 K   s@   |t vrtd| du rttj} t | }|dd| i|}|S )a{  Creates parser from labels, set of arguments and concise parser name.

    Args:
        labels: List of labels to allocate indexes for. If set to
            None then labels would be ascii table list. Essentially, this is an
            id to str mapping (default: None).
        name: Concise name of parser to create (default: 'base').
            (default: -1).
        **kwargs: Other set of kwargs to pass to parser constructor.

    Returns:
        Instance of `CharParser`.

    Raises:
        ValueError: For invalid parser name.

    Examples:
        >>> type(make_parser(['a', 'b', 'c'], 'en'))
        ENCharParser
    zInvalid parser name.Nr   r   )NAME_TO_PARSER
ValueErrorlistr\   	printable)r   rn   rV   parser_typeparserr   r   r   make_parser   s   
ru   )Nrk   )rE   r\   typingr   r   +nemo.collections.common.parts.preprocessingr   r   rI   ri   ro   rF   ru   r   r   r   r   <module>   s   d2*$