o
    wiu                     @   s.   d dl Z d dlZd dlmZ G dd dZdS )    N)chainc                   @   s   e Zd ZdZg dZddgZg dZg dZg dZdgZ	d	d
gZ
dgZdgZg dZ							dddZdd Zdd Zdd ZdS )MosesPunctNormalizerz
    This is a Python port of the Moses punctuation normalizer from
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl
    )
)z\r )z\(z ()z\)z) z + )z\) ([.!:?;,])z)\g<1>)z\( ()z \)))z(\d) %z\g<1>%)z ::)z ;;)`')''z " ))u   „"u   “r   u   ”r   )u   –-)u   —z - r   )   ´r   )u   ([a-zA-Z])‘([a-zA-Z])\g<1>'\g<2>)u   ([a-zA-Z])’([a-zA-Z])r   )u   ‘r   )u   ‚r      ’r   )r   r   )u   ´´r   u   …z...))    « r   )u   « r   )   «r   )    » r   )u    »r   )   »r   )
)u    %%)u   nº u   nº )u    :r	   )u    ºCu    ºC)u    cmz cm)u    \??)u    \!!)u    ;r
   )u   , z, r   )z"([,.]+)z\g<1>")z,"z",)z(\.+)"(\s*[^<])z"\g<1>\g<2>)
   (\d) (\d)z\g<1>,\g<2>)r   z\g<1>.\g<2>)$)u   ，,)u   。\s*. )u   、r   r   r   )u   ∶r	   )u   ：r	   )u   ？r   )u   《r   )u   》r   )u   ）r   )u   ！r   )u   （r   )u   ；r
   )u   」r   )u   「r   )u   ０0)u   １1)u   ２2)u   ３3)u   ４4)u   ５5)u   ６6)u   ７7)u   ８8)u   ９9)u   ．\s*r    )u   ～~r   r   )u   ━r   )u   〈<)u   〉>)u   【[)u   】])u   ％r   enTFc                 C   s   |rd| j d< d| jd< d| jd< | j| j | j| jg| _|r&| jd| j |r?|dkr4| j| j n|d	v r?| j| j	 |rT|d
v rM| j| j
 n| j| j tt| j | _|| _|| _dS )a  
        :param language: The two-letter language code.
        :type lang: str
        :param penn: Normalize Penn Treebank style quotations.
        :type penn: bool
        :param norm_quote_commas: Normalize quotations and commas
        :type norm_quote_commas: bool
        :param norm_numbers: Normalize numbers
        :type norm_numbers: bool
        :param perl_parity: exact parity with perl script
        :type perl_parity: bool
        )r   r      )r   z "r   )r   z"       r0   )deesfr)r4   r5   czcsr6   N)NORMALIZE_UNICODEFRENCH_QUOTESEXTRA_WHITESPACEHANDLE_PSEUDO_SPACESsubstitutionsinsertNORMALIZE_UNICODE_IF_NOT_PENNappendEN_QUOTATION_FOLLOWED_BY_COMMA$DE_ES_FR_QUOTATION_FOLLOWED_BY_COMMADE_ES_CZ_CS_FROTHERlistr   pre_replace_unicode_punctpost_remove_control_chars)selflangpennnorm_quote_commasnorm_numbersrF   rG   perl_parity rN   Q/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/sacremoses/normalize.py__init__   s.   



zMosesPunctNormalizer.__init__c                 C   sJ   | j r| |}| jD ]\}}t||t|}q| jr!| |}| S )z?
        Returns a string with normalized punctuation.
        )	rF   replace_unicode_punctr=   resubstrrG   remove_control_charsstriprH   textregexpsubstitutionrN   rN   rO   	normalize   s   

zMosesPunctNormalizer.normalizec                 C   s&   | j D ]\}}t||t|}q|S )N)REPLACE_UNICODE_PUNCTUATIONrR   rS   rT   rW   rN   rN   rO   rQ      s   z*MosesPunctNormalizer.replace_unicode_punctc                 C   s   t dd|S )Nz\p{C}r   )regexrS   )rH   rX   rN   rN   rO   rU      s   z)MosesPunctNormalizer.remove_control_charsN)r0   TTTFFF)__name__
__module____qualname____doc__r;   r?   r9   r:   r<   rA   rB   rC   rD   r\   rP   r[   rQ   rU   rN   rN   rN   rO   r   
   s6    	)
7r   )rR   r]   	itertoolsr   r   rN   rN   rN   rO   <module>   s   