o
    wi"                     @   s  d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlmZm	Z	 d dl
mZmZ d dlmZ d dlmZ ed	d
gdZejdedejdddddejdddddejdddddejdddddde d d! Zeejd"d  d#k rejnejZe d$d% Zd&d' Zd(d) Zed*ejd+d,ddd-d.ejd/d0ddd1d.ejd2d3d4d5ejd6d7d8d5ed9d: Z ed;ejd<d0ddd=d.ed>d? Z!ed@ejdAddddBd.ejdCdDdddEd.ejdFd3dddGd.ejdHd7dddId.edJdK Z"edLejdMdNddOdPejdQd,dddRd.ejdSd3dddTd.edUdV Z#edWejdMdNddXdPejdQd,dddRd.ejdSd3dddYd.edZd[ Z$ed\ejd]d,ddd^d.ed_d` Z%dS )a    N)deepcopypartialupdate_wrapper)MosesTokenizerMosesDetokenizer)MosesTruecaserMosesDetruecaser)MosesPunctNormalizerparallelize_preprocessz-hz--help)help_option_namesT)chaincontext_settingsz
--languagez-lenz+Use language specific rules when tokenizing)defaulthelpz--processesz-j   zNo. of processes.z
--encodingz-eutf8zSpecify encoding of file.z--quietz-qFzDisable progress bar.)is_flagr   r   c                 C   s   d S N )languageencoding	processesquietr   r   K/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/sacremoses/cli.pycli   s   r   .   c                 K   s~   t jd|d.}|}| D ]}|t|fi |}q|r%|D ]}t | qW d    d S W d    d S 1 s8w   Y  d S )Nstdinr   )clickget_text_streamlistecho)
processorsr   kwargsfiniteratorprocitemr   r   r   process_pipeline$   s   "r-   c                    s    fdd}t | fi |S )z\Helper decorator to rewrite a function so that
    it returns another function from it.
    c                     s    fdd}t |fi | S )Nc                    s    | fi |S r   r   )streamr(   fr   r   	processor5   s   z.processor.<locals>.new_func.<locals>.processorr   )r(   r1   r/   r   r   new_func4   s   zprocessor.<locals>.new_funcr   )r0   r(   r2   r   r/   r   r1   /   s   r1   c                 c   sD    |dkr| D ]}||V  qd S t || || dD ]}|V  qd S )Nr   )progress_barr   )r*   funcr   r   lineoutliner   r   r   parallel_or_not=   s   

r7   tokenizez--aggressive-dash-splitsz-azTriggers dash split rules.)r   r   r   z--xml-escapez-xz"Escape special characters for XML.z--protected-patternsz-pzXSpecify file with patters to be protected in tokenisation. Special values: :basic: :web:)r   z--custom-nb-prefixesz-czjSpecify a custom non-breaking prefixes file, add prefixes to the default ones from the specified language.c                 C   s   t ||d}|r7|dkr|j}n'|dkr|j}nt|dd}	dd |	 D }W d    n1 s2w   Y  t|jd|||d	}
t| |
||S )
N)lang custom_nonbreaking_prefixes_filez:basic:z:web:r   r"   c                 S   s   g | ]}|  qS r   )strip).0patternr   r   r   
<listcomp>|   s    z!tokenize_file.<locals>.<listcomp>T)
return_straggressive_dash_splitsescapeprotected_patterns)r   BASIC_PROTECTED_PATTERNSWEB_PROTECTED_PATTERNSopen	readlinesr   r8   r7   )r*   r   r   r   
xml_escaper@   rB   custom_nb_prefixesmosesr)   moses_tokenizer   r   r   tokenize_fileM   s&   $rK   
detokenizez--xml-unescapez$Unescape special characters for XML.c                 C   s4   t |d}t|jd|d}ttttj| |||S )N)r9   T)r?   unescape)r   r   rL   r7   r%   mapstrsplit)r*   r   r   r   xml_unescaperI   moses_detokenizer   r   r   detokenize_file   s
   
rS   	normalizez--normalize-quote-commasz Normalize quotations and commas.z--normalize-numbersz-dzNormalize number.z--replace-unicode-punctsz2Replace unicode punctuations BEFORE normalization.z--remove-control-charsz.Remove control characters AFTER normalization.c           
      C   s*   t |||||d}t|j}	t| |	||S )N)norm_quote_commasnorm_numberspre_replace_unicode_punctpost_remove_control_chars)r   r   rT   r7   )
r*   r   r   r   normalize_quote_commasnormalize_numbersreplace_unicode_punctsremove_control_charsrI   moses_normalizer   r   r   normalize_file   s   $
r^   ztrain-truecasez--modelfilez-mzFilename to save the modelfile.)requiredr   z--is-asrz)A flag to indicate that model is for ASR.z--possibly-use-first-tokenz*Use the first token as part of truecasing.c           	      C   s,   t |d}|j| ||| d}|| d S )Nis_asrpossibly_use_first_tokenr   r3   )r	   train
save_model)	r*   r   r   r   	modelfilera   rc   rI   modelr   r   r   train_truecaser   s   
rh   truecasez$Filename to save/load the modelfile.z1Use the first token as part of truecase training.c                 C   sd   t j|st| }t|d}|j|||| d}	|| t||d}
t|
jdd}t	| |||S )Nr`   rb   )	load_fromra   T)r?   )
ospathisfiler   r	   rd   re   r   ri   r7   )r*   r   r   r   rf   ra   rc   iterator_copy	truecaserrg   rI   moses_truecaser   r   r   truecase_file  s   

rq   
detruecasez--is-headlinezWhether the file are headlines.c                 C   s$   t  }t|jd|d}t| |||S )NT)r?   is_headline)r
   r   rr   r7   )r*   r   r   r   rs   rI   moses_detruecaser   r   r   detruecase_file/  s
   
ru   )&rk   copyr   	functoolsr   r   r#   sacremoses.tokenizer   r   sacremoses.truecaser	   r
   sacremoses.normalizer   sacremoses.utilr   dictCONTEXT_SETTINGSgroupoptionversion_optionr   int__version__rP   resultcallbackresult_callbackr-   r1   r7   commandrK   rS   r^   rh   rq   ru   r   r   r   r   <module>   s  
$

&
