o
    ॵiY                     @   s   d dl mZ d dlmZmZ d dlZd dlZd dlm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d
dlmZ ejejejdG dd deZdS )    N)AnyDict)MosesDetokenizerMosesPunctNormalizerMosesTokenizer)	apply_bpe)Preprocessors)Preprocessor)PREPROCESSORS)Config)Fields	ModelFile   )	TextClean)module_namec                       sH   e Zd ZdZ	ddedef fddZdedeeef fd	d
Z	  Z
S )CanmtTranslationPreprocessorz3The preprocessor used in text correction task.
    N	model_dir
max_lengthc                    s  ddl m} 	 t j|i | tt|tj	| _
|t|d| _|t|d| _| j | _|d ur=|d nd| _| j
d d | _| j
d d	 | _t | _| jd
kr]t| _nt| jd| _t| jd| _t|| j
d d d | _tt| j| _d S )Nr   )
Dictionaryzdict.src.txtzdict.tgt.txtr      preprocessorsrc_langtgt_langzh)langsrc_bpefile)fairseq.datar   super__init__r   	from_fileospjoinr   CONFIGURATIONcfgload	vocab_src	vocab_tgtpadpadding_valuer   r   r   r   tcjiebatokr   punct_normalizerr   src_bpe_pathr   BPEopenbpe)selfr   r   argskwargsr   	__class__ b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/nlp/canmt_translation.pyr      s*   
z%CanmtTranslationPreprocessor.__init__inputreturnc                    s8   j dkr j|} j|}dt|}n fdd|D } fdd|D } j|	 
 }ddd |D } jj|ddd	}tj|d
d}| d }t j|}tj jg||  |jd}	tjt||	gdd}
tjt|	|gdd}tjt||	gdd}t|g}||||
d}|S )u  process the raw input data

        Args:
            data (str): a sentence
                Example:
                    '随着中国经济突飞猛近，建造工业与日俱增'
        Returns:
            Dict[str, Any]: the preprocessed data
            Example:
            {'net_input':
                {'src_tokens':tensor([1,2,3,4]),
                'src_lengths': tensor([4])}
            }
        r    c                    s   g | ]} j |qS r7   )_punct_normalizer	normalize.0itemr2   r7   r8   
<listcomp>N   s    z9CanmtTranslationPreprocessor.__call__.<locals>.<listcomp>c                    s   g | ]} j j|d d dqS )T)
return_straggressive_dash_splits)r,   tokenizer>   rA   r7   r8   rB   O   s    c                 S   s   g | ]}|qS r7   r7   )r?   xr7   r7   r8   rB   V   s    TF)
append_eosadd_if_not_existr   )shiftsr   )dtype)dim)
src_tokenssrc_lengthsprev_src_tokenssources)r   r*   cleanr,   cutr"   listr1   process_linestripsplitr&   encode_linetorchrollsizeminr   tensorr)   rJ   	unsqueezecat)r2   r9   	input_tok	input_bpetextinputsprev_inputslengthsmax_lenpaddingrO   outr7   rA   r8   __call__:   s@   

z%CanmtTranslationPreprocessor.__call__)N)__name__
__module____qualname____doc__strintr   r   r   rg   __classcell__r7   r7   r5   r8   r      s    "!r   )os.pathpathr!   typingr   r   r+   rW   
sacremosesr   r   r   subword_nmtr   modelscope.metainfor   modelscope.preprocessors.baser	    modelscope.preprocessors.builderr
   modelscope.utils.configr   modelscope.utils.constantr   r   
text_cleanr   register_modulenlpcanmt_translationr   r7   r7   r7   r8   <module>   s    