o
    ॵi'                     @   s   d dl Z d dlmZ d dlZd dlmZmZ d dlZ	d dl
Zd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZ d d
lmZ ejdkr_ejj Ze!  e Z"dgZ#ej$ej%ej&dG dd deZ'dS )    N)AnyDict)	Pipelines)Model)
OutputKeys)Pipeline)	PIPELINES)ConfigConfigFields)	ModelFileTasks)
get_loggerz2.0LanguageIdentificationPipeline)module_namec                       s   e Zd ZdZdef fddZdedefddZdedeee	f fd	d
Z
deee	f deee	f fddZdeee	f deee	f fddZ  ZS )r   u[   Language Identification Pipeline.

    Examples:

    >>> from modelscope.pipelines import pipeline
    >>> from modelscope.utils.constant import Tasks

    >>> pipeline_ins = pipeline(Tasks.text_classification, 'damo/nlp_language_identification-classification-base')
    >>> pipeline_ins('Elon Musk, co-founder and chief executive officer of Tesla Motors.\n' \
    >>>              'Gleichzeitig nahm die Legion an der Befriedung Algeriens teil, die von.\n' \
    >>>              '使用pipeline推理及在线体验功能的时候，尽量输入单句文本，如果是多句长文本建议人工分句。'

    >>> {
    >>>    "labels":[
    >>>        "en",
    >>>        "de",
    >>>        "zh"
    >>>    ],
    >>>    "scores":[
    >>>        [('en', 0.99)],
    >>>        [('de', 1.0)],
    >>>        [('zh', 1.0)]
    >>>    ]
    >>> }
    modelc              	      s  t  jdd|i| |}d| _ttj|tj	| _
tj|| j
tj d }g }g }tt|dD ]/\}}| }z|d}|||f |||f W q4 tyc   | jratd|| Y q4w t|| _t|| _| jdd| _| jd	d
| _tj|| j
tj d }	tdd tt|	dddD | _d| _t  tjdd}
d|
j_ tj!|
d| _"tj#j$%| j"tj#j&j'g| t( }| jr|) D ]
}t|j*|+  q|,d| _-|,d}|,d}||d| _.t/ }t0 }| j"1||g tj#j$%| j"tj#j&j'g| dS )zBuild a language identification pipeline with a model dir or a model id in the model hub.

        Args:
            model: A Model instance.
        r   Fvocabrbzutf-8zerror vocab:<UNK>   z</S>r   labelc                 S   s   g | ]
\}}||  fqS  )strip).0iwr   r   l/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/pipelines/nlp/language_identification_pipline.py
<listcomp>]   s    z;LanguageIdentificationPipeline.__init__.<locals>.<listcomp>rutf8)encodingunkT)allow_soft_placement)configz	src_cid:0zoutput_label:0zpredict_score:0)
output_idsoutput_scoreNr   )2super__init__debugr	   	from_fileospathjoinr   CONFIGURATIONcfgr
   preprocessor	enumerateopenr   decodeappendUnicodeDecodeErrorprintdictr   vocab_reversegetunk_idpad_idr   	unk_labeltfreset_default_graphConfigProtogpu_optionsallow_growthSession_sessionsaved_modelloaderloadtag_constantsSERVINGget_default_graphget_operationsnamevaluesget_tensor_by_name	input_idsoutputglobal_variables_initializerlocal_variables_initializerrun)selfr   kwargs
export_dirjoint_vocab_file
vocabfilesvocabfiles_reverser   r   joint_label_file	tf_configdefault_graphopoutput_labelr$   init
local_init	__class__r   r   r&   :   st   







z'LanguageIdentificationPipeline.__init__inputreturnc           
         s:  |  }d}t|d|}d}t|d|}d}t|d|}dd d d fdd	|D }td
tj}t|d|}ddd	 | D }g }| D ]"}| j	|| j
}	t|dkro|	| j
kro|d | j
kroqR||	 qRt|dkr|d | j
kr|dd  }t|dkr|d | j
kr|d d }|S )Nz/<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6}); z\S+[./]\S+\s?z
\S*@\S*\s?c                 S   sd   t | }d|k s|dkr|d8 }t|S |dkrd}t|S |dv r(d}t|S |dv r.d	}t|S )
Ni   i_  i  i 0      )i0  i0  i   i   i   i   "   )i   i   i   i   '   )ordchr)ucharinside_coder   r   r   stringpartQ2B   s   	zELanguageIdentificationPipeline._lid_preprocess.<locals>.stringpartQ2BuV   ,-+"'\&.!=:;°·$«»|±[]{}_?<>~^*/%#@()，。！《》？、`Â …‼️c                    s    g | ]}| vr|nd qS ) r   )r   cm_noisyCharsrj   r   r   r      s    zBLanguageIdentificationPipeline._lid_preprocess.<locals>.<listcomp>u`   [😀-🙏🌀-🗿🚀-🛿🇠-🇿🤦-🤷𐀀-􏿿✂-➰♀-♂☀-⭕‍⏏⏩⌚️〰]+rk   c                 S   s0   g | ]}t td |rt td|s|qS )z\dz^[a-z0-9+-_]+$)boolresearchmatch)r   itemr   r   r   r      s    r   r   )lowerrp   subr+   compileUNICODEsplitr   r   r7   r8   lenr2   )
rQ   r`   sentenceCLEANRURLREEMAILREEMOJIREoutidsr   tmpr   rm   r   _lid_preprocess}   sD   
z.LanguageIdentificationPipeline._lid_preprocessc           	         s   | d} fdd|D } jr/t||D ]\}}td| tdd fdd|D  qtdd |D }|D ]}| jg|t|   q:t	
|}d	|i}|S )
N
c                    s"   g | ]}|  d kr |qS )rb   )r   r   )r   r{   rQ   r   r   r      s
    z=LanguageIdentificationPipeline.preprocess.<locals>.<listcomp>zraw:zres:rb   c                    s$   g | ]} j | jd dqS )r   rk   )r6   r7   r8   replace)r   widr   r   r   r      s    c                 S   s   g | ]}t |qS r   )rz   )r   idsr   r   r   r      s    rL   )ry   r'   zipr4   r+   maxextendr9   rz   nparray)	rQ   r`   
sentenceltinput_ids_ltr{   rL   maxlenr   resultr   r   r   
preprocess   s$   



z)LanguageIdentificationPipeline.preprocessc                 C   sR   | j   | j|d i}| j j| j|d}|W  d    S 1 s"w   Y  d S )NrL   )	feed_dict)rA   
as_defaultrL   rP   rM   )rQ   r`   r   sess_outputsr   r   r   forward   s
   $z&LanguageIdentificationPipeline.forwardinputsc                 C   s   |d }t g d}g }g }|D ]A}g }t|| j D ]\}}	|	|vr%q||	|f qt|dd ddd d }t|dkrCd	g}|| ||d d  qd
d |D }
tj|tj	|
i}|S )Nr$   )hafamarazbebgbnbscacecocscydadeeleneoeseteufafifrfygagdglguhahawhehihmnhrhthuhyidigisitjajvkakkkmknkokukylaloltlvmgmimkmlmnmrmsmtmynenlnonypaplpsptrorusdsiskslsmsnsosqsrstsusvswtatetgthtltrugukuruzvixhyiyozhzzh-twzuc                 S   s   | d S )Nr   r   )r   r   r   r   <lambda>   s    z<LanguageIdentificationPipeline.postprocess.<locals>.<lambda>T)keyreverse   r   )r   g      ?c                 S   s   g | ]	}d d |D qS )c                 S   s&   g | ]\}}|d kr|t |dfqS )g{Gz?   )round)r   r   scorer   r   r   r      s    zILanguageIdentificationPipeline.postprocess.<locals>.<listcomp>.<listcomp>r   )r   labels_scoresr   r   r   r      s
    
z>LanguageIdentificationPipeline.postprocess.<locals>.<listcomp>)
setr   r   rJ   r2   sortedrz   r   LABELSSCORES)rQ   r   output_scores_rawsupported_104_langlabels_scores_ltoutput_labelsr$   tmpltsloutput_scoresr   r   r   r   postprocess   s,   
z*LanguageIdentificationPipeline.postprocess)__name__
__module____qualname____doc__strr&   listr   r   r   r   r   r  __classcell__r   r   r^   r   r      s    CD"*)(r)   os.pathr*   osprp   typingr   r   numpyr   
tensorflowr;   modelscope.metainfor   modelscope.models.baser   modelscope.outputsr   modelscope.pipelines.baser   modelscope.pipelines.builderr   modelscope.utils.configr	   r
   modelscope.utils.constantr   r   modelscope.utils.loggerr   __version__compatv1disable_eager_executionlogger__all__register_moduletext_classificationlanguage_identificationr   r   r   r   r   <module>   s.   
