o
    ॵi5                     @   s   d dl mZmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ e ZejejejdG dd deZ ejejej!dG dd deZ"dS )    )AnyDictListTupleUnionN)Preprocessors)Preprocessor)PREPROCESSORS)"TextClassificationPreprocessorBase)NLPTokenizerForLSTM#TokenClassificationPreprocessorBase)NLPTokenizer)FieldsModeKeys)get_model_typeparse_label_mapping)
get_logger)module_namec                       sb   e Zd ZdddZdddddejdddf	dededeeef de	ded	e
d
ef fddZ  ZS )/SpeakerDiarizationDialogueDetectionPreprocessorNc                 K   s4   d|vr| j tjkrdnd |d< | j||fi |S )Nreturn_tensorspt)moder   	INFERENCEnlp_tokenizer)self	sequence1	sequence2kwargs r   T/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/speaker.py_tokenize_text   s   z>SpeakerDiarizationDialogueDetectionPreprocessor._tokenize_textlabelfirst_sequencesecond_sequencelabel2idr   
max_lengthuse_fastc
              	      s   |
 dd|
d< |
 dd|
d< |d ur|n|
 dd|
d< |
dd  d }|d ur.t|}t||||
d| _t |||||||	 d S )N
truncationTpaddingr%   sequence_length   )r&   tokenize_kwargs)getpopr   r   r   super__init__)r   	model_dirr"   r#   r!   r$   r   r%   r&   keep_original_columnsr   
model_type	__class__r   r   r/   !   s$   z8SpeakerDiarizationDialogueDetectionPreprocessor.__init__)N)__name__
__module____qualname__r    r   r   strr   r   r   intboolr/   __classcell__r   r   r3   r   r      s4    

r   c                       s   e Zd Zdddddejddddf
dedededed	ed
ef fddZde	ee
e f fddZdd Zdd Zdd Zdd Zdd Z  ZS ):SpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessorNtextr!   FTr0   r"   r$   label_all_tokensr   c              
      s   t  |||||||	|
 d }|d urt|}|dd|d< |dd|d< |d ur-|n|dd|d< |dd  |dk|d< t||||d	| _d S )
Nr'   Tr(   r%   r)   r*   lstmadd_special_tokens)r0   r2   r&   r+   )r.   r/   r   r,   r-   r   r   )r   r0   r"   r!   r$   r>   r   r%   r&   r1   return_textr   r2   r3   r   r   r/   @   s.   zCSpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor.__init__c                 K   s  |}| j tjkrt|tsJ d| jdd}|rL|d}|dks&| jr+t|}n!g }|	t|d |  |
d |	t||d d   |}|r`| j tjkr`| j|fi |\}}n| jjjrq| j|fi |\}}n| j|fi |\}}d}t|d D ]\}	}
|
| jjjkr|	} nq|dkrt|t|d D ]}d|d |< q| j tjkr| D ]}t|| d	||< q||fS |d
d  ||fS )NzsInput needs to be lists in training and evaluating,because the length of the words and the labels need to be equal.is_split_into_wordsFz[SEP]   	input_ids
label_maskr   offset_mapping)r   r   r   
isinstancelistr   get_tokenizer_kwargfindis_lstm_modelextendappend_tokenize_text_by_words	tokenizeris_fast"_tokenize_text_with_fast_tokenizer"_tokenize_text_with_slow_tokenizer	enumeratesep_token_idrangelenkeystorchtensor	unsqueezer-   )r   r=   r   tokensrB   sep_idx
tmp_tokens	encodingsword_idsidxtoken_idikeyr   r   r   r    `   s^   




zISpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor._tokenize_textc              	   K   sD  g }g }g }g }t |D ]A\}}| jjj|dd}	t|	dkr%| jjjg}	||	 |dgt|	  |dgdgt|	d    |||d fg q|d| jd}
|d|d| jd}| jd	rndnd}t||d
|  kr|d |d
|   }|d |d
|   }|d t	| }|
dkrdg| | dg|t| |   }|dg|t|   }| jjj
g| | | jjjg|  | jjjg|t| d
|    }|dg|d
   dg|t| d
|    }n'dg| | dg|  }| jjj
g| | | jjjg|  }|dg|d
   }||||d}|d fS )NF)r@   r      Tr(   r%   r)   r@      r   r   )rE   attention_maskrF   rG   )rT   r   rP   encoderW   unk_token_idrM   r,   rJ   sumcls_token_idrU   pad_token_id)r   r\   r   rE   rF   rG   rh   offsettokensubtoken_idsr(   r%   special_tokenr_   r   r   r   rO      s   



zRSpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor._tokenize_text_by_wordsc           
      K   s"  t |t}| j|fd|d|}g }| }g }tt|D ]N}|| d u r-|d q|| ||d  krO|d |sN|d d |d | d f|d< q|d |rd||| || d f q||d |  q| jd}	|	d	kr|d
gt|t|   }||d< ||d< ||fS )NT)return_offsets_mappingrB   Fre   rC   r   rG   r(   r%   rg   rF   )rH   rI   r   r`   rV   rW   rN   rJ   )
r   r\   r   rB   r_   rF   r`   rG   rc   r(   r   r   r   rR      sD   



z]SpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor._tokenize_text_with_fast_tokenizerc                 K   sr  | j tjkrt|tsJ dd }| j|fddi|}| j }d| }t| |s8td| d| d| dt	| ||\}}|
d	| jd	}	|
d
| jd
}
|
d| jdradnd}t||
d|  krw|d |
d|   }|d t| }|	d
krdg| | dg|
t| |   }|dg|
t|   }ndg| | dg|  }||d< ||d< ||fS )NzSlow tokenizer now only support str input in inference mode. If you are training models, please consider using the fast tokenizer.rB   F"get_label_mask_and_offset_mapping_zNo `z` method defined for tokenizer z>, please use a fast tokenizer instead, or try to implement a `z` methodr(   r%   r@   re   r   rf   rg   rG   rF   )r   r   r   rH   r8   r   get_tokenizer_classhasattrRuntimeErrorgetattrr,   rJ   rW   rk   )r   r\   r   r`   r_   tokenizer_namemethodrF   rG   r(   r%   rq   r   r   r   rS      sh   



z]SpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor._tokenize_text_with_slow_tokenizerc           
      C   s   g }g }| j j|}d}|D ]D}|d d dk}|r!|d n|dd  }|d |||d  | }|t| }	|rG|||	f n
|d d |	f|d< |	}q||fS )Nr   rf   z##TFrC   )r   rP   tokenizerN   indexrW   )
r   r=   rF   rG   r\   rn   ro   is_startstartendr   r   r   /get_label_mask_and_offset_mapping_BertTokenizer  s"   
zjSpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor.get_label_mask_and_offset_mapping_BertTokenizerc                 C   s   g }g }| j j|}d}d}|D ]O}|d dk}|r0|dd  }|d t|dkr/d}qn|d |||d  | }	|	t| }
|sJ|rR||	|
f n
|d d |
f|d< |
}d}q||fS )Nr   F_re   TrC   )r   rP   rz   rN   rW   r{   )r   r=   rF   rG   r\   rn   last_is_blankro   r|   r}   r~   r   r   r   5get_label_mask_and_offset_mapping_XLMRobertaTokenizer"  s.   

zpSpeakerDiarizationSemanticSpeakerTurnDetectionPreprocessor.get_label_mask_and_offset_mapping_XLMRobertaTokenizer)r5   r6   r7   r   r   r8   r   r:   r/   r   r   r    rO   rR   rS   r   r   r;   r   r   r3   r   r<   ;   s<     .6!'r<   )#typingr   r   r   r   r   rY   modelscope.metainfor   modelscope.preprocessorsr    modelscope.preprocessors.builderr	   =modelscope.preprocessors.nlp.text_classification_preprocessorr
   >modelscope.preprocessors.nlp.token_classification_preprocessorr   r   3modelscope.preprocessors.nlp.transformers_tokenizerr   modelscope.utils.constantr   r   modelscope.utils.hubr   r   modelscope.utils.loggerr   loggerregister_moduleaudiosen_cls_tokenizerr   token_cls_tokenizerr<   r   r   r   r   <module>   s0   
#

