o
    ॵiB                     @   sl  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 ddl2m3Z3 dZ4e1e4Z5de j6d< e%j7ej8dG dd de"Z9dS )    N)defaultdict)ceil)CallableDictListOptionalTupleUnion)distributed)nnDataset)Trainers)
TorchModel)	MsDataset)pipeline)Preprocessor)EpochBasedTrainerNlpEpochBasedTrainer)TRAINERS)build_optimizer)Config)DEFAULT_MODEL_REVISIONModeKeysTasks)func_receive_dict_inputs)
get_logger   )is_paralleltrueTOKENIZERS_PARALLELISM)module_namec                       sn  e Zd Zdddddddedddddfdeeeeje	f  d	ee	 d
ee
 deeeef  deeeef  deeeee	ef f  deejjejjjf dee	 def fddZ	d-deejjjeeejjj f dede	dee f fddZdd Zdd Zdd Zdd Zd.d!d"Z d#d$ Z!	d-d%ee	 d&ee	e"f fd'd(Z#d&eee	ef  fd)d*Z$d+d, Z%  Z&S )/SiameseUIETrainerN)NN*      i`  i     modelcfg_filecfg_modify_fntrain_dataseteval_datasetpreprocessor
optimizersmodel_revisionseedc                    sN   t d || _|| _|| _|
| _t jd|||| j||||||	d
| dS )a	  Epoch based Trainer, a training helper for PyTorch.

        Args:
            model (:obj:`torch.nn.Module` or :obj:`TorchModel` or `str`): The model to be run, or a valid model dir
                or a model id. If model is None, build_model method will be called.
            cfg_file(str): The local config file.
            cfg_modify_fn (function): Optional[Callable] = None, config function
            train_dataset (`MsDataset` or `torch.utils.data.Dataset`, *optional*):
                The dataset to use for training.

                Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
                distributed fashion, your iterable dataset should either use a internal attribute `generator` that is a
                `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will
                manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally
                sets the seed of the RNGs used.
            eval_dataset (`MsDataset` or `torch.utils.data.Dataset`, *optional*): The dataset to use for evaluation.
            preprocessor (:obj:`Preprocessor`, *optional*): The optional preprocessor.
                NOTE: If the preprocessor has been called before the dataset fed into this
                trainer by user's custom code,
                this parameter should be None, meanwhile remove the 'preprocessor' key from the cfg_file.
                Else the preprocessor will be instantiated from the cfg_file or assigned from this parameter and
                this preprocessing action will be executed every time the dataset's __getitem__ is called.
            optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler._LRScheduler]`, *optional*): A tuple
                containing the optimizer and the scheduler to use.
            model_revision (str): The model version to use in modelhub.
            negative_sampling_rate (float): The rate to do negative sampling.
            slide_len (int): The length to slide.
            max_len (int): The max length of prompt + text.
            hint_max_len (int): The max length of prompt.
            seed (int): The optional random seed for torch, cuda, numpy and random.
        z*******************)
r&   r'   r(   data_collatorr)   r*   r+   r,   r-   r.   N )print	slide_lenmax_lenhint_max_lennegative_sampling_ratesuper__init___nn_collate_fn)selfr&   r'   r(   r)   r*   r+   r,   r-   r.   r5   r2   r3   r4   kwargs	__class__r0   _/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/trainers/nlp/siamese_uie_trainer.pyr7   (   s&   2
zSiameseUIETrainer.__init__datasets	model_cfgmodec                    s6   |t jkr
| |}tt| jd|| j||d|S )N)r>   r?   r@   r+   r0   )r   TRAINload_datasetr6   r"   build_datasetcfg)r9   r>   r?   r@   r+   r:   r;   r0   r=   rC   m   s   

zSiameseUIETrainer.build_datasetc                 C   s:   d| j _| j| j f| j| j| jd| jjdi }|S )a,   Builder torch dataloader for training.

        We provide a reasonable default that works well. If you want to use something else, you can change
        the config for data.train in configuration file, or subclass and override this method
        (or `get_train_dataloader` in a subclass.
        N)distr.   
collate_fn
dataloader)	r)   r+   _build_dataloader_with_dataset_dist_seedtrain_data_collatorrD   trainget)r9   data_loaderr0   r0   r=   get_train_dataloader}   s   z&SiameseUIETrainer.get_train_dataloaderc                    sV   |sd S |D ]" |t | g    fdd|D 7  < | |  || g  qd S )Nc                    s   g | ]}| kr|qS r0   r0   ).0vkr0   r=   
<listcomp>       z:SiameseUIETrainer.get_brother_type_map.<locals>.<listcomp>)tupleget_brother_type_map)r9   schemabrother_type_mapprefix_typesr0   rR   r=   rW      s   z&SiameseUIETrainer.get_brother_type_mapc                    sF  g }t |D ]r\}}t|d |d< t|d |d< tt}|d D ]1 d} D ]*}||d  d7 }|d |d d}||| vrL|| | ||d  d	7 }q+q%tt}	| |d |	g  |d D ]M d}t  D ]D\}
}t fd
dt|
d D }|		|g D ]}|| d }||vrt

 | jk rg ||< q||d  d7 }||d  d	7 }qpqh|d D ]}| d}||vrt

 | jk rg ||< qt |D ]\}
}|d  d|
 ||d || d}|d }|d }| |gd }| j|g| jddd }|j|d< |	dg }| |||d |\}}t|| jkr4tt|| j | j d nd}t|D ]=}|| j || j | j }}|||j|| |j|| |j|j|j|| |j||| ||| d
}|| q:qqddlm} ||}t
tt|dD ]}td| d||  d q|S )N	info_listrX    typez: spanoffset)r^   r_   z, c                    s   g | ]} | d  qS )r]   r0   )rP   jinfor0   r=   rT      rU   z2SiameseUIETrainer.load_dataset.<locals>.<listcomp>r$   id-text)rc   hintre   spansr   T)
max_length
truncationoffsetsrg   )
rc   shifttokens	token_idshint_tokenshint_token_idsattention_maskscross_attention_maskshead_labelstail_labelsr      zSample z of the training set: .)	enumeratejsonloadsr   listappendrW   rV   rangerM   randomr5   train_preprocessorr4   rj   _get_labelslenr3   r   r2   rl   idsattention_maskr>   r   	from_listsampleloggerrb   )r9   raw_datasetdatanum_line
raw_samplehint_spans_maprf   itemr^   rY   ikeystneg_hintrS   r   uuidre   tokenized_inputtokenized_hintentitiesrr   rs   	split_numr`   abr   r)   indexr0   ra   r=   rB      s   





$
zSiameseUIETrainer.load_datasetc              	   C   sZ  t |}dg| }dg| }i }tt |D ]}	||	 }
t|
d |
d D ]}|	||< q%q|D ]y}|d \}}|d8 }||vre|d7 }|t |kratd|d |d ||d d |d d   n||vs?||vr|d8 }|dk rtd|d |d ||d d |d d   n||vsi|t |ks|dk rq/|| }|| }d||< d||< q/||fS )Nr   r$   r_   hr^   t)r   r{   r1   )r9   re   r   rj   r   
num_tokensrr   rs   char_index_to_token_index_mapr   r_   r`   er   r   
token_head
token_tailr0   r0   r=   r~      sF   



zSiameseUIETrainer._get_labelsr   c                 C   s0   g }|D ]}| ||g| jt|    q|S N)rz   r3   r   )r9   r   valresseqr0   r0   r=   _padding  s    zSiameseUIETrainer._paddingc                 C   s  t j| dd |D t jd}t j| dd |D t jd}t j| dd |D t jd}t j| dd |D t jd}t j| dd |D t jd}t j| dd |D t jd}|d	jd
d  }|d|d  d 7 }t	| j
|}	|d d d |	f }|d d d |	f }|d d d |	f }|d d d |	f }|d	jd
d  }|d|d  d 7 }t	| j|}
|d d d |
f }|d d d |
f }||||||dS )Nc                 S      g | ]}|d  qS )rm   r0   rP   r   r0   r0   r=   rT         z4SiameseUIETrainer._nn_collate_fn.<locals>.<listcomp>)dtypec                 S   r   )ro   r0   r   r0   r0   r=   rT     r   c                 S   r   )rp   r0   r   r0   r0   r=   rT     r   c                 S   r   )rq   r0   r   r0   r0   r=   rT     r   c                 S   r   )rr   r0   r   r0   r0   r=   rT     r   c                 S   r   )rs   r0   r   r0   r0   r=   rT     r   r   )dim   )	input_idsrp   hint_idsrq   rr   rs   )torchtensorr   longfloatgtsummaxr   minr3   r4   )r9   batchrm   ro   rp   rq   rr   rs   batch_max_lentruncate_lenhint_truncate_lenr0   r0   r=   r8     sV   z SiameseUIETrainer._nn_collate_fncheckpoint_pathreturnc                 O   s0  t tj| jt| jd}|dur"tj|r"ddl	m
} |||  | j  tj| _| j| _d } }}d| j_| jD ]K}	|	d }
t|	d }t|	d }||
|d	d
 }tdd |D }tdd |D }t|t|t||}}}||7 }||7 }||7 }q<| |||\}}}|||dS )a  evaluate a dataset

        evaluate a dataset via a specific model from the `checkpoint_path` path, if the `checkpoint_path`
        does not exist, read from the config file.

        Args:
            checkpoint_path (Optional[str], optional): the model path. Defaults to None.

        Returns:
            Dict[str, float]: the results about the evaluation
            Example:
            {"accuracy": 0.5091743119266054, "f1": 0.673780487804878}
        )deviceNr   )LoadCheckpointHook绽|=re   rX   r[   )inputrX   outputc                 S      g | ]}t |qS r0   strr   r0   r0   r=   rT   Z  r   z.SiameseUIETrainer.evaluate.<locals>.<listcomp>c                 S   r   r0   r   r   r0   r0   r=   rT   [  r   )	precisionrecallf1)r   r   siamese_uier&   r   r   ospathisfilemodelscope.trainers.hooksr   load_checkpointevalr   EVAL_modetrain_dataloadereval_dataloaderr*   r+   rw   rx   setr   intersectioncompute_metrics)r9   r   argsr:   pipeline_uier   num_pred
num_recallnum_correctr   re   rX   gold_info_listpred_info_listpred_info_list_setgold_info_list_setr   r   cr   r   r   r0   r0   r=   evaluate:  s8   



zSiameseUIETrainer.evaluatec                 C   s   | j S )a*  Get the metric class types.

        The first choice will be the metrics configured in the config file, if not found, the default metrics will be
        used.
        If no metrics is found and the eval dataset exists, the method will raise an error.

        Returns: The metric types.

        )r   )r9   r0   r0   r=   get_metricse  s   
zSiameseUIETrainer.get_metricsc                 C   sZ   ||  krdkrdS  |t | }|t | }d| | ||  }|dkr(dS |||fS )Nr   )r$   r$   r$   r   )r   r   r   )r   )r9   r   r   r   r   r   r   r0   r0   r=   r   q  s   
z!SiameseUIETrainer.compute_metricsr   )r   )'__name__
__module____qualname__r   r   r	   r   r   Moduler   r   r   r   r   r   r   r   optim	Optimizerlr_scheduler_LRSchedulerintr7   utilsr   r   r   rC   rO   rW   rB   r~   r   r8   r   r   r   r   __classcell__r0   r0   r;   r=   r"   %   s~    
	J	Q
 .

+r"   ):r   r|   timecollectionsr   mathr   typingr   r   r   r   r   r	   rw   numpynpr   r
   rE   r   torch.utils.datar   modelscope.metainfor   modelscope.models.baser   modelscope.msdatasetsr   modelscope.pipelinesr   modelscope.preprocessors.baser   modelscope.trainersr   r   modelscope.trainers.builderr   %modelscope.trainers.optimizer.builderr   modelscope.utils.configr   modelscope.utils.constantr   r   r   modelscope.utils.file_utilsr   modelscope.utils.loggerr   parallel.utilsr   PATHr   environregister_modulesiamese_uie_trainerr"   r0   r0   r0   r=   <module>   s<    
