o
    ॵi1                     @   sz  d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
mZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- e+ Z.e j/dd Z0G dd dej1j2j3Z4eG dd dZ5G dd deZ6e"j7ej8dG dd de$Z9dS )    N)defaultdict)	dataclass)LooseVersion)partial)CallableDictListOptionalTupleUnion)nn)
DataLoaderDataset)DistributedSampler)Trainers)
TorchModel)	MsDataset)Preprocessor)TRAINERS)EpochBasedTrainer)worker_init_fn)DEFAULT_MODEL_REVISIONModeKeys)
get_logger)get_dist_infoc              	   g   sv    | du r
dV  dS t |dkrtt| g|R d } tj }tj|  zdV  W tj| dS tj| w )zgContext manager which seeds the NumPy PRNG with the specified seed and
    restores the state afterwardNr   g    .A)leninthashnprandom	get_stateseed	set_state)r!   
addl_seedsstate r%   j/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/trainers/nlp/faq_question_answering_trainer.py
numpy_seed   s   
r'   c                   @   s>   e Zd Zdd Zdd ZdddZdd	 Zd
d Zdd ZdS )EpisodeSamplerc
              
      s  |_ |_|_|_|_|_|_|	_d_d_	d_
d_d_|_i }
|}|jdkrEtdt| dt|  t|D ]B\}}||v rRqI|j	}|j
}|d u sh|d u riqI|jj}|
|i }||
|< ||g }|||< || qI|
_ttj _fd	d
jD  tt  fdd
tjD _d}j  D ]\}}|  D ]
\}}|t|7 }qq|jdkrt|j d d| dtj  |j_d S )Nr   labeltextdomaindefault_domaintrainznum. of bad sample ids:/c                    s   g | ]	}t  j| qS r%   )r   domain_label_tokens).0r+   selfr%   r&   
<listcomp>W   s    z+EpisodeSampler.__init__.<locals>.<listcomp>c                    s   g | ]
\}} |  qS r%   r%   )r0   ir+   )domain_label_cnttotalr%   r&   r3   [   s    
z: label size:z, data size:z,                 domain_size:)!datasetk_shotn_wayr_query
min_labelsr!   rank
world_sizesteplabel_field
text_fielddomain_fieldr,   episodeget_bad_sampleidsmodeloggerinfor   	enumerate
_get_fieldgetappendremove_invalid_labelsr/   sortedlistkeysdomainsfloatsumdomain_to_probitems)r2   r7   r8   r9   r:   r;   r!   n_iterr<   r=   domain_label_sampleidbad_sample_idssample_indexsampler)   r*   r+   label_tokenssample_list	data_sizetokensr%   )r5   r2   r6   r&   __init__1   st   




zEpisodeSampler.__init__c              	   c   s<   t | jD ]}| j| j | j }t|| jf { |  jd7  _tjj	| j
| jdddd }tt| j|  }t| jt|}tjj	|t|t|dd }g }|d | D ](}| j| | }	| j| j }
tt|	t|
}tjj	|	|dd }|| qYdd |D }|V  W d    n1 sw   Y  qd S )N   F)psizereplacer   )r`   ra   c                 S   s   g | ]}t |qS r%   )r   )r0   nr%   r%   r&   r3          z+EpisodeSampler.__iter__.<locals>.<listcomp>)rangerB   r>   r=   r<   r'   r!   r   r   choicerO   rR   rL   rM   r/   rN   minr9   r   tolistr8   r:   r   extend)r2   r4   r!   r+   
all_labelsNlabelsbatchr)   
candidatesnum_samplesKtmpr%   r%   r&   __iter__i   sJ   
zEpisodeSampler.__iter__Nc                 C   s    | ||}|d urt|S d S N)rI   str)r2   objkeydefaultvaluer%   r%   r&   rH      s   zEpisodeSampler._get_fieldc           	      C   s   t  }t  }i }| D ]5\}}i ||< | D ]\}}t|| jk r)|| q||| |< qt|| | jk rA||= || q|S rr   )setrS   r   r8   addr;   )	r2   rU   removed_labelsremoved_domainsresultr+   label_to_samplesr)   samplesr%   r%   r&   rK      s   
z$EpisodeSampler.remove_invalid_labelsc                 C   s  t dd }t|D ]'\}}| j|| j| jd}| j|| jdd}|| | || || jf q
g }g }| D ]C\}}	g }
g }|	 D ],\}}t	dd |D }t
|dkrd|
dd |D  qF|d	d |d
d  D  qF||
 || q:t	t|}|t	t| |S )Nc                   S   s   t tS rr   )r   rM   r%   r%   r%   r&   <lambda>   s    z2EpisodeSampler.get_bad_sampleids.<locals>.<lambda>)rv    c                 S      g | ]}|d  qS )r^   r%   r0   itemr%   r%   r&   r3      rc   z4EpisodeSampler.get_bad_sampleids.<locals>.<listcomp>   c                 S   r   r   r%   r   r%   r%   r&   r3      rc   c                 S   r   r   r%   r   r%   r%   r&   r3      rc   r^   )r   rG   rH   rA   r,   r@   rJ   r?   rS   rx   r   rh   rM   update)r2   r7   domain_text_to_sampleslocal_indexrX   r+   idxoverall_conflict_resultoverall_duplicate_resulttext_to_samplesconflict_resultduplicate_resultr*   r~   	label_cntr|   r%   r%   r&   rC      s0   

z EpisodeSampler.get_bad_sampleidsc                 C   s   | j S rr   )rB   r1   r%   r%   r&   __len__   s   zEpisodeSampler.__len__rr   )	__name__
__module____qualname__r]   rq   rH   rK   rC   r   r%   r%   r%   r&   r(   /   s    8
r(   c                   @   s,   e Zd ZdefddZd	ddZdd ZdS )
FewShotCollatorpreprocessorc                 C   s"   || _ || _d| _d| _d| _d S )Nr)   r*   r+   )r   r8   r?   r@   rA   )r2   r   r8   r%   r%   r&   r]      s
   
zFewShotCollator.__init__Nc                 C   s   t |||p|||S rr   )getattrrI   )r2   rt   ru   rv   r%   r%   r&   rH      s   zFewShotCollator._get_fieldc                    s   t t}|D ]}|j}|j |  | qg }g }g }| D ].\ }|d j }	|jd  }
||
 | fdd|	D  | gt	|
  q(|||d}j
|tjd}|S )Nc                    s   g | ]
}j |j iqS r%   )r@   r?   )r0   tr)   r2   r%   r&   r3      s
    z,FewShotCollator.__call__.<locals>.<listcomp>)	query_setsupport_setquery_label)rD   )r   rM   rH   r@   r?   rJ   rS   r8   rh   r   r   r   	INFERENCE)r2   r~   label_to_textsrX   r*   r   query_labelsr   textssqr|   r%   r   r&   __call__   s,   
zFewShotCollator.__call__rr   )r   r   r   r   r]   rH   r   r%   r%   r%   r&   r      s    
r   c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )

FaqDatasetc                 C   s
   || _ d S rr   data)r2   r   r%   r%   r&   r]         
zFaqDataset.__init__c                 C   s
   | j | S rr   r   )r2   r4   r%   r%   r&   __getitem__   r   zFaqDataset.__getitem__c                 C   s   || j |< d S rr   r   )r2   ru   rw   r%   r%   r&   __setitem__   s   zFaqDataset.__setitem__c                 C   s
   t | jS rr   )r   r   r1   r%   r%   r&   r      r   zFaqDataset.__len__N)r   r   r   r]   r   r   r   r%   r%   r%   r&   r      s
    r   )module_namec                       s"  e Zd Zdddddddddedfdeeeeje	f  dee	 dee
 dee
 deee
ee	e
f f  d	eeeeef  d
eeeeef  deeeee	ef f  deejjejjjf dee	 def fddZedd ZedefddZ				ddedededededefddZ  ZS ) FaqQuestionAnsweringTrainerN)NN*   modelcfg_filecfg_modify_fnarg_parse_fndata_collatortrain_dataseteval_datasetr   
optimizersmodel_revisionr!   c                    s|   t |tr	t|}t |trt|}tt| j|||||||||	|
|fi | | jd}t| j	|| _
t| j|| _d S )Nztrain.sampler.k_shot)
isinstancerM   r   superr   r]   cfgsafe_getr   train_preprocessortrain_data_collatoreval_preprocessoreval_data_collator)r2   r   r   r   r   r   r   r   r   r   r   r!   kwargsr8   	__class__r%   r&   r]      s(   



z$FaqQuestionAnsweringTrainer.__init__c                 C   s   | j | j S rr   )_train_iters_per_epoch
max_epochsr1   r%   r%   r&   	max_iters  s   z%FaqQuestionAnsweringTrainer.max_itersreturnc                 C   s   dS )Nr   r%   r1   r%   r%   r&   
inner_iter  s   z&FaqQuestionAnsweringTrainer.inner_iterFTr   r7   workers_per_gpudistshufflec              	   K   s   t  \}}	d }
| jdi }||d< |jtjkr!| jd|d< n| jd|d< ||d< |	|d< t|fi |}|d urEtt|||dnd }t	t
jt	d	krU||d
< n
|du r_| jd t|f|
|||dd|d|}|S )Nztrain.samplerr!   ztrain.train_iters_per_epochrT   zevaluation.val_iters_per_epochr<   r=   )num_workersr<   r!   z1.7.0persistent_workersTzNpersistent_workers is invalid because your pytorch version is lower than 1.7.0
pin_memoryF)samplerr   batch_samplerr   r   )r   r   r   rD   r   TRAINr(   r   r   r   torch__version__rE   warningr   pop)r2   r7   r   r   r   r!   r   r   r<   r=   r   sampler_cfgr   init_fndata_loaderr%   r%   r&   _build_dataloader_with_dataset  sN   



	z:FaqQuestionAnsweringTrainer._build_dataloader_with_dataset)FTr   F)r   r   r   r   r	   r   r   r   Modulers   r   r   r   r   r   r   r
   r   optim	Optimizerlr_scheduler_LRSchedulerr   r]   propertyr   r   boolr   r   __classcell__r%   r%   r   r&   r      s|    

	

!
r   ):
contextlibcollectionsr   dataclassesr   distutils.versionr   	functoolsr   typingr   r   r   r	   r
   r   numpyr   r   r   torch.utils.datar   r   torch.utils.data.distributedr   modelscope.metainfor   modelscope.models.baser   modelscope.msdatasetsr   modelscope.preprocessorsr   modelscope.trainers.builderr   modelscope.trainers.nlp_trainerr   modelscope.trainers.trainerr   modelscope.utils.constantr   r   modelscope.utils.loggerr   modelscope.utils.torch_utilsr   rE   contextmanagerr'   utilsr   BatchSamplerr(   r   r   register_modulefaq_question_answering_trainerr   r%   r%   r%   r&   <module>   s>    
 
'