o
    ॵi                     @   s(  d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' e' Z(eG dd deZ)e!j*ej+dG dd de#Z,dS )    N)	dataclass)AnyCallableDictListOptionalTupleUnion)nn)
DataLoaderDataset)tqdm)DataCollatorWithPadding)Trainers)Model
TorchModel)BertForTextRanking)	MsDataset)Preprocessor)TRAINERS)NlpEpochBasedTrainer)DEFAULT_MODEL_REVISION)
get_loggerc                   @   s    e Zd ZdZdZdZdd ZdS )SentenceEmbeddingCollatorz
    Wrapper that does conversion from List[Tuple[encode_qry, encode_psg]] to List[qry], List[psg]
    and pass batch separately to the actual collator.
    Abstract out data detail for the model.
       Nc                    s   dd |D dd |D  d   }fdd|D | jjjd| jdd	} d   } fd
d|D  | jjj d| jdd	}||dS )Nc                 S      g | ]}|d  qS )query .0fr   r   f/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/trainers/nlp/sentence_embedding_trainer.py
<listcomp>&       z6SentenceEmbeddingCollator.__call__.<locals>.<listcomp>c                 S   r   )docsr   r   r   r   r!   r"   '   r#   r   c                    s    i | ]   fd dD qS )c                       g | ]}|  qS r   r   r   elekr   r!   r"   )   r#   ASentenceEmbeddingCollator.__call__.<locals>.<dictcomp>.<listcomp>r   r   )qqr(   r!   
<dictcomp>)   s     z6SentenceEmbeddingCollator.__call__.<locals>.<dictcomp>
max_lengthpt)paddingr.   return_tensorsc                    s&   i | ]  t  fd dD g qS )c                    r%   r   r   r&   r(   r   r!   r"   0   r#   r*   )sumr+   )ddr(   r!   r-   0   s   & )r   r$   )keys	tokenizer
_tokenizerpadr.   )selffeaturesr4   
q_collated
d_collatedr   )r3   r,   r!   __call__%   s&   
z"SentenceEmbeddingCollator.__call__)__name__
__module____qualname____doc__r.   r5   r<   r   r   r   r!   r      s
    r   )module_namec                       s   e Zd Zdddddddddef
deeeeje	f  dee	 dee
 dee
 dee
 deeeef  d	eeeef  d
ee deejjejjjf dee	 f fddZ fddZdd Z  ZS )SentenceEmbeddingTrainerN)NNmodelcfg_filecfg_modify_fnarg_parse_fndata_collatortrain_dataseteval_datasetpreprocessor
optimizersmodel_revisionc                    s,   t  jd|||||||	|||
d
| d S )N)
rC   rD   rE   rF   rG   rJ   rK   rH   rI   rL   r   )super__init__)r8   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   kwargs	__class__r   r!   rN   <   s   
z!SentenceEmbeddingTrainer.__init__c                    s0   |du rt | jj| jjd}t j|fi |S )zGet the data collator for both training and evaluating.

        Args:
            data_collator: The input data_collator param.

        Returns:
            The train_data_collator and eval_data_collator, can be None.
        N)r5   r.   )r   train_preprocessornlp_tokenizerr.   rM   get_data_collator)r8   rG   rO   rP   r   r!   rT   Y   s   	z*SentenceEmbeddingTrainer.get_data_collatorc                 C   s   i S )Nr   )r8   r   r   r!   evauateh   s   z SentenceEmbeddingTrainer.evauate)r=   r>   r?   r   r   r	   r   r
   Modulestrr   r   r   r   r   torchoptim	Optimizerlr_scheduler_LRSchedulerrN   rT   rU   __classcell__r   r   rP   r!   rB   9   sH    	
rB   )-timedataclassesr   typingr   r   r   r   r   r   r	   numpynprX   r
   torch.utils.datar   r   r   transformersr   modelscope.metainfor   modelscope.models.baser   r   modelscope.models.nlpr    modelscope.msdatasets.ms_datasetr   modelscope.preprocessors.baser   modelscope.trainers.builderr   modelscope.trainers.nlp_trainerr   modelscope.utils.constantr   modelscope.utils.loggerr   loggerr   register_modulenlp_sentence_embedding_trainerrB   r   r   r   r!   <module>   s.   $