o
    ߥiKF                     @   s   d Z ddlZddlZddlZddlmZ ddlmZ ddlZ	ddl
Z
ddlZ
ddlmZmZ ddlmZ ddlmZ dd	lmZ G d
d de
jjjZG dd dZdd Z		dddZdd Zdd Zdd Zdd Zdd Z dS )z&parses arguments and preps data loader    N)bisect_right)
accumulate)mpuprint_rank_0   )
data_utils)ConstructBlockStrategy)make_tokenizerc                       s@   e Zd Z			d fdd	Zdd Zedd	 Zd
d Z  ZS )MultiTaskDatasetT皙?@ c                    s   t t|   || _|| _|| _| _dd |D | _t	 fdd| jD | _
t| j| _tt| j| _| jrIttt| j| j| j
 nttt| j| j |  j
| j
   _
d S )Nc                 S      g | ]}t |qS  )len).0datasetr   r   ]/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/mglm/configure_data.py
<listcomp>-       z-MultiTaskDataset.__init__.<locals>.<listcomp>c                    s   g | ]	}t |  qS r   )min)r   length	max_limittemperaturer   r   r   /   s    )superr
   __init__tasksdatasetsreweightr   lensnparrayweightssum	total_lenlistr   cumulative_lensr   zip)selfr   r   r   r   r   	__class__r   r   r   "   s   zMultiTaskDataset.__init__c                 C   s
   | j d S )Ni  )r$   )r(   r   r   r   __len__8   s   
zMultiTaskDataset.__len__c                 C   s   | d }| d }| d }| d }| d }| d }t |jdkr4|| }|| }|| }|| }|| }n|| }|jsB|t |}|||||dS )	Ntext
logit_masktargetmaskpositionlabel   )r,   r.   	loss_maskposition_idattention_mask)r   shaperepeat)datar,   r3   r.   r5   r4   r1   r   r   r   pet_wrapper;   s*   
zMultiTaskDataset.pet_wrapperc                    s   | j r=t| tjj fddtdD d  jtt| j	| j
d}| j	| } tt|}| j	| | }nt| j|}|dkrJ|}n	|| j|d   }| j	| | }| |}|S )Nc                    s   g | ]}  d dqS )r   l    )randint)r   _rngr   r   r   Y   s    z0MultiTaskDataset.__getitem__.<locals>.<listcomp>   )seed)pr   r   )r   randomRandomr    RandomStaterangechoicearanger   r   r"   r   r&   r9   )r(   idxdataset_idxr   
sample_idxitemr   r<   r   __getitem__U   s$   


zMultiTaskDataset.__getitem__)Tr   r   )	__name__
__module____qualname__r   r+   staticmethodr9   rK   __classcell__r   r   r)   r   r
       s    
r
   c                       s6   e Zd Zd
 fdd	Zdd Zdd Zdd	 Z  ZS )
DataConfigNc                    s$   t t|   |d u ri }|| _d S N)r   rQ   r   defaults)r(   rS   r)   r   r   r   l   s   
zDataConfig.__init__c                 C   s*   t j dkrtd | | t||S )Nr   zconfiguring data)torchdistributedget_rankprintapply_defaultsmake_loaders)r(   args	tokenizerr   r   r   applyr   s   

zDataConfig.applyc                 K   s    |  D ]	\}}|| j|< qd S rR   )itemsrS   )r(   kwargskvr   r   r   set_defaultsx   s   zDataConfig.set_defaultsc                 C   s:   | j  D ]\}}|dd}t||st||| qd S )N-r;   )rS   r]   replacehasattrsetattr)r(   rZ   r_   r`   r   r   r   rX   |   s   
zDataConfig.apply_defaultsrR   )rL   rM   rN   r   r\   ra   rX   rP   r   r   r)   r   rQ   j   s
    rQ   c           	      C   s4  d}| j r| j}t| jd | j| j| j| j| j|| j	| j
dkp"| jdkd
}t dkrp|j}|dj}||djks>J |}|}| j}|| dkrU|d7 }|| dksKtd||| | td| tj||g}ntjddg}tjj|t t d	 |d  }|d  }||| _| _|S )
Nr           )add_block_symbols	cache_diradd_sentinel_tokenadd_task_maskadd_decoder_maskeospadr   z=> padded vocab (size: {}) with {} dummy tokens (new size: {})z!> found end-of-document token: {}group)sentinel_tokenmax_position_embeddingsr	   tokenizer_typetokenizer_path
vocab_sizetokenizer_model_typeblock_lmrh   	task_maskblock_mask_probcontext_mask_ratior   get_model_parallel_rank
num_tokensget_commandIdmake_vocab_size_divisible_byr   formatrT   cuda
LongTensorrU   	broadcastget_model_parallel_src_rankget_model_parallel_grouprJ   	eod_token)	rZ   ri   r[   r{   r   beforeaftermultipletoken_countsr   r   r   prepare_tokenizer   sV   

r   Fc                 C   s  t jjt d}t jjt d}|jd ur&||j }||j }||j }|dk}	|jr:tj	
t| ||||}
n4|rLtj	j| d||j |j d}nt jj| }|	}|	retj	j||||||jd}
n	t jj|||}
d }|rt|||jfi d|jd|jd|jd	|jd
|jd|jd|jd|jd|jd|jd|j  d|j! d|j"d|j#d|j$d|j%d|j&j'}t jjj(| |
|j)d|d}|S )Nrn   r   T)replacementnum_samples)gradient_accumulation_steps	bert_probgap_sentence_probgap_sentence_ratiogpt_infill_probaverage_block_lengthgpt_min_ratiorx   ry   short_seq_probsingle_span_probshuffle_blocksblock_position_encodingrp   encoder_decoderrw   random_position	masked_lm)batch_samplernum_workers
pin_memory
collate_fn)*rT   rU   get_world_sizer   get_data_parallel_grouprV   loader_scattertransformer_xlr   samplersDistributedSequentialSamplerr   RandomSamplertrain_itersr   utilsr8   SequentialSamplerDistributedBatchSamplerBatchSamplerr   
seq_lengthr   r   r   r   avg_block_lengthr   rx   ry   r   r   no_shuffle_blockno_block_positionrp   r   rw   r   r   construct_blocks
DataLoaderr   )r   r[   
batch_size	num_itersrZ   shuffleblock_collate
world_sizerankrU   r   sampler	drop_lastr   data_loaderr   r   r   make_data_loader   s   



	
r   c                 C   s   ddl }| j| j| jdt| jd| j| j d | jdkd}|jj	| j
fi |}d|d< | jdur7| j|d< | jdurA| j|d	< d}| jdurS|jj	| jfi |}d}| jdure|jj	| jfi |}|j| j|| j| j| j| jd
}|||f|fS )z3Load train/val/test dataset from shuffled TFRecordsr   NTr   )r   max_seq_lenmax_preds_per_seqtrainr   r?   threaded_dlFr   r   r   )rh   )data_utils.tf_dlr   r   r   maxr   r?   r   tf_dlTFRecordDataLoader
train_dataeval_seq_lengtheval_max_preds_per_seq
valid_data	test_datar	   rr   rs   rt   ru   rh   )rZ   r   data_set_argsr   validtestr[   r   r   r   make_tfrecord_loaders   sH   
	





r   c              	   C   s  | j rt| S tjjt d}| jdur|| j dksJ | j| }|}| j	dur/| j	| }| j
}|dk r:|| }| j}|durI|dk rI|| }t| }i d| jd|d| jd| jd| jd	d
d| jd|d| jd| jd| jd| jd| jd| j d|d| jd| j| j| j| jt | j| jd}t|}	dg|	d< |r||	d< | j r| j |	d< | j!dur| j!|	d< d\}
}}| jdurt"j#di |}
t"$|r|
\}
}}||	d< |du r| j%dur| j%|	d< t"j#di |	}||	d< |du r| j&dur| j&|	d< t"j#di |	}| j'p| j(}|
dur7| jdkr7t)|
||| j*| | j+|d}
d| _,nd| _,|dkrA|n|}|durYt)|||| j*| | j+|d}d| _-nd| _-|durwt)|||t.|| d | | j+|d}d| _/nd| _/|
||fS )zmakes training/val/testrn   Nr   pathr   
mem_lengthdelimtext_key	label_keyr1   ds_typesplitlooser   presplit_sentencessample_one_documentfilter_englishpre_tokenizer[   save_splitsload_splits)save_test_datano_lazy_loaderr   data_parallel_ranknon_sentence_starthalf_lazy_loader      ?)NNN)r   r   TFr   r   )0use_tfrecordsr   rT   rU   r   r   r   r   r   eval_batch_sizer   r   	get_splitr   r   r   r   data_set_type
loose_jsonr   r   r   r   no_pre_tokenizer   r   r   r   get_data_parallel_rankr   r   copyr   eval_text_keyr   make_datasetshould_splitr   r   rv   r   r   r   r   do_traindo_validr   do_test)rZ   r[   r   r   r   r   r   r   r   eval_set_argsr   r   r   	use_blockr   r   r   rY     s   



	














rY   c                 C   s.  ddddddddd	d
ddd}d\}}t  dkr| j}| jd ur$| j}g g }}| jD ],}| }tj| j	|| }	|
t| ||	|d|dd |
t| ||	|d|dd q,t| j|}t| j|}tjjt  d}
| j|
 }| jd ur}| j|
 }t|||| j| dd}t|||| j| dd}||fS )NMNLICoLAMRPCQNLIQQPzSST-2Agnewsyelp_review_polarity_csvyelp_review_full_csvYahooSQuADRACE)mnlicolamrpcqnliqqpsst2agnewszyelp-polarityz	yelp-fullyahoosquadrace)NNr   r   T)pattern_ensembledevrn   )r   )r   rz   r   multi_seq_lengthmulti_task_datalowerosr   joindata_dirappendSuperGlueDatasetr
   rT   rU   r   r   r   multi_batch_sizer   r   )rZ   r[   	task_dirsr   r   r  train_datasetsvalid_datasetstaskr  r   r  r   r   r   build_multi_task_dataset  s   


		


r  c                    s   g }| j ddkrdd | j  dD }n| j ddkr*dd | j  dD }nt| j g}t|}|dk r?|d|  t|d	k rP|d
 t|d	k sE|dd	 }| jdur_d
|d< | jdurhd
|d< t|  fdd|D S )z=
    Get dataset splits from comma separated string list
    ,c                 S   r   r   floatr   sr   r   r   r     r   zget_split.<locals>.<listcomp>/c                 S   r   r   r  r  r   r   r   r     r   r   r      rf   Nr2   c                    s   g | ]}|  qS r   r   r  	final_sumr   r   r     r   )r   findr  r#   r  r   r   r   )rZ   splitssplit_totalr   r  r   r     s&   


r   c               
   C   s"   dddddddddd	} t | d	S )
z*add cmdline flags for configuring datasetsr   r  r   F
supervised   d   )	r   r   persist_statelazy	transposer   r   r   samples_per_shardrS   )rQ   r)  r   r   r   configure_data  s   
r*  )FF)!__doc__r   r
  rA   bisectr   	itertoolsr   numpyr    rT   torch.utils.datamegatron_utilr   r    r   blocklm_utilsr   data_utils.tokenizationr	   r   r8   Datasetr
   rQ   r   r   r   rY   r  r   r*  r   r   r   r   <module>   s0   J0
I'wB