o
    ߥiŲ                     @   sd  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlZddlZddlZddlZddlZddlZddlmZ ddlmZ ddlmZ d	d
lmZmZ G dd dejZG dd dejZG dd dejZd ddZ G dd dejZ!G dd dejZ"G dd dejZ#G dd dejZ$G dd dejZ%G dd dejZ&dS )!z2dataset objects for jsons, csvs, and BERT datasets    N)bisect_right)
accumulate)
itemgetter)tokenize)data)print_rank_0   )
LazyLoaderexists_lazyc                   @   s$   e Zd Zdd Zdd Zdd ZdS )ShuffleDatasetc                    sr   | _ ttt j  _t j t|do|j _ jr7 fdd jD  _	 fdd jD  _
d S d S )Nis_lazyc                       g | ]} j j| qS  )dsprompt_lens.0idxselfr   b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/mglm/data_utils/datasets.py
<listcomp>.       z+ShuffleDataset.__init__.<locals>.<listcomp>c                    r   r   )r   	text_lensr   r   r   r   r   1   r   )r   listrangelenshuffle_idsrandomshufflehasattrr   r   r   )r   r   r   r   r   __init__(   s   

zShuffleDataset.__init__c                 C      | j | j|  S N)r   r   r   r   r   r   r   __getitem__5      zShuffleDataset.__getitem__c                 C   
   t | jS r#   )r   r   r   r   r   r   __len__8      
zShuffleDataset.__len__N)__name__
__module____qualname__r!   r%   r(   r   r   r   r   r   &   s    r   c                       sx   e Zd ZdZedd Z fddZdd Zdd	 Zd
d Z	dd Z
dd Zedd Zedd Zedd Z  ZS )ConcatDataseta'  
    Dataset to concatenate multiple datasets.
    Purpose: useful to assemble different existing datasets, possibly
    large-scale datasets as the concatenation operation is done in an
    on-the-fly manner.
    Arguments:
        datasets (sequence): List of datasets to be concatenated.
    c                 C   s6   g d}}| D ]}t |}|||  ||7 }q|S Nr   )r   append)sequencerselr   r   r   cumsumF   s   

zConcatDataset.cumsumc                    sp   t t|   t|dksJ dt|| _tdd | jD t| jk| _| | j| _	d | _
d | _d | _d S )Nr   z(datasets should not be an empty iterablec                 S   s&   g | ]}t |tpt|d o|jqS )r   )
isinstancer	   r    r   )r   r   r   r   r   r   S   s
    
z*ConcatDataset.__init__.<locals>.<listcomp>)superr-   r!   r   r   datasetssumr   r5   cumulative_sizes_X_Y_lens)r   r8   kwargs	__class__r   r   r!   O   s   

zConcatDataset.__init__c                 C   s<   t | j|}|dkr|}n	|| j|d   }| j| |S Nr   r   )r   r:   r8   get_text_lenr   r   dataset_idx
sample_idxr   r   r   rB   \   s
   zConcatDataset.get_text_lenc                 C   s   | j D ]}|| qd S r#   )r8   SetTokenizer)r   	tokenizerr   r   r   r   rF   d   s   
zConcatDataset.SetTokenizerc                 C   s   | j d  S r.   )r8   GetTokenizerr   r   r   r   rH   h   s   zConcatDataset.GetTokenizerc                 C   s
   | j d S )N)r:   r   r   r   r   r(   k   r)   zConcatDataset.__len__c                 C   s:   t | j|}|dkr|}n	|| j|d   }| j| | S rA   )r   r:   r8   rC   r   r   r   r%   n   s
   zConcatDataset.__getitem__c                 C   s^   | j d u r,g | _ | jr| jD ]	}| j |j q| j S | jD ]}| j dd |D  q| j S )Nc                 S   *   g | ]}t |trt|d  nt|qS textr6   dictr   r   dr   r   r   r          z&ConcatDataset.lens.<locals>.<listcomp>)r=   r   r8   extendlensr   r   r   r   r   rS   v   s   



zConcatDataset.lensc                 C   s0   | j d u rg | _ | jD ]	}| j |j q| j S r#   )r;   r8   rR   XrT   r   r   r   rU      s
   

zConcatDataset.Xc                 C   sB   | j d u rg | _ | jD ]}| j t|j qt| j | _ | j S r#   )r<   r8   rR   r   YnparrayrT   r   r   r   rV      s   

zConcatDataset.Y)r*   r+   r,   __doc__staticmethodr5   r!   rB   rF   rH   r(   r%   propertyrS   rU   rV   __classcell__r   r   r?   r   r-   <   s     	


r-   c                   @   s`   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	e
dd Ze
dd Zdd ZdS )SplitDatasetaq  
    Dataset wrapper to access a subset of another dataset.
    Purpose: useful to index into existing datasets, possibly
    large-scale datasets as the subindexing operation is done in an
    on-the-fly manner.
    Arguments:
        ds (Dataset or array-like): List of datasets to be subindexed
        split_inds (1D array-like): List of indices part of subset
    c                 K   s<   t || _|| _t|tpt|do|j| _d | _d | _d S )Nr   )	r   
split_indswrapped_datar6   r	   r    r   r;   r<   )r   r   r^   r>   r   r   r   r!      s   

zSplitDataset.__init__c                 C   r'   r#   )r   r^   r   r   r   r   r(      r)   zSplitDataset.__len__c                 C   s   | j | j| S r#   )r_   rB   r^   r$   r   r   r   rB      s   zSplitDataset.get_text_lenc                 C   r"   r#   )r_   r^   )r   indexr   r   r   r%      r&   zSplitDataset.__getitem__c                 C   s   | j | d S r#   )r_   rF   r   rG   r   r   r   rF      r&   zSplitDataset.SetTokenizerc                 C   s
   | j  S r#   )r_   rH   r   r   r   r   rH      r)   zSplitDataset.GetTokenizerc                 C   s$   | j d u rt| j | jj| _ | j S r#   )r;   r   r^   r_   rU   r   r   r   r   rU      s   
zSplitDataset.Xc                 C   s*   | j d u rtt| j | jj| _ | j S r#   )r<   rW   rX   r   r^   r_   rV   r   r   r   r   rV      s
   
zSplitDataset.Yc                 c   s    | j D ]}| j| V  qd S r#   )r^   r_   r$   r   r   r   __iter__   s   
zSplitDataset.__iter__N)r*   r+   r,   rY   r!   r(   rB   r%   rF   rH   r[   rU   rV   rb   r   r   r   r   r]      s    


r]   Tc                 C   sX  |du rg d}t |}|dkrtdt|}|| }t| }t|}|r3tjd}|| |durLt	|}t||ksDJ t
d|  n|durdtj dkrdt|| td|  d}	d}
dgt| }t|D ]6\}}|dkr|||  }|
|d 7 }
tt||
 }||	|	t|d  }t| |||< |	|7 }	|
d; }
qs|S )	a  
    Split a dataset into subsets given proportions of how
    much to allocate per split. If a split is 0% returns None for that split.
    Purpose: Useful for creating train/val/test splits
    Arguments:
        ds (Dataset or array-like): Data to be split.
        split (1D array-like): proportions to split `ds`. `sum(splits) != 0`
        shuffle (boolean): Randomly split dataset. Default: True
        save_splits: save split indices to file
        load_splits: load split indices from file
    N)皙?g?        r   zSplit cannot sum to 0.i  zLoad split indices from zSave split indices to r   )r9   	ExceptionrW   rX   r   aranger   RandomStater   loadr   torchdistributedget_ranksaveprint	enumerateintmaxr]   )r   splitr   save_splitsload_splits	split_sumds_lenindsrng	start_idxresidual_idxrtn_dsif
proportionsplit_r^   r   r   r   split_ds   sD   



r   c                   @   s^   e Zd ZdZ							dddZd	d
 Zdd Zedd Zdd Z	dd Z
dddZdS )csv_dataseta  
    Class for loading datasets from csv files.
    Purpose: Useful for loading data for unsupervised modeling or transfer tasks
    Arguments:
        path (str): Path to csv file with dataset.
        tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None
        preprocess_fn (callable): Callable that process a string into desired format.
        delim (str): delimiter for csv. Default: ','
        binarize_sent (bool): binarize label values to 0 or 1 if they're on a different scale. Default: False
        drop_unlabeled (bool): drop rows with unlabelled values. Always fills remaining empty
            columns with -1 (regardless if rows are dropped based on value) Default: False
        text_key (str): key to get text from csv. Default: 'sentence'
        label_key (str): key to get label from json dictionary. Default: 'label'
    Attributes:
        X (list): all strings from the csv file
        Y (np.ndarray): labels to train with
    N,Fsentencelabelc	              
   K   s4  d| _ || _| | || _|| _|| _|| _|| _d| jv r"d| _g | _g | _	z|g}
t
|tr6|
|7 }
n|
|g7 }
tj| j| j|
dd}W n   tj| j| j|gdd}Y |jdd}|| j | _z|| j| _	W n ty } ztt| jd | _	W Y d }~nd }~ww |rt| j	|d	| _	d S d S )
NFz.tsv	zlatin-1)sepusecolsencodingr   axisrI   hard)r   preprocess_fnrF   pathdelimtext_key	label_keydrop_unlabeledrU   rV   r6   r   pdread_csvdropnavaluestolistre   rW   onesr   binarize_labels)r   r   rG   r   r   binarize_sentr   r   r   r>   colsr   r3   r   r   r   r!     sL   






"zcsv_dataset.__init__c                 C   6   |d u rd| _ t| ds|| _d S d S d| _ || _d S NF
_tokenizerTusing_tokenizerr    r   ra   r   r   r   rF   ?     


zcsv_dataset.SetTokenizerc                 C      | j S r#   r   r   r   r   r   rH   H     zcsv_dataset.GetTokenizerc                 C      | j r| jS d S r#   r   r   r   r   r   r   rG   K     zcsv_dataset.tokenizerc                 C   r'   r#   r   rU   r   r   r   r   r(   Q  r)   zcsv_dataset.__len__c                 C      | j | }| jdur| j|| j}n
| jdur| |}| j| }t|tr?| jdur5| j|| j}n
| jdur?| |}|t||dS )z=process+tokenize string and return string,label,and stringlenNrL   lengthr   rU   rG   EncodeAsIdsr   rV   r6   strr   r   r`   xyr   r   r   r%   T     








zcsv_dataset.__getitem__c           	      C   s  |du r	| j d }td|  t|di}tj|| jd}|durU|s6| jftt| | j	f }|
| t|D ]\}}| j| ft| | j| f }|
| q:n|
| j| j	g t| j| jD ]}|
| qeW d   dS W d   dS 1 sw   Y  dS )z
        given a generator of metrics for each of the data points X_i,
            write the metrics, text, and labels to a csv file
        N.resultszgenerating csv at w)	delimiter)r   rm   opencsvwriterr   r   tuplenextr   writerowrn   rV   rU   zip)	r   
writer_genr   skip_headercsvfilecheaderr{   rowr   r   r   writec  s0   

 "zcsv_dataset.write)NNr   FFr   r   NNF)r*   r+   r,   rY   r!   rF   rH   r[   rG   r(   r%   r   r   r   r   r   r      s"    
2	
r   c                   @   sl   e Zd ZdZ						dddZdd	 Zd
d Zedd Zdd Z	dd Z
dddZdd Zdd ZdS )json_dataseta   
    Class for loading datasets from a json dump.
    Purpose: Useful for loading data for unsupervised modeling or transfer tasks
    Arguments:
        path (str): path to json file with dataset.
        tokenizer (data_utils.Tokenizer): Tokenizer to use when processing text. Default: None
        preprocess_fn (callable): callable function that process a string into desired format.
            Takes string, maxlen=None, encode=None as arguments. Default: process_str
        text_key (str): key to get text from json dictionary. Default: 'sentence'
        label_key (str): key to get label from json dictionary. Default: 'label'
    Attributes:
        all_strs (list): list of all strings from the dataset
        all_labels (list): list of all labels from the dataset (if they have it)
    NFr   r   c                 K   s   d| _ || _|| _| | g | _g | _|| _|| _|| _| 	| jD ]}	|	| }
| j
|
 | j
|	|  q#|rDt| j|d| _d S d S )NFr   )r   r   r   rF   rU   rV   r   r   
loose_jsonload_json_streamr/   r   )r   r   rG   r   r   r   r   r   r>   jr2   r   r   r   r!     s    	
zjson_dataset.__init__c                 C   r   r   r   ra   r   r   r   rF     r   zjson_dataset.SetTokenizerc                 C   r   r#   r   r   r   r   r   rH     r   zjson_dataset.GetTokenizerc                 C   r   r#   r   r   r   r   r   rG     r   zjson_dataset.tokenizerc                 C   r   )z)gets the index'th string from the datasetNr   r   r   r   r   r   r%     r   zjson_dataset.__getitem__c                 C   r'   r#   r   r   r   r   r   r(     r)   zjson_dataset.__len__c                    sX   |du r	j d }durfdd nfdd  fdd}||  dS )z
        given a generator of metrics for each of the data points X_i,
            write the metrics, text, and labels to a json file
        Nr   c                  3   s    i }  j | d< stttD ]
\}}|| |d < qtD ]:\}}|dkr=r=t|D ]\}}d|f | |d < q/i }t j| ft| D ]\}}| | }|||< qK|V  q!d S )Nr   r   z	metric_%d)r   rn   r   r   rV   )keysr   kr{   r   _r   v)r   r   r   r   r   
gen_helper  s    
 
z&json_dataset.write.<locals>.gen_helperc                  3   s&     j D ]} i }| | j< |V  qd S r#   )rV   r   )r   r   r   r   r   r     s   

c                  3   s0    t   D ]\} }j|  |j< |V  qd S r#   )rn   rU   r   )r{   r   )r   r   r   r   
out_stream  s
   z&json_dataset.write.<locals>.out_stream)r   save_json_stream)r   r   r   r   r   r   )r   r   r   r   r   r     s   
zjson_dataset.writec                 C   s   | j r8t|d&}t|D ]\}}d}|dkrd}|t|7 }|| qW d    d S 1 s1w   Y  d S dd |D }tj|t|ddd d S )	Nr    r   
c                 S   s   g | ]}|qS r   r   )r   r   r   r   r   r     s    z1json_dataset.save_json_stream.<locals>.<listcomp>)r   :)
separators)r   r   rn   jsondumpsr   dump)r   	save_pathjson_streamr|   r{   r   write_stringjsonsr   r   r   r     s   "zjson_dataset.save_json_streamc                 #   s`    | j stt ddd}t|}n	 fdd}| }|D ]}| j|vr*d|| j< |V  qd S )Nr1   utf-8r   c                  3   sL    t  ddd} | D ]}t|V  qW d    d S 1 sw   Y  d S )Nr1   r   r   )r   r   loads)r|   r   	load_pathr   r   r     s   "z1json_dataset.load_json_stream.<locals>.gen_helperrI   )r   r   rh   r   iterr   )r   r   r   	generatorr   r   r   r   r   r      s   


zjson_dataset.load_json_stream)NNFr   r   Fr   )r*   r+   r,   rY   r!   rF   rH   r[   rG   r%   r(   r   r   r   r   r   r   r   r   |  s$    
	

(r   c                   @   sF   e Zd Z			dddZdd Zdd	 Zd
d Zdd ZdddZdS )	XLDataset   NTc                 K   s\   || _ || _|| _|d u r|}|| _|| _d\| _| _t| j dr(| j jr(d| _| 	  d S )NNNr   T)
r   rG   max_seq_lenmem_lensample_across_docindicesnum_samplesr    r   init_indices)r   r   rG   r   r   r   r>   r   r   r   r!     s   zXLDataset.__init__c                    s    j rt fddtt jD }ntdd  jD }tt| _t	dt| d jd    jd  j
 d  _d S )Nc                       g | ]} j |qS r   r   rB   r   r   r   r   r   +      z*XLDataset.init_indices.<locals>.<listcomp>c                 S   s6   g | ]}t |trt|d  t|d  nt|qS )promptrL   rM   rO   r   r   r   r   -  s    
Dataset document count , token count rI   r   )r   rW   rX   r   r   r   r   r   r   r   r   r   r   rS   r   r   r   r   (  s   
zXLDataset.init_indicesc                 C   r   r#   r   r   r   r   r   r(   8  r   zXLDataset.__len__c                 C   sZ   |  |\}}}}| |}| |}| j|dd}t|t|t|t|dS )Nr   pad_id)rL   target	loss_maskattention_mask)getidxpad_seqrW   rX   )r   r   tokenstargetsr   r   r   r   r   r%   ;  s   

zXLDataset.__getitem__c                 C   s  g g g }}}t jt j| j| jftdt j| j| jftdfdd}t| j|| j }|dkr2dn| j|d  }|| j | }|dkrZt	| j|}	d|d d | j |	 | j f< d}
t
|| jk r|t
| jk r| j| }|d |d }}|| jdjg }t	t
|d || j t
| }|dg }|
dkrt
|}d||d d || j f< |||| 7 }|||d |d  7 }|||d |d  7 }|
d7 }
|d7 }d}t
|| jk r|t
| jk sj||||fS )N)dtyper   r   r   r   
loss_maskseos)rW   concatenatezerosr   r   ro   r   r   r   minr   r   rG   get_commandId)r   r   r   r   r   r   rE   last_endtoken_offsethistorycountitemrL   masksendcurrentr   r   r   r   G  sH   

zXLDataset.getidxc                 C   @   | j }td|t| }||d u r| jdjn|g| 7 }|S Nr   padr   rp   r   rG   r  r  r   seqr   total_tokensnum_pad_tokensr   r   r   r   g     zXLDataset.pad_seq)r   NTr#   )	r*   r+   r,   r!   r   r(   r%   r   r   r   r   r   r   r     s    
 r   c                   @   s`   e Zd Z				dddZdd Zd	d
 Zdd Zdd Zdd Zdd Z	dddZ
dd ZdS )BlockDatasetr   Trd   Fc           	      K   s   || _ t| j | _d| j | _|| _|| _|| _|| _|| _d\| _	| _
d| _| jr8ddl}|d| _td t| j drE| j jrEd	| _|   dS )
Z
        sentence_start: the stripped article must start with a complete sentence
          r   Fr   Nz/mnt/lid.176.binzLoad language detection modelr   T)r   r   ru   r   r   rG   r   non_sentence_startfilter_english	weighting	total_lenr   fasttext
load_modelmodelr   r    init_weighting)	r   r   rG   r   r   r  r  r>   r  r   r   r   r!   s  s"   zBlockDataset.__init__c                    s    j rt fddtt jD }ntdd  jD }t| _tdt| d j d j	  t
t| _d S )Nc                    r   r   r   r   r   r   r   r     r   z/BlockDataset.init_weighting.<locals>.<listcomp>c                 S   rJ   rK   rM   rO   r   r   r   r     rQ   r   r   z, non sentence start)r   rW   rX   r   r   r   r9   r  r   r  r   r   r  r   r   r   r   r    s   
zBlockDataset.init_weightingc                 C   s~   	 | | j}t| j|}| |\}}| jr9| j|d d }| j	|
ddd d }|dkr8	 ||fS n	 ||fS q)NTr   r   r   r   __label__en)randintr  r   r  r   r  rG   	DecodeIdsr  predictreplace)r   np_rngr   data_idxr   r   rL   langr   r   r   get_weighted_samples  s   z!BlockDataset.get_weighted_samplesc                 C   r   r#   r   r   r   r   r   r(     r   zBlockDataset.__len__c                    s  t | tj j fddtdD d |  \}}t|}|| j d }|dkrd} |}   | j	kr   dk rp|| jd k ro|dkro| 
||d  so|d8 }|d7 }|| jd k ro|dkro| 
||d  rSn4|| jd k r|t|k r| 
||d  s|d7 }|d7 }|| jd k r|t|k r| 
||d  r| jd	jg||d   }dg||d   }t|dkr|d | jd
jkrg g }}| ||| j\}}n]| jd	jg| }dg| }| jr9t|| jk r9|  \}}	| jd	jg| }dg|	 }	t|| jt| k}
| ||	| jt| \}}	||7 }||	7 }|
r2nt|| jk st|t|dS )Nc                       g | ]}  d dqS r   l    r   r   r   rw   r   r   r     r   z,BlockDataset.__getitem__.<locals>.<listcomp>   seedr   r         ?   ENCr   rL   r   )r   RandomrW   rg   r   r'  r   r   r   r  contains_sentence_endrG   r  r  right_strip_seqr   rX   )r   r   r   r   
num_tokenstokens_to_strip
move_countstrip_left_tokens
new_tokensnew_loss_maskis_lastr   r,  r   r%     s   










zBlockDataset.__getitem__c                 C   s   t || }|dkrP|t |d k r2| || d  s2|d7 }|t |d k r2| || d  rt || |d k rBt || }|d |  }|d |  }||fS )Nr   r   r1  )r   r5  )r   r   r   
seq_lengthstrip_right_tokensr   r   r   r6    s4   zBlockDataset.right_strip_seqc                 C   B   | j | }|d |d }}|| jdjg }|dg }||fS Nr   r   r   r   r   rG   r  r  r   r%  r   r   r   r   r   r   r     
   

zBlockDataset.getidxNc                 C   r  r  r  r  r   r   r   r     r  zBlockDataset.pad_seqc                 C   sX   | j |}d|v rdS d|v rdS d|v rdS d|v rdS d|v r$dS d|v r*dS dS )	N.T?!;r   r   FrG   	IdToTokenr   tokr   r   r   r5    s   z"BlockDataset.contains_sentence_end)r   Trd   Fr#   )r*   r+   r,   r!   r  r'  r(   r%   r6  r   r   r5  r   r   r   r   r  q  s    
8

r  c                   @   s\   e Zd Z						dddZdd Zd	d
 Zdd Zdd Zdd ZdddZ	dd Z
dS )GPT2Datasetr   NTFc	           
      K   s   || _ t| j | _|| _|du rd| j | _|| _|| _|| _|| _|| _|| _	d\| _
| _d| _t| j dr>| j jr>d| _|   dS )r  Nr  r   Fr   T)r   r   ru   r   r   rG   weightedr   random_across_doc_samplingsentence_startr  r  r   r    r  )
r   r   rG   r   r   rN  r   rO  rP  r>   r   r   r   r!     s    zGPT2Dataset.__init__c                    s    j r? jrt fddtt jD }ntdd  jD }t| _t	dt| d j  t
t| _d S d  _d S )Nc                    r   r   r   r   r   r   r   r   ;  r   z.GPT2Dataset.init_weighting.<locals>.<listcomp>c                 S   rJ   rK   rM   rO   r   r   r   r   =  rQ   r   r   )rN  r   rW   rX   r   r   r   r9   r  r   r   r   r  r   r   r   r   r  7  s   

zGPT2Dataset.init_weightingc                 C   .   | j d ur|| j}t| j |S || jS r#   r  r   r  r   ru   r   r$  r   r   r   r   r'  I     
z GPT2Dataset.get_weighted_samplesc                 C   r   r#   r   r   r   r   r   r(   P  r   zGPT2Dataset.__len__c                    sv  t | tj j fddtdD d |  }| |\}}t|}|| j d }|dkr^ 	|d }||d  }||d  }t|| j d }|dkr^|d |  }|d |  }| j
rt|| jd k r| jrs|  }n|d | j }| |\}	}
||	7 }||
7 }t|| jd k sj|d | jd  }|d | jd  }| |}| j|dd}t|t|dS )	Nc                    r(  r)  r*  r+  r,  r   r   r   W  r   z+GPT2Dataset.__getitem__.<locals>.<listcomp>r-  r.  r   r   r   r3  )r   r4  rW   rg   r   r'  r   r   r   r   r   rO  ru   r   rX   )r   r   r%  r   r   r7  r8  r:  strip_right_rokensr;  r<  r   r,  r   r%   S  s<   

	
zGPT2Dataset.__getitem__c                 C   r@  rA  rB  rC  r   r   r   r     rD  zGPT2Dataset.getidxc                 C   sD   | j d }td|t| }||d u r| jdjn|g| 7 }|S )Nr   r   r  r  r  r   r   r   r     s   
zGPT2Dataset.pad_seqc                 C   s4   | j |}d|v rdS d|v rdS d|v rdS dS )NrE  TrF  rG  FrI  rK  r   r   r   r5    s   z!GPT2Dataset.contains_sentence_end)r   NTTTFr#   )r*   r+   r,   r!   r  r'  r(   r%   r   r   r5  r   r   r   r   rM    s    
1

rM  c                   @   s   e Zd ZdZ							d#dd	Zd
d Zdd Zdd Zdd Zdd Z				d$ddZ
dd Zdd Zdd Zdd Zdd  Zd!d" ZdS )%BertSentencepairDataseta  
    Dataset containing sentencepairs for BERT training. Each index corresponds to a randomly generated sentence pair.
    Arguments:
        ds (Dataset or array-like): data corpus to use for training
        max_seq_len (int): maximum sequence length to use for a sentence pair
        mask_lm_prob (float): proportion of tokens to mask for masked LM
        max_preds_per_seq (int): Maximum number of masked tokens per sentence pair. Default: math.ceil(max_seq_len*mask_lm_prob/10)*10
        short_seq_prob (float): Proportion of sentence pairs purposefully shorter than max_seq_len
        dataset_size (int): number of random sentencepairs in the dataset. Default: len(ds)*(len(ds)-1)

       333333?N{Gz?FTc	           
      K   s   || _ t| j | _| j  | _t| jj | _| j 	d  || _
|| _|d u r3t|| d d }|| _|| _|| _| jd u rJ| j| jd  | _|| _| jsWtjddd || _|   d S )N
   r   punktz./nltk)download_dir)r   r   ru   rH   rG   r   text_token_vocabr   vocab_wordsrF   r   mask_lm_probmathceilmax_preds_per_seqshort_seq_probdataset_sizepresplit_sentencesnltkdownloadrN  get_weighting)
r   r   r   r_  rb  rc  rd  re  rN  r>   r   r   r   r!     s&   

z BertSentencepairDataset.__init__c                 C   sh   | j r/t| jdr| jjrt| jj}ntdd | jD }t|| _t	t
|| _d S d | _d S )Nr   c                 S   rJ   rK   rM   rO   r   r   r   r     rQ   z9BertSentencepairDataset.get_weighting.<locals>.<listcomp>)rN  r    r   r   rW   rX   rS   r9   r  r   r   r  r   r   r   r   rh    s   

z%BertSentencepairDataset.get_weightingc                 C   rQ  r#   rR  rS  r   r   r   r'    rT  z,BertSentencepairDataset.get_weighted_samplesc                 C   r   r#   )rd  r   r   r   r   r(     r   zBertSentencepairDataset.__len__c                    s2  t | tj j fddtdD d}| j}d}   | jk r) d|}d}d }d}d}|d u s;|d	k s;|d	k r]| | |\}}	}t	|d }t	|	d }|d u s;|d	k s;|d	k s;| 
||	| j \}}	| ||	| j| j| j \}
}}}t|
d t|
d	 t|t|t|t|d
}|S )Nc                    r(  r)  r*  r+  r,  r   r   r     r   z7BertSentencepairDataset.__getitem__.<locals>.<listcomp>r-  r.  Fr1  Tr   r   )rL   types	is_randommaskmask_labelspad_mask)r   r4  rW   rg   r   r   rc  r   create_random_sentencepairr   truncate_seq_paircreate_masked_lm_predictionsr_  rb  r^  rX   ro   )r   r   r$  target_seq_length	short_seqis_random_nextlenalenbtokensatokensbr   rk  rl  rm  sampler   r,  r   r%     sD   

z#BertSentencepairDataset.__getitem__c                 C   sH   | d}| jrdd |D S g }|D ]}|dkr!|t| q|S )zsplit document into sentencesr   c                 S   s   g | ]}|r|qS r   r   )r   liner   r   r   r         z:BertSentencepairDataset.sentence_split.<locals>.<listcomp>r   )rq   re  rR   r   sent_tokenize)r   documentlinesrtnry  r   r   r   sentence_split
  s   
z&BertSentencepairDataset.sentence_splitr   c                 C   s:   | j |j}dt| }| j |jgt| }||fS )z%tokenize sentence and get token typesr   )rG   r   tokenizationr   get_typer  r   )r   sentsentence_num	beginningendingr   str_typetoken_typesr   r   r   sentence_tokenize  s   z)BertSentencepairDataset.sentence_tokenizec                 C   s    | j | }t|tr|d }|S )z*gets text of document corresponding to idxrL   )r   r6   rN   )r   r   r~  r   r   r   get_doc   s   

zBertSentencepairDataset.get_docc                 C   s  d}g }g }d}|dk rd}d}|du r6| j r| |}	n	|d| jd }	| | |	}|s2d}|du s|dt|d }
|
t|k r||
 }| |d|
dk|
t|k\}}|| || |t|7 }|
t|d ksv||krwn
|
d }
|
t|k sF|dk s|rbd}t|dkr|dt|}g }g }t	|D ]}|
||  |
||  qg }g }d}t|dks| dk rGd}|t| }d}|dk rFd}|du r|d| jd }|t||	k7 }| | |}|sd}|du s|dt|d }|t|k rB|| }| |d|dk|t|k\}}|t|7 }|
| |
| t||kr7n|d }|t|k s|dk snd}t	|t|D ]}|
||  |
||  qP||f||f|fS )z
        fetches a random sentencepair corresponding to rng state similar to
        https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L248-L294
        Nr   r   r1  Fr0  T)rN  r'  r   ru   r  r  r   r  r/   r   rR   r   ro   )r   rq  rw   r$  rs  	curr_strscurr_str_typescurr_lendoc_a	doc_a_idxrandom_start_ar   sentence_typesnum_atokens_atoken_types_ar   tokens_btoken_types_btarget_b_lengthb_lendoc_b	doc_b_idxrandom_start_b
sentence_bnew_b_tokensnew_b_typesr   r   r   rn  '  s   









z2BertSentencepairDataset.create_random_sentencepairc                 C   s   |\}}|\}}|d }		 t |}
t |}|
| }||	krn3t |t |kr+|}|}n|}|}t |dks7J | dk rH|d |d n|  |  q||f||ffS )z
        Truncate sequence pair according to original BERT implementation:
        https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L391
           Tr   r0  r   )r   r   pop)r   abr   rw   r  r  r  r  max_num_tokenslen_alen_btotal_lengthtrunc_tokenstrunc_typesr   r   r   ro  ~  s,   
z)BertSentencepairDataset.truncate_seq_pairc                 C   sL   || }|  dk r| jdj}n|  dk r|}n||}|||< |S )z
        helper function to mask `idx` token from `tokens` according to
        section 3.3.1 of https://arxiv.org/pdf/1810.04805.pdf
        rc   MASKr0  )r   rG   r  r  choice)r   r   r   ri  r^  rw   r   	new_labelr   r   r   
mask_token  s   
z"BertSentencepairDataset.mask_tokenc                 C   sL   t d| jt| }dgt| dg|  }|| jdjg| 7 }||fS )z$helper function to pad sequence pairr   r   r  )rp   r   r   rG   r  r  )r   r  num_padrm  r   r   r   r     s   zBertSentencepairDataset.pad_seqc                    sX  |\}}|\}	}
| j djg| | j djg |	 | j djg }|d g| |d g |
 |
d g }t| t|	}dd t D  fddt|D  }|| | t|\}}| t|\}}t|t	dt
tt|| }dgt| }dgt| }t|d	| D ]}d||< | |||||}|||< q||f|||fS )
z
        Mask sequence pair for BERT training according to:
        https://github.com/google-research/bert/blob/master/create_pretraining_data.py#L338
        r2  r   r   c                 S   s   g | ]}|d  qS )r   r   r   r   r   r   r     rz  zHBertSentencepairDataset.create_masked_lm_predictions.<locals>.<listcomp>c                    s   g | ]}|d    qS )r1  r   r   r  r   r   r     r   r   rI   N)rG   r  r  r   r   r   r   r   r   rp   ro   roundsortedr  )r   r  r  r_  rb  r^  rw   r  r  r  r  r   r  r  cand_indicesoutput_tokensrm  output_typesr   num_to_predictrk  rl  r   r   r   r  r   rp    sH   


z4BertSentencepairDataset.create_masked_lm_predictions)rW  rX  NrY  NFT)r   FF)r*   r+   r,   rY   r!   rh  r'  r(   r%   r  r  r  rn  ro  r  r   rp  r   r   r   r   rV    s2    
'
WrV  )NTNN)'rY   r   r`  osr   timebisectr   	itertoolsr   operatorr   r   rf  numpyrW   pandasr   ri   tqdmr   torch.utilsr    modelscope.models.nlp.mglm.utilsr   lazy_loaderr	   r
   Datasetr   r-   r]   r   r   r   r   r  rM  rV  r   r   r   r   <module>   sB   [
4/  ^ ' 
