o
    wi\                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlZd dl	m
Z
 d dlmZ ddlmZ ddlmZ eeZdZG d	d
 d
eZG dd deZG dd deZG dd deZG dd deZdS )    N)Optional)FileLock)Dataset   )PreTrainedTokenizer)loggingu   This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: {0}c                
   @   sL   e Zd ZdZ		ddedededee fdd	Zd
d Z	de
jfddZdS )TextDatasetH
    This will be superseded by a framework-agnostic approach soon.
    FN	tokenizer	file_path
block_size	cache_dirc              
   C   s4  t tdt tj|du rtd| d||j	dd }tj
|\}}tj|d ur2|n|d|jj d| d| }|d }	t|	 tj|r|st }
t|d	}t|| _W d    n1 slw   Y  td
| dt |
  ntd|  g | _t|dd}| }W d    n1 sw   Y  |||}tdt|| d |D ]}| j|||||   qt }
t|d}tj| j|tjd W d    n1 sw   Y  td| dt |
 dd W d    d S W d    d S 1 sw   Y  d S )Nchttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.pyFInput file path 
 not foundpair
cached_lm__.lockrb"Loading features from cached file  [took %.3f s]'Creating features from dataset file at utf-8encodingr      wbprotocol!Saving features into cached file  [took .3f s]) warningswarnDEPRECATION_WARNINGformatFutureWarningospathisfile
ValueErrornum_special_tokens_to_addsplitjoin	__class____name__r   existstimeopenpickleloadexamplesloggerinforeadconvert_tokens_to_idstokenizerangelenappend build_inputs_with_special_tokensdumpHIGHEST_PROTOCOL)selfr
   r   r   overwrite_cacher   	directoryfilenamecached_features_file	lock_pathstarthandleftexttokenized_texti rP   i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/data/datasets/language_modeling.py__init__-   sX   

$zTextDataset.__init__c                 C   
   t | jS Nr?   r8   rD   rP   rP   rQ   __len__j      
zTextDataset.__len__returnc                 C   s   t j| j| t jdS )Ndtype)torchtensorr8   longrD   rO   rP   rP   rQ   __getitem__m   s   zTextDataset.__getitem__)FN)r2   
__module____qualname____doc__r   strintr   rR   rW   r\   Tensorr`   rP   rP   rP   rQ   r   (   s    	
=r   c                   @   sF   e Zd ZdZdededefddZdd Zd	e	ee
jf fd
dZdS )LineByLineTextDatasetr	   r
   r   r   c                 C   s   t tdt tj|du rtd| dt	
d|  t|dd}dd	 |  D }W d    n1 s=w   Y  ||d
d
|d}|d | _dd	 | jD | _d S )Nr   Fr   r   r   r   r   c                 S   s$   g | ]}t |d kr| s|qS r   )r?   isspace.0linerP   rP   rQ   
<listcomp>   s   $ z2LineByLineTextDataset.__init__.<locals>.<listcomp>Tadd_special_tokens
truncation
max_length	input_idsc                 S       g | ]}d t j|t jdiqS rr   rZ   r\   r]   r^   rk   erP   rP   rQ   rm           )r%   r&   r'   r(   r)   r*   r+   r,   r-   r9   r:   r5   r;   
splitlinesr8   )rD   r
   r   r   rL   linesbatch_encodingrP   rP   rQ   rR   v   s   
zLineByLineTextDataset.__init__c                 C   rS   rT   rU   rV   rP   rP   rQ   rW      rX   zLineByLineTextDataset.__len__rY   c                 C   
   | j | S rT   r8   r_   rP   rP   rQ   r`      rX   z!LineByLineTextDataset.__getitem__Nr2   ra   rb   rc   r   rd   re   rR   rW   dictr\   r]   r`   rP   rP   rP   rQ   rg   q   s
    rg   c                   @   sJ   e Zd ZdZdedededefddZdd	 Zd
e	ee
jf fddZdS )LineByLineWithRefDatasetr	   r
   r   r   ref_pathc              
   C   s  t tdt tj|du rtd| dtj|du r)td| dt	
d|  t	
d|  t|dd	}| }W d    n1 sNw   Y  d
d |D }t|dd	}dd |  D }W d    n1 svw   Y  t|t|krtd| dt| d| dt| ||dd|d}|d | _dd | jD | _t| j}	t|	D ]}
tj||
 tjd| j|
 d< qd S )Nzghttps://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.pyFr   r   zRef file path r   zUse ref segment results at r   r   c                 S   s(   g | ]}t |d kr| s| qS rh   )r?   ri   striprj   rP   rP   rQ   rm      s   ( z5LineByLineWithRefDataset.__init__.<locals>.<listcomp>c                 S   s*   g | ]}t |d kr| st|qS rh   )r?   ri   jsonloadsrj   rP   rP   rQ   rm      s   * zDLength of Input file should be equal to Ref file. But the length of z is z while length of Trn   rr   c                 S   rs   rt   ru   rv   rP   rP   rQ   rm      rx   rZ   chinese_ref)r%   r&   r'   r(   r)   r*   r+   r,   r-   r9   r:   r5   	readlinesr;   ry   r?   r8   r>   r\   r]   r^   )rD   r
   r   r   r   rL   datarefr{   nrO   rP   rP   rQ   rR      sD   


 z!LineByLineWithRefDataset.__init__c                 C   rS   rT   rU   rV   rP   rP   rQ   rW      rX   z LineByLineWithRefDataset.__len__rY   c                 C   r|   rT   r}   r_   rP   rP   rQ   r`      rX   z$LineByLineWithRefDataset.__getitem__Nr~   rP   rP   rP   rQ   r      s
    $r   c                   @   sP   e Zd ZdZdededefddZddd	Zd
d Z	de
eejf fddZdS )LineByLineWithSOPTextDatasetzY
    Dataset for sentence order prediction task, prepare sentence pairs for SOP task
    r
   file_dirr   c              	      s8  t tdt tj|du rt| dt	
d|  g | _t|D ]l}tj||}tj|du r@t| dd}t|ddD}| }g }	|D ]3}
d|
v rZd	}qQd
|
v r}d} fdd|	dd  D }| || }| j| g }	qQ|r|	|
 qQW d    n1 sw   Y  q(t	
d d S )Nr   Fz is not a directoryz.Creating features from dataset file folder at z is not a filer   r   z<doc id=Tz</doc>c                    s0   g | ]}t |d kr| s  |qS rh   )r?   ri   r<   r=   rj   r
   rP   rQ   rm      s
    z9LineByLineWithSOPTextDataset.__init__.<locals>.<listcomp>r   zDataset parse finished.)r%   r&   r'   r(   r)   r*   r+   isdirr-   r9   r:   r8   listdirr0   r,   r5   r   create_examples_from_documentextendr@   )rD   r
   r   r   	file_namer   article_openrL   original_linesarticle_linesrl   documentr8   rP   r   rQ   rR      sH   


z%LineByLineWithSOPTextDataset.__init__皙?c                 C   s  ||j dd }|}t |k rtd|}g }g }d}	d}
|
t|k r||
 }|s/|
d7 }
q|| |	t|7 }	|
t|d ksF|	|kr|rd}t|dkrZtdt|d }g }t|D ]	}|||  q`g }t|t|D ]	}|||  qst|dkst|dkrqt dk rd}||}}nd}dd	 }|||| t|dkstd
t| dt|dkstdt| d|||}|	||}t
j|t
jdt
j|t
jdt
j|rdndt
jdd}|| g }d}	|
d7 }
|
t|k s$|S )'Creates examples for a single document.Tr      r   r         ?Fc                 S   sh   	 t | t | }||krdS t | t |kr| n|}t |dks%tdt dk r/|d= n|  q)z;Truncates a pair of sequences to a maximum sequence length.Tr   z8Sequence length to be truncated must be no less than oner   r   N)r?   r-   randompop)tokens_atokens_bmax_num_tokenstotal_lengthtrunc_tokensrP   rP   rQ   truncate_seq_pair-  s   zULineByLineWithSOPTextDataset.create_examples_from_document.<locals>.truncate_seq_pairLength of sequence a is  which must be no less than 1Length of sequence b is rZ   )rr   token_type_idssentence_order_label)r.   r   randintr?   r@   r>   r   r-   rA   $create_token_type_ids_from_sequencesr\   r]   r^   )rD   r   r   r
   short_seq_probr   target_seq_lengthr8   current_chunkcurrent_lengthrO   segmenta_endr   jr   is_nextr   rr   r   examplerP   rP   rQ   r      sd   	

Gz:LineByLineWithSOPTextDataset.create_examples_from_documentc                 C   rS   rT   rU   rV   rP   rP   rQ   rW   S  rX   z$LineByLineWithSOPTextDataset.__len__rY   c                 C   r|   rT   r}   r_   rP   rP   rQ   r`   V  rX   z(LineByLineWithSOPTextDataset.__getitem__N)r   )r2   ra   rb   rc   r   rd   re   rR   r   rW   r   r\   r]   r`   rP   rP   rP   rQ   r      s    
)cr   c                   @   s\   e Zd ZdZ			ddededefdd	Zd
eee  dedefddZ	dd Z
dd ZdS )$TextDatasetForNextSentencePredictionr	   Fr   r   r
   r   r   c              	   C   sz  t tdt tj|std| d|| _	|| _
tj|\}}tj|d|jj d| d| }	|| _|	d }
t|
 tj|	ry|syt }t|	d}t|| _W d    n1 sew   Y  td|	 d	t |  ntd
|  g g| _t|dd:}	 | }|sn*| }|st| jd dkr| jg  ||}||}|r| jd | qW d    n1 sw   Y  tdt| j d g | _t | jD ]\}}| !||| qt }t|	d}tj"| j|tj#d W d    n	1 sw   Y  td|	 dt | dd W d    d S W d    d S 1 s6w   Y  d S )Nr   r   r   cached_nsp_r   r   r   r   r   r   r   r   Tr   zCreating examples from z documents.r   r   r!   r"   r#   r$   )$r%   r&   r'   r(   r)   r*   r+   r,   r-   short_seq_probabilitynsp_probabilityr/   r0   r1   r2   r
   r   r3   r4   r5   r6   r7   r8   r9   r:   	documentsreadliner   r?   r@   r=   r<   	enumerater   rB   rC   )rD   r
   r   r   rE   r   r   rF   rG   rH   rI   rJ   rK   rL   rl   tokens	doc_indexr   rP   rP   rQ   rR   _  sr   	


$z-TextDatasetForNextSentencePrediction.__init__r   r   c                 C   s|  || j jdd }|}t | jk rtd|}g }d}d}|t|k r<|| }	||	 |t|	7 }|t|d ksA||kr0|r,d}
t|dkrVtdt|d }
g }t|
D ]	}|||  q\g }t|dksut | j	k rd}|t| }tdD ]}tdt| j
d }||kr nq| j
| }tdt|d }t|t|D ]}|||  t||kr nqt||
 }||8 }nd}t|
t|D ]	}|||  qt|dkstdt| d	t|dkstd
t| d	| j ||}| j ||}tj|tjdtj|tjdtj|rdndtjdd}| j| g }d}|d7 }|t|k s%dS dS )r   Tr   r   r   r   
   Fr   r   r   rZ   )rr   r   next_sentence_labelN)r
   r.   r   r   r   r?   r@   r>   r   r   r   r-   rA   r   r\   r]   r^   r8   )rD   r   r   r   r   r   r   r   rO   r   r   r   r   r   is_random_nexttarget_b_lengthr   random_document_indexrandom_documentrandom_startnum_unused_segmentsrr   r   r   rP   rP   rQ   r     sn   	


zBTextDatasetForNextSentencePrediction.create_examples_from_documentc                 C   rS   rT   rU   rV   rP   rP   rQ   rW     rX   z,TextDatasetForNextSentencePrediction.__len__c                 C   r|   rT   r}   r_   rP   rP   rQ   r`     rX   z0TextDatasetForNextSentencePrediction.__getitem__N)Fr   r   )r2   ra   rb   rc   r   rd   re   rR   listr   rW   r`   rP   rP   rP   rQ   r   Z  s    	
UZr   )r   r*   r6   r   r4   r%   typingr   r\   filelockr   torch.utils.datar   tokenization_utilsr   utilsr   
get_loggerr2   r9   r'   r   rg   r   r   r   rP   rP   rP   rQ   <module>   s*   
I!0 