o
    i5                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZmZ ddlmZm	Z	m
Z
 e
eZeG dd dZedd	G d
d dZG dd dZG dd deZdS )    N)	dataclass)OptionalUnion   )is_tf_availableis_torch_availableloggingc                   @   sJ   e Zd ZU dZeed< eed< dZee ed< dZee ed< dd Z	dS )	InputExamplea5  
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
    guidtext_aNtext_blabelc                 C   s   t jt| ddd S )*Serializes this instance to a JSON string.   )indent
jsondumpsdataclassesasdictself r   V/home/ubuntu/.local/lib/python3.10/site-packages/transformers/data/processors/utils.pyto_json_string1   s   zInputExample.to_json_string)
__name__
__module____qualname____doc__str__annotations__r   r   r   r   r   r   r   r   r	      s   
 r	   T)frozenc                   @   sf   e Zd ZU dZee ed< dZeee  ed< dZ	eee  ed< dZ
eeeef  ed< dd ZdS )	InputFeaturesa  
    A single set of features of data. Property names are the same names as the corresponding inputs to a model.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
            tokens.
        token_type_ids: (Optional) Segment token indices to indicate first and second
            portions of the inputs. Only some models use them.
        label: (Optional) Label corresponding to the input. Int for classification problems,
            float for regression problems.
    	input_idsNattention_masktoken_type_idsr   c                 C   s   t t| d S )r   r   r   r   r   r   r   r   K   s   zInputFeatures.to_json_string)r   r   r   r   listintr!   r%   r   r&   r   r   floatr   r   r   r   r   r#   6   s   
 r#   c                   @   sN   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	e
dddZdS )DataProcessorzEBase class for data converters for sequence classification data sets.c                 C      t  )z
        Gets an example from a dict with tensorflow tensors.

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        NotImplementedError)r   tensor_dictr   r   r   get_example_from_tensor_dictS   s   z*DataProcessor.get_example_from_tensor_dictc                 C   r+   )z8Gets a collection of [`InputExample`] for the train set.r,   r   data_dirr   r   r   get_train_examples]      z DataProcessor.get_train_examplesc                 C   r+   )z6Gets a collection of [`InputExample`] for the dev set.r,   r0   r   r   r   get_dev_examplesa   r3   zDataProcessor.get_dev_examplesc                 C   r+   )z7Gets a collection of [`InputExample`] for the test set.r,   r0   r   r   r   get_test_examplese   r3   zDataProcessor.get_test_examplesc                 C   r+   )z*Gets the list of labels for this data set.r,   r   r   r   r   
get_labelsi   r3   zDataProcessor.get_labelsc                 C   s(   t |  dkr|  t|j |_|S )z
        Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
        examples to the correct format.
           )lenr6   r(   r   )r   exampler   r   r   tfds_mapm   s   zDataProcessor.tfds_mapNc                 C   sF   t |ddd}ttj|d|dW  d   S 1 sw   Y  dS )z!Reads a tab separated value file.rz	utf-8-sig)encoding	)	delimiter	quotecharN)openr'   csvreader)cls
input_filer?   fr   r   r   	_read_tsvv   s   $zDataProcessor._read_tsvN)r   r   r   r   r/   r2   r4   r5   r6   r:   classmethodrF   r   r   r   r   r*   P   s    
	r*   c                   @   s   e Zd ZdZdddZdd Zd	d
 Ze	dddZedddZ								dddZ
	dddZ					dddZdS )%SingleSentenceClassificationProcessorz@Generic processor for a single sentence classification data set.NclassificationFc                 C   s4   |d u rg n|| _ |d u rg n|| _|| _|| _d S rG   )labelsexamplesmodeverbose)r   rK   rL   rM   rN   r   r   r   __init__   s   
z.SingleSentenceClassificationProcessor.__init__c                 C   s
   t | jS rG   )r8   rL   r   r   r   r   __len__   s   
z-SingleSentenceClassificationProcessor.__len__c                 C   s(   t |trt| j| j| dS | j| S )N)rK   rL   )
isinstanceslicerI   rK   rL   )r   idxr   r   r   __getitem__   s   

z1SingleSentenceClassificationProcessor.__getitem__ r   r7   c           	   
   K   s,   | di |}|j ||||||ddd |S )NT)
split_namecolumn_labelcolumn_text	column_idskip_first_rowoverwrite_labelsoverwrite_examplesr   )add_examples_from_csv)	rC   	file_namerV   rW   rX   rY   rZ   kwargs	processorr   r   r   create_from_csv   s   
z5SingleSentenceClassificationProcessor.create_from_csvc                 K   s    | di |}|j ||d |S )N)rK   r   )add_examples)rC   texts_or_text_and_labelsrK   r_   r`   r   r   r   create_from_examples   s   z:SingleSentenceClassificationProcessor.create_from_examplesc	                 C   s   |  |}	|r|	dd  }	g }
g }g }t|	D ]0\}}|
||  |||  |d ur5|||  q|r>| d| nt|}|| q| j|
||||dS )Nr7   -)r[   r\   )rF   	enumerateappendr    rb   )r   r^   rV   rW   rX   rY   rZ   r[   r\   linestextsrK   idsiliner
   r   r   r   r]      s    

z;SingleSentenceClassificationProcessor.add_examples_from_csvc              	   C   sB  |d urt |t |krtdt | dt | |d ur4t |t |kr4tdt | dt | |d u r?d gt | }|d u rJd gt | }g }t }t|||D ]'\}}	}
t|ttfrj|	d u rj|\}}	n|}||	 |t	|
|d |	d qU|r|| _
n| j
| |rt|| _| j
S tt| j|| _| j
S )Nz(Text and labels have mismatched lengths z and z%Text and ids have mismatched lengths )r
   r   r   r   )r8   
ValueErrorsetziprQ   tupler'   addrg   r	   rL   extendrK   union)r   rc   rK   rj   r[   r\   rL   added_labelstext_or_text_and_labelr   r
   textr   r   r   rb      s4   


z2SingleSentenceClassificationProcessor.add_examplesTc                    s  |du r|j }dd t| jD }g }t| jD ]$\}	}
|	d dkr*td|	  |j|
jdt||j d}|	| qt
d	d
 |D }g  tt|| jD ]\}	\}}
|	d dkrjtd|	 dt| j  |rndndgt| }|t| }|r|g| | }|rdndg| | }n||g|  }||rdndg|  }t||krtdt| d| t||krtdt| d| | jdkr||
j }n| jdkrt|
j}nt| j|	dk r%| jr%td td|
j  tdddd |D   tdddd |D   td|
j d| d  	t|||d qP|du r7 S |dkrqt sDtd ddl} fd!d"}|jj||j|jd#|jf|dg|dgd#|g f}|S |d$krt s~td%ddl}dd&l m!} |j"d'd  D |j#d(}|j"d)d  D |j#d(}| jdkr|j"d*d  D |j#d(}n| jdkr|j"d+d  D |jd(}||||}|S td,)-a  
        Convert examples in a list of `InputFeatures`

        Args:
            tokenizer: Instance of a tokenizer that will tokenize the examples
            max_length: Maximum example length
            pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
            pad_token: Padding token
            mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
                and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for actual
                values)

        Returns:
            If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
            task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
            `InputFeatures` which can be fed to the model.

        Nc                 S   s   i | ]\}}||qS r   r   ).0rk   r   r   r   r   
<dictcomp>  s    zFSingleSentenceClassificationProcessor.get_features.<locals>.<dictcomp>i'  r   zTokenizing example T)add_special_tokens
max_lengthc                 s   s    | ]}t |V  qd S rG   )r8   )rw   r$   r   r   r   	<genexpr>  s    zESingleSentenceClassificationProcessor.get_features.<locals>.<genexpr>zWriting example /r7   zError with input length z vs rJ   
regression   z*** Example ***zguid: zinput_ids:  c                 S      g | ]}t |qS r   r    rw   xr   r   r   
<listcomp>6      zFSingleSentenceClassificationProcessor.get_features.<locals>.<listcomp>zattention_mask: c                 S   r   r   r   r   r   r   r   r   7  r   zlabel: z (id = )r$   r%   r   tfz?return_tensors set to 'tf' but TensorFlow 2.0 can't be importedc                  3   s&     D ]} | j | jd| jfV  qd S )Nr$   r%   r   )exfeaturesr   r   genC  s   z?SingleSentenceClassificationProcessor.get_features.<locals>.genr   ptz8return_tensors set to 'pt' but PyTorch can't be imported)TensorDatasetc                 S      g | ]}|j qS r   )r$   rw   rE   r   r   r   r   S      )dtypec                 S   r   r   )r%   r   r   r   r   r   T  r   c                 S   r   r   r   r   r   r   r   r   V  r   c                 S   r   r   r   r   r   r   r   r   X  r   z,return_tensors should be one of 'tf' or 'pt')$max_lenrf   rK   rL   loggerinfoencoder   minrg   maxro   r8   rm   rM   r   r)   rN   r
   joinr#   r   RuntimeError
tensorflowdataDatasetfrom_generatorint32int64TensorShaper   torchtorch.utils.datar   tensorlong)r   	tokenizerrz   pad_on_left	pad_tokenmask_padding_with_zeroreturn_tensors	label_mapall_input_idsex_indexr9   r$   batch_lengthr%   padding_lengthr   r   r   datasetr   r   all_attention_mask
all_labelsr   r   r   get_features   s   




  

"
z2SingleSentenceClassificationProcessor.get_features)NNrJ   F)rU   r   r7   NFrG   )rU   r   r7   NFFF)NNFF)NFr   TN)r   r   r   r   rO   rP   rT   rH   ra   rd   r]   rb   r   r   r   r   r   rI   }   s4    


(rI   )rA   r   r   r   typingr   r   utilsr   r   r   
get_loggerr   r   r	   r#   r*   rI   r   r   r   r   <module>   s   
-