o
    ei2                     @   s   d dl Z d dlZd dlZd dlmZ ddlmZmZ eeZ	eG dd dZ
eddG d	d
 d
ZG dd dZG dd deZdS )    N)	dataclass   )is_torch_availableloggingc                   @   sJ   e Zd ZU dZeed< eed< dZedB ed< dZedB ed< dd ZdS )	InputExamplea5  
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
    guidtext_aNtext_blabelc                 C   s   t jt| ddd S )*Serializes this instance to a JSON string.   )indent
jsondumpsdataclassesasdictself r   `/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/data/processors/utils.pyto_json_string/   s   zInputExample.to_json_string)	__name__
__module____qualname____doc__str__annotations__r	   r
   r   r   r   r   r   r      s   
 r   T)frozenc                   @   sb   e Zd ZU dZee ed< dZee dB ed< dZee dB ed< dZ	ee
B dB ed< dd ZdS )	InputFeaturesa  
    A single set of features of data. Property names are the same names as the corresponding inputs to a model.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
            tokens.
        token_type_ids: (Optional) Segment token indices to indicate first and second
            portions of the inputs. Only some models use them.
        label: (Optional) Label corresponding to the input. Int for classification problems,
            float for regression problems.
    	input_idsNattention_masktoken_type_idsr
   c                 C   s   t t| d S )r   r   r   r   r   r   r   r   I   s   zInputFeatures.to_json_string)r   r   r   r   listintr   r"   r#   r
   floatr   r   r   r   r   r    4   s   
 r    c                   @   sN   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	e
dddZdS )DataProcessorzEBase class for data converters for sequence classification data sets.c                 C      t  )z
        Gets an example from a dict.

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        NotImplementedError)r   tensor_dictr   r   r   get_example_from_tensor_dictQ   s   z*DataProcessor.get_example_from_tensor_dictc                 C   r(   )z8Gets a collection of [`InputExample`] for the train set.r)   r   data_dirr   r   r   get_train_examples[      z DataProcessor.get_train_examplesc                 C   r(   )z6Gets a collection of [`InputExample`] for the dev set.r)   r-   r   r   r   get_dev_examples_   r0   zDataProcessor.get_dev_examplesc                 C   r(   )z7Gets a collection of [`InputExample`] for the test set.r)   r-   r   r   r   get_test_examplesc   r0   zDataProcessor.get_test_examplesc                 C   r(   )z*Gets the list of labels for this data set.r)   r   r   r   r   
get_labelsg   r0   zDataProcessor.get_labelsc                 C   s(   t |  dkr|  t|j |_|S )z
        Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
        examples to the correct format.
           )lenr3   r%   r
   )r   exampler   r   r   tfds_mapk   s   zDataProcessor.tfds_mapNc                 C   sF   t |ddd}ttj|d|dW  d   S 1 sw   Y  dS )z!Reads a tab separated value file.rz	utf-8-sig)encoding	)	delimiter	quotecharN)openr$   csvreader)cls
input_filer<   fr   r   r   	_read_tsvt   s   $zDataProcessor._read_tsvN)r   r   r   r   r,   r/   r1   r2   r3   r7   classmethodrC   r   r   r   r   r'   N   s    
	r'   c                   @   s   e Zd ZdZdddZdd Zd	d
 Ze	dddZedddZ								dddZ
	dddZ					dddZdS )%SingleSentenceClassificationProcessorz@Generic processor for a single sentence classification data set.NclassificationFc                 C   s4   |d u rg n|| _ |d u rg n|| _|| _|| _d S rD   )labelsexamplesmodeverbose)r   rH   rI   rJ   rK   r   r   r   __init__~   s   
z.SingleSentenceClassificationProcessor.__init__c                 C   s
   t | jS rD   )r5   rI   r   r   r   r   __len__   s   
z-SingleSentenceClassificationProcessor.__len__c                 C   s(   t |trt| j| j| dS | j| S )N)rH   rI   )
isinstanceslicerF   rH   rI   )r   idxr   r   r   __getitem__   s   

z1SingleSentenceClassificationProcessor.__getitem__ r   r4   c           	   
   K   s,   | di |}|j ||||||ddd |S )NT)
split_namecolumn_labelcolumn_text	column_idskip_first_rowoverwrite_labelsoverwrite_examplesr   )add_examples_from_csv)	r@   	file_namerS   rT   rU   rV   rW   kwargs	processorr   r   r   create_from_csv   s   
z5SingleSentenceClassificationProcessor.create_from_csvc                 K   s    | di |}|j ||d |S )N)rH   r   )add_examples)r@   texts_or_text_and_labelsrH   r\   r]   r   r   r   create_from_examples   s   z:SingleSentenceClassificationProcessor.create_from_examplesc	                 C   s   |  |}	|r|	dd  }	g }
g }g }t|	D ]0\}}|
||  |||  |d ur5|||  q|r>| d| nt|}|| q| j|
||||dS )Nr4   -)rX   rY   )rC   	enumerateappendr   r_   )r   r[   rS   rT   rU   rV   rW   rX   rY   linestextsrH   idsiliner   r   r   r   rZ      s    

z;SingleSentenceClassificationProcessor.add_examples_from_csvc              	   C   sB  |d urt |t |krtdt | dt | |d ur4t |t |kr4tdt | dt | |d u r?d gt | }|d u rJd gt | }g }t }t|||D ]'\}}	}
t|ttfrj|	d u rj|\}}	n|}||	 |t	|
|d |	d qU|r|| _
n| j
| |rt|| _| j
S tt| j|| _| j
S )Nz(Text and labels have mismatched lengths z and z%Text and ids have mismatched lengths )r   r   r	   r
   )r5   
ValueErrorsetziprN   tupler$   addrd   r   rI   extendrH   union)r   r`   rH   rg   rX   rY   rI   added_labelstext_or_text_and_labelr
   r   textr   r   r   r_      s4   


z2SingleSentenceClassificationProcessor.add_examplesTc                 C   s6  |du r|j }dd t| jD }g }t| jD ]$\}	}
|	d dkr*td|	  |j|
jdt||j d}|	| qt
d	d
 |D }g }tt|| jD ]\}	\}}
|	d dkrjtd|	 dt| j  |rndndgt| }|t| }|r|g| | }|rdndg| | }n||g|  }||rdndg|  }t||krtdt| d| t||krtdt| d| | jdkr||
j }n| jdkrt|
j}nt| j|	dk r%| jr%td td|
j  tdddd |D   tdddd |D   td|
j d| d |	t|||d qP|du r7|S |dkrt sDtd ddl}dd!lm} |jd"d |D |jd#}|jd$d |D |jd#}| jdkr||jd%d |D |jd#}n| jdkr|jd&d |D |jd#}||||}|S td')(a  
        Convert examples in a list of `InputFeatures`

        Args:
            tokenizer: Instance of a tokenizer that will tokenize the examples
            max_length: Maximum example length
            pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
            pad_token: Padding token
            mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
                and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for actual
                values)

        Returns:
            Will return a list of task-specific `InputFeatures` which can be fed to the model.

        Nc                 S   s   i | ]\}}||qS r   r   ).0rh   r
   r   r   r   
<dictcomp>  s    zFSingleSentenceClassificationProcessor.get_features.<locals>.<dictcomp>i'  r   zTokenizing example T)add_special_tokens
max_lengthc                 s   s    | ]}t |V  qd S rD   )r5   )rt   r!   r   r   r   	<genexpr>  s    zESingleSentenceClassificationProcessor.get_features.<locals>.<genexpr>zWriting example /r4   zError with input length z vs rG   
regression   z*** Example ***zguid: zinput_ids:  c                 S      g | ]}t |qS r   r   rt   xr   r   r   
<listcomp>2      zFSingleSentenceClassificationProcessor.get_features.<locals>.<listcomp>zattention_mask: c                 S   r}   r   r~   r   r   r   r   r   3  r   zlabel: z (id = ))r!   r"   r
   ptz8return_tensors set to 'pt' but PyTorch can't be imported)TensorDatasetc                 S      g | ]}|j qS r   )r!   rt   rB   r   r   r   r   @      )dtypec                 S   r   r   )r"   r   r   r   r   r   A  r   c                 S   r   r   r
   r   r   r   r   r   C  r   c                 S   r   r   r   r   r   r   r   r   E  r   z)return_tensors should be `'pt'` or `None`)max_lenrc   rH   rI   loggerinfoencoder   minrd   maxrl   r5   rj   rJ   r
   r&   rK   r   joinr    r   RuntimeErrortorchtorch.utils.datar   tensorlong)r   	tokenizerrw   pad_on_left	pad_tokenmask_padding_with_zeroreturn_tensors	label_mapall_input_idsex_indexr6   r!   batch_lengthfeaturesr"   padding_lengthr
   r   r   all_attention_mask
all_labelsdatasetr   r   r   get_features   sr   




  

z2SingleSentenceClassificationProcessor.get_features)NNrG   F)rR   r   r4   NFrD   )rR   r   r4   NFFF)NNFF)NFr   TN)r   r   r   r   rL   rM   rQ   rE   r^   ra   rZ   r_   r   r   r   r   r   rF   {   s4    


(rF   )r>   r   r   r   utilsr   r   
get_loggerr   r   r   r    r'   rF   r   r   r   r   <module>   s   
-