o
    }oiV                     @   sn   d dl Z d dlmZmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZmZmZ dgZG dd deZdS )    N)DictListOptionalTuple)AutoTokenizer)Dataset)ChannelTypeMaskType
NeuralTypeText2SparqlDatasetc                   @   s   e Zd ZdZedeeeef  fddZ			ddede
d	e
d
ededededefddZ	ddede
deee ee f fddZdd Zdee deee ee f fddZdd ZdS )r   a  A dataset class that converts raw data to a dataset that can be used by NeuralMachineTranslationModel.

    Args:
        filepath: .tsv file to sequence + label.
            the first line is header (sentence [tab] label)
            each line should be [sentence][tab][label]
        encoder_tokenizer: encoder tokenizer object such as AutoTokenizer
        decoder_tokenizer: decoder tokenizer object. If using BART or end to end model, set this to encoder_tokenizer
        max_seq_length: max sequence length including bos and eos tokens
        num_samples: number of samples you want to use for the dataset. If -1, use all dataset. Useful for testing.
        convert_labels: if true, converts labels for masked lm and updates pad_id to -100
            for hf masked loss
    returnc                 C   s.   t dt t dt t dt t dt dS )z;Returns definitions of module output ports.
               )BT)	input_idsattention_maskdecoder_input_ids	lm_labels)r
   r   r	   self r   m/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/nlp/data/text2sparql/text2sparql_dataset.pyoutput_types,   s
   



zText2SparqlDataset.output_typesFfilepathencoder_tokenizerdecoder_tokenizerencoder_add_special_tokensdecoder_add_special_tokensmax_seq_lengthnum_samplesconvert_labelsc	              	   C   sl  || _ || _|| _|| _|| _|| _|| _|| _|dkr!td|| jr,| jdkr,d | _t	j
|s9t| dt|}	|	 dd  }
W d    n1 sPw   Y  |dkr_|
d | }
g g g }}}|
D ]8}z	|d\}}W n ty   tdw | j|||d\}}|| || || j|||dd  qit|| _t|| _t|| _d S )	Nr   znum_samples has to be positive.   zU not found. The filepath must be set in train_ds.filepath and validation_ds.filepath.   	zIEach line of input file should contain the format [sentence][tab][label].)	tokenizeradd_special_tokens)r   r   r   r   r   r   r   r    
ValueErrorospathexistsFileNotFoundErroropen	readlinessplittext_to_idsappendnpasarrayr   input_masks	label_ids)r   r   r   r   r   r   r   r   r    flinesr   r2   r3   linesentencelabelidsmaskr   r   r   __init__7   sN   




zText2SparqlDataset.__init__textr$   c                 C   s   | | }|rdnd}| jr| j|kr|d| j|  }|r*|jg| |jg }dgt| }| jrO| j|krO| jt| }||jg| 7 }|dg| 7 }||fS )z1Converts text to ids. Truncates and adds padding.r!   r   Nr"   )r.   stripr   bos_ideos_idlenpad_id)r   r<   r$   r%   text_tokensnum_special_tokensr:   
pad_lengthr   r   r   r.   o   s   zText2SparqlDataset.text_to_idsc                 C   s
   t | jS N)r@   r   r   r   r   r   __len__   s   
zText2SparqlDataset.__len__r3   c                 C   s<   |d d }|dd    }d||dd  | jjk< ||fS )Nr   r"   i)copyr   rA   )r   r3   r   r   r   r   r   convert_label_ids   s   z$Text2SparqlDataset.convert_label_idsc                 C   sH   | j r| | j| \}}n
| j| }| j| }| j| | j| ||fS rE   )r    rH   r3   r   r2   )r   idxr   r   r   r   r   __getitem__   s
   

zText2SparqlDataset.__getitem__N)r   F)F)__name__
__module____qualname____doc__propertyr   r   strr
   r   r   boolintr;   r   r   r.   rF   rH   rJ   r   r   r   r   r      sD    	
9
&)r'   typingr   r   r   r   numpyr0   "nemo.collections.common.tokenizersr   nemo.core.classesr   nemo.core.neural_typesr   r	   r
   __all__r   r   r   r   r   <module>   s   