o
    }oi$                     @   sV   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ dgZ	G dd deZ
dS )    N)PreTrainedTokenizerBase)Dataset)loggingT5G2PDatasetc                       sh   e Zd ZdZ						ddeded	ed
ededededef fddZdd Z	dd Z
dd Z  ZS )r   z3
    Creates a dataset to train a T5G2P model.
       Ftext_graphemestextTmanifest_filepath	tokenizermax_source_lenmax_target_lendo_lowergrapheme_fieldphoneme_fieldwith_labelsc	                    s  t    tj|st| d|| _|| _|| _|| _	|| _
g | _d}	t|d}
td|  |
D ]}t|}|| }|rG| }|rt| j|}||krc|	d7 }	td| d q6t| j|| }|dkr||kr|	d7 }	td| d q6| j||| d	 q6| |d
 }t||kr|d|d  }| j|gd }| jd|i q6W d   n1 sw   Y  td|	 d| d dS )a  
        Dataset to train T5-based G2P generative model.

        Args:
            manifest_filepath: path to a .json manifest that contains "phoneme_field" and "grapheme_field"
            tokenizer: pretrained T5 tokenizer
            max_source_len: max length of the grapheme input sequence (examples exceeding len will be dropped)
            max_target_len: max length of the phoneme sequence (examples exceeding len will be dropped)
            do_lower: a flag that indicates whether to lower case input grapheme sequence
            phoneme_field: name of the field in manifest_filepath for ground truth phonemes
            grapheme_field: name of the field in manifest_filepath for input grapheme text
            with_labels: set to True for training and False for inference
        z
 not foundr   rzLoading dataset from:    z	dropping z longer max_source_lenz longer max_target_len)	graphemesphonemes	input_idsNr   z	Filtered z too long entries from .)super__init__ospathexists
ValueErrorr
   r   r   r   r   dataopenr   infojsonloadslowerlentokenizedebugappendbatch_decode)selfr	   r
   r   r   r   r   r   r   num_filteredf_inlineitemr   graphemes_len
target_lengraphemes_tokenizedgraphemes_tokenized_truncated	__class__ T/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/g2p/data/t5.pyr      sJ   

zT5G2PDataset.__init__c                 C   s
   t | jS N)r#   r   r(   r3   r3   r4   __len__f      
zT5G2PDataset.__len__c                 C   s
   | j | S r5   )r   )r(   indexr3   r3   r4   __getitem__i   r8   zT5G2PDataset.__getitem__c           
         s   dd |D } j |d jddd}|j|j}}||f} jrGdd |D } j |d jdd}|j}	 fd	d|	D }	t|	}	|||	f}|S )
Nc                 S      g | ]}|d  qS )r   r3   .0entryr3   r3   r4   
<listcomp>m       z,T5G2PDataset._collate_fn.<locals>.<listcomp>longestTpt)padding
max_length
truncationreturn_tensorsc                 S   r;   )r   r3   r<   r3   r3   r4   r?   y   r@   )rC   rD   rE   c                    s   g | ]} fd d|D qS )c                    s    g | ]}| j jkr|nd qS )i)r
   pad_token_id)r=   labelr6   r3   r4   r?      s     z7T5G2PDataset._collate_fn.<locals>.<listcomp>.<listcomp>r3   )r=   labels_exampler6   r3   r4   r?      s    )r
   r   r   attention_maskr   r   torchtensor)
r(   batchgraphemes_batchinput_encodingr   rJ   outputphonemes_batchtarget_encodinglabelsr3   r6   r4   _collate_fnl   s$   



zT5G2PDataset._collate_fn)r   r   Fr   r   T)__name__
__module____qualname____doc__strr   intboolr   r7   r:   rT   __classcell__r3   r3   r1   r4   r      s8    	G)r    r   rK   transformersr   nemo.core.classesr   
nemo.utilsr   __all__r   r3   r3   r3   r4   <module>   s   