o
    wi                     @   sb   d dl Z d dlZd dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 dgZG dd deZdS )    N)List)PreTrainedTokenizerBase)Dataset)loggingCTCG2PBPEDatasetc                       s   e Zd Z						ddededed	ed
ee dedededef fddZdd Z	dd Z
dedee fddZdd Z  ZS )r   TN   texttext_graphemesmanifest_filepathtokenizer_graphemestokenizer_phonemesdo_lowerlabelsmax_source_lenphoneme_fieldgrapheme_fieldwith_labelsc
              	      s  t    tj|st| d|| _|| _|| _|| _	|| _
dd t|D | _g | _d| _|	| _d}
d}t|d}td|  t|D ]\}}t|}|r]||  ||< t| jtrq| || }t|d }n| j|| }t|}|	r| j|| }t|}||kr|
d7 }
qJ||kr|d7 }qJ| j|| || ||d	 qJt||kr|| d
| ||< |d7 }| jd|| i qJW d
   n1 sw   Y  td|
 d| d|  d
S )a   
        Creates a dataset to train a CTC-based G2P models.

        Args:
            manifest_filepath: path to a .json manifest that contains "phoneme_field" and "grapheme_field"
            tokenizer_graphemes: tokenizer for graphemes
            tokenizer_phonemes: tokenizer for phonemes
            do_lower: set to True to lower case input graphemes
            labels: output labels (tokenizer_phonemes vocabulary)
            max_source_len: max length of the grapheme input sequence (examples exceeding len will be dropped)
            phoneme_field: name of the field in manifest_filepath for ground truth phonemes
            grapheme_field: name of the field in manifest_filepath for input grapheme text
            with_labels: set to True for training and False for inference
        z
 not foundc                 S   s   i | ]\}}||qS  r   ).0ilr   r   ^/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/tts/g2p/data/ctc.py
<dictcomp>A   s    z-CTCG2PBPEDataset.__init__.<locals>.<dictcomp>r   rzLoading dataset from: 	input_ids   )	graphemesphonemestarget
target_lenNr   zRemoved z examples on CTC constraint, z' examples based on max_source_len from )super__init__ospathexists
ValueErrormanifestr   r   r   r   	enumeratelabels_tkn2iddata	pad_tokenr   openr   debugjsonloadslower
isinstancer   lentext_to_idsappendinfo)selfr
   r   r   r   r   r   r   r   r   removed_ctc_maxremoved_source_maxf_inr   lineitemgrapheme_tokensgrapheme_tokens_lentarget_tokensr   	__class__r   r   r!      sh   

	
+zCTCG2PBPEDataset.__init__c                 C   s
   t | jS N)r1   r)   r5   r   r   r   __len__w      
zCTCG2PBPEDataset.__len__c                 C   s
   | j | S r@   )r)   )r5   indexr   r   r   __getitem__z   rC   zCTCG2PBPEDataset.__getitem__returnc                 C   s.   g }t | D ]\}}|| j|  q|S )z- Creates a mapping from target labels to ids.)r'   splitr3   r(   )r5   r   tokensword_idwordr   r   r   map}   s   zCTCG2PBPEDataset.mapc                    sN  dd |D }t jtr(j|djddd}|j|j}}t|dd }n,fdd|D }d	d |D }t|  fd
dt	||D }d }t
|}t
|}js^|||f}|S dd |D }dd |D }	t|	}
g }t	||	D ]\}}d|
| f}tjjj||tjd}|| qwt|}t|	}	|||||	f}|S )Nc                 S   s   g | ]}|d  qS )r   r   r   entryr   r   r   
<listcomp>       z0CTCG2PBPEDataset._collate_fn.<locals>.<listcomp>longestTpt)padding
max_length
truncationreturn_tensorsr   c                    s   g | ]} j |qS r   )r   r2   )r   sentencerA   r   r   rN      s    c                 S   s   g | ]}t |qS r   )r1   rL   r   r   r   rN      rO   c                    s"   g | ]\}}|d g |   qS )r   r   )r   rM   	entry_len)max_lenr   r   rN      s   " c                 S      g | ]	}t |d  qS )r   torchtensorrL   r   r   r   rN          c                 S   rY   )r   rZ   rL   r   r   r   rN      r]   r   )value)r0   r   r   r   r   attention_maskr[   summaxzipr\   r   nn
functionalpadr1   r   r3   stack)r5   batchgraphemes_batchinput_encodingr   r_   	input_lenoutputtargetstarget_lengthsmax_target_lenpadded_targetsr   r   re   
target_padr   )rX   r5   r   _collate_fn   sB   




zCTCG2PBPEDataset._collate_fn)TNr   r   r	   T)__name__
__module____qualname__strr   boolr   intr!   rB   rE   rK   rq   __classcell__r   r   r>   r   r      s<    	
Z)r-   r"   typingr   r[   transformersr   nemo.core.classesr   
nemo.utilsr   __all__r   r   r   r   r   <module>   s   