o
    }oi*                     @   s~   d dl Z d dlZd dlZd dlmZmZmZmZ d dlZ	d dl
Z
d dlmZ d dlmZ d dlmZ dgZG dd deZdS )    N)DictListOptionalTuple)TokenizerSpec)Dataset)loggingHeteronymClassificationDatasetc                       s   e Zd Z			ddededeeeeef f deeef ded	ed
ef fddZ	ddede	e
eef  de	e dee	e  fddZdd Zdd Zdd Z  ZS )r	      text_graphemesTmanifest	tokenizerheteronym_dictwordid_to_idxmax_seq_lengrapheme_fieldwith_labelsc              	      sp  t    tj|st| d|| _|| _g | _d| _	|| _
|| _|| _d| _d| _d}t|dh}	|	D ]]}
t|
}
|
d |
d }}d|
v rP|
d }nt|trXd}ndgt| }t|tro|g|g|g}}}| |
| |||}|du r|d	7 }q7|d |d	 |d
 d}| j| q7W d   n1 sw   Y  td| dt| j d| d dS )u  
        Creates dataset to use to run training and inference on G2PClassificationModel.
        Processes WikiHomograph raw data files:
        https://github.com/google-research-datasets/WikipediaHomographData/tree/master/data

        Args:
            manifest: path to manifest with "heteronym_span", "start_end", "text_graphemes"
                and (optional) "word_id" fields. "word_id" is required for model training.
            tokenizer: pretrained tokenizer
            heteronym_dict: a dictionary where each grapheme contains word_id to ipa_form mappings, e.g.,
                {'use': {'use_nou': "'juːs", 'use_vrb': "'juːz"}}
            wordid_to_idx: mapping from word id to index
            max_seq_len: maximum input sequence length
            grapheme_field: name of the field in the .json manifest with grapheme input
            with_labels: indicates whether labels are provided in the manifest. False for inference, True for training
        z
 not foundr   ir	start_endheteronym_spanword_idN      )	input_idssubtokens_masktargetzNumber of samples in z: z	, remove z lines)super__init__ospathexists
ValueErrorr   r   data	pad_tokenr   r   r   LOSS_PAD_TOKEN	PAD_TOKENopenjsonloads
isinstancestrlen_prepare_sampleappendr   info)selfr   r   r   r   r   r   r   num_skippedflinecur_start_endcur_heteronymscur_word_idsexampleexample_dict	__class__ j/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/g2p/data/heteronym_classification.pyr      sF   





(z'HeteronymClassificationDataset.__init__Nsentencer   
heteronymsword_idsc                 C   s  t | j|d }|| jkrtd| j d| d dS t||D ] \}}| ||d |d   krCtd| d	  dS q#g g g }}	}
t| jd
rk|	| jj
 |		| j | jri|
	| j nd}
d}dd td|D }|D ]}|\}}|t |k r6|d || d   kr|d k r6n n|| }d}g }|d |d k r||d |d  }| j|}|	| jgt |  |||t | d }| j|}|||  |	dg| jgt |d    | jr| j||  }|
| jgt | |g | jgt |d    | |d |d  }|| jvr1t| d  dS |d7 }qz| j|}|| |	| jgt |  | jr[|
| jgt |  qz|t |k rjtd dS t| jdr|	| jj |		| j | jr|
	| j ||	|
fS )a  
        Prepares a single training sample

        Args:
            sentence: input sentence in grapheme form
            start_end: start and end indices of the heteronym spans, start_end indices should be in increasing order
            heteronyms: heteronyms present in the sentence
            word_ids: [Optional] target word_ids, use None for inference, e.g. ['diffuse_adj']
        r   z-Sequence length exceeds max sequence length (z): .Nr   r   z	Span for z  is incorrect. Skipping example.bos_idc                 S   s*   g | ]}| d | | d ffqS )r   r   )groupstartend).0mr:   r:   r;   
<listcomp>   s   * zBHeteronymClassificationDataset._prepare_sample.<locals>.<listcomp>z\S+ z$ is not supported. Skipping example.z9Not all heteronym spans were processed. Skipping example.eos_id)r+   r   text_to_tokensr   r   debugziplowerhasattrr-   r@   r%   r   r$   refinditertext_to_idsextendindexr   r   r.   rH   )r/   r<   r   r=   r>   length
heteronym_
start_end_r   r   target_word_idsheteronym_span_idxmatchesmatchwordword_start_endheteronym_start_endprefix
prefix_idsword_input_idscur_target_word_id	heteronymidsr:   r:   r;   r,   g   s   
 * 



z.HeteronymClassificationDataset._prepare_samplec                 C   s
   t | jS N)r+   r"   )r/   r:   r:   r;   __len__      
z&HeteronymClassificationDataset.__len__c                 C   s
   | j | S rc   )r"   )r/   rR   r:   r:   r;   __getitem__   re   z*HeteronymClassificationDataset.__getitem__c                 C   s\  t dd |D }g }g }g }| jrg }|D ]s}|d }t||k ri|t| }	|dgt| dg|	   |tj|d|	g| jd |tj|d d|	g| jd | jrh|tj|d d|	g| jd q|dgt|  || ||d  | jr||d  qt	t
|t	t
|t	t
|d	}
| jrt	||
d
< |
S )zx
        Args:
            batch:  A list of tuples of (input_ids, subtokens_mask, [Optional] target_word_ids).
        c                 S   s   g | ]}t |d  qS )r   )r+   )rD   entryr:   r:   r;   rF      s    z>HeteronymClassificationDataset._collate_fn.<locals>.<listcomp>r   r   r   )	pad_widthconstant_valuesr   r   )r   attention_maskr   targets)maxr   r+   r-   nppadr%   r$   torch
LongTensorarray)r/   batch
max_lengthpadded_input_idspadded_subtokens_maskpadded_attention_maskpadded_targetsitemr   rh   outputr:   r:   r;   _collate_fn   sB   
z*HeteronymClassificationDataset._collate_fn)r
   r   Trc   )__name__
__module____qualname__r*   r   r   intboolr   r   r   r   r,   rd   rf   rz   __classcell__r:   r:   r8   r;   r	      s@    
M

g)r'   r   rN   typingr   r   r   r   numpyrm   ro   1nemo.collections.common.tokenizers.tokenizer_specr   nemo.core.classesr   
nemo.utilsr   __all__r	   r:   r:   r:   r;   <module>   s   