o
    }oi F                  	   @   s   d dl Z d dlZd dlmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZ d dlmZ d dlmZ z
d dlmZ dZ W n e!e"fyq   dZ Y nw dgZ#G dd deZ$dS )    N)ListOptional)instantiate)Trainer)
DictConfig)CrossEntropyLoss)ClassificationReport)TokenClassifier)tensor2list)HeteronymClassificationDataset)get_heteronym_spansget_wordid_to_phonemesread_wordids)PretrainedModelInfo)logging)NLPModelTFHeteronymClassificationModelc                       s  e Zd ZdZdAdedef fddZdd Zd	d
 Zdd Z	 fddZ
dBddZdd Zdd Zdd ZdefddZdedeee  dee fddZe 	 	!	dCd"ee d#ed$edee fd%d&Ze dDd(ed#ed$efd)d*Ze 	'	 	!	dEd+ed,ed#ed$edee f
d-d.Zd/ee fd0d1Zd2ee fd3d4Zd5ee fd6d7Zded8efd9d:Zd(ed,ed#ed$ed;d<f
d=d>Zed;ee  fd?d@Z!  Z"S )Fr   z
    This is a classification model that selects the best heteronym option out of possible dictionary entries.
    Supports only heteronyms, no OOV.
    Ncfgtrainerc              	      sJ  |j | _ | d|j| _t| j\| _| _dd | j D | _t| j	 | _
|jjd u r_d}t|d}tt| jD ]}|| j| d  q=W d    n1 sTw   Y  | d| t j||d | jd	d | _t| j}t| j|| jjj| jjjd
| jjj| jjjd| _tdd| _t |dd| jd| _!d | _"d | _#d S )Nwordidsc                 S   s   i | ]\}}||qS  r   ).0kvr   r   l/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/g2p/models/heteronym_classification.py
<dictcomp>6       z9HeteronymClassificationModel.__init__.<locals>.<dictcomp>z/tmp/label_ids.csvw
zclass_labels.class_labels_file)r   r   langF)hidden_sizenum_classes
num_layers
activationlog_softmaxdropoutuse_transformer_init   )logits_ndimmacroT)r!   modedist_sync_on_step	label_ids)$max_seq_lengthregister_artifactr   r   heteronym_dictwordid_to_idxitemsidx_to_wordidlistkeyssupported_heteronymsclass_labelsclass_labels_fileopenrangelenwritesuper__init___cfggetr   r	   r    headnum_fc_layersr#   
fc_dropoutr&   
classifierr   lossr   classification_reportwordid_to_phonemes_filewordid_to_phonemes)selfr   r   label_ids_filefidxr!   	__class__r   r   r=   2   s>   


z%HeteronymClassificationModel.__init__c                 C   s2   | j |||d}t|tr|d }| j|d}|S )N	input_idsattention_masktoken_type_idsr   )hidden_states)
bert_model
isinstancetuplerC   )rH   rO   rP   rQ   rR   logitsr   r   r   forwardY   s   
z$HeteronymClassificationModel.forwardc                 C   sP   | j |d |d t|d d}d|v r"| j||d d}||fS d }||fS )NrO   rP   rN   targets)rV   labels)rW   torch
zeros_likerD   )rH   batchrV   rD   r   r   r   	make_stepb   s   z&HeteronymClassificationModel.make_stepc                 C   s   |  |\}}| d| |S )z
        Lightning calls this inside the training loop with the data from the training dataloader
        passed in as `batch`.
        
train_loss)r]   log)rH   r\   	batch_idxrD   rV   r   r   r   training_stepr   s   z*HeteronymClassificationModel.training_stepc                    s
   t   S N)r<   on_train_epoch_endrH   rL   r   r   rc   |   s   
z/HeteronymClassificationModel.on_train_epoch_endvalc                 C   s   |  |\}}|d }|d }||dk }| | d| tj|dd|dk }| ||\}	}
}}| d|d|	d	|
d
|i}|dkrL| j| |S |dkrV| j| |S )z
        Lightning calls this inside the validation loop with the data from the validation dataloader
        passed in as `batch`.
        subtokens_maskrX   i_lossaxisr   tpfnfpre   test)r]   r_   rZ   argmaxrE   validation_step_outputsappendtest_step_outputs)rH   r\   r`   splitval_lossrV   rf   rX   	tag_predsrk   rl   rm   _rD   r   r   r   validation_step   s   z,HeteronymClassificationModel.validation_stepc                    s  | j jrdnd  dkrt fdd| jD  }n dkr/t fdd| jD  }| j \}}}}d	dd |
dD }t  d|  t  d	|d
d | j  d|dd |   d| |   d| |   d| ||dd  
dd dd 
 d }||dd  
dd dd 
 d }|   dtt|g |   dtt|g | j   dkr| j  d S  dkr| j  d S d S )Nrn   re   c                       g | ]	}|  d  qS rg   r   r   xrs   r   r   
<listcomp>       zHHeteronymClassificationModel.on_validation_epoch_end.<locals>.<listcomp>c                    rx   ry   r   rz   r|   r   r   r}      r~   r   c                 S   s"   g | ]}| d sd|vr|qS )z          0z100.00     100.00     100.00)endswithrz   r   r   r   r}      s    z	_report: z_f1: z.2f%rg   T)prog_bar
_precision_f1_recallr)   r   z	macro avg microz	micro avg	_f1_macro	_f1_micro)r   testingrZ   stackrp   meanrr   rE   computejoinrs   r   infor_   indexreplacestripTensorfloatresetclear)rH   avg_loss	precisionrecallf1reportf1_macrof1_micror   r|   r   on_validation_epoch_end   s6    00
z4HeteronymClassificationModel.on_validation_epoch_endc                 C   s   |  ||dS )zx
        Lightning calls this inside the test loop with the data from the test dataloader passed in as `batch`.
        rn   )rw   )rH   r\   r`   r   r   r   	test_step   s   z&HeteronymClassificationModel.test_stepc                 C   s   |   S )z
        Called at the end of test to aggregate outputs.

        Args:
            outputs: list of individual outputs of each test step.
        )r   rd   r   r   r   on_test_epoch_end   s   z.HeteronymClassificationModel.on_test_epoch_endrF   c                 C   sN   |d u s
t j|st| d d S || _t| j| _td|  d S )Nz, not found, skip setting wordid_to_phonemes.z"Wordid to phonemes file is set to )	ospathexistsr   warningrF   r   rG   r   )rH   rF   r   r   r   set_wordid_to_phonemes   s
   z3HeteronymClassificationModel.set_wordid_to_phonemestext	start_endpredictionsc                 C   s   d}d}t |D ]8\}}|\}}	|| }
| jd u s|
| jvr%d|
 d}
n| j|
 }
ddd |
D }
|||| |
 7 }|	}q|t|k rO|||d  7 }|S )Nr   r   []c                 S   s   g | ]}d | d qS )|r   r   pr   r   r   r}          zBHeteronymClassificationModel._process_sentence.<locals>.<listcomp>)	enumeraterG   r   r:   )rH   r   r   r   text_with_heteronym_replacedlast_idxheteronym_idxcur_start_end	cur_startcur_endcur_predr   r   r   _process_sentence   s   
z.HeteronymClassificationModel._process_sentence   r   	sentences
batch_sizenum_workersc                 C   s6  t |tr|g}t|t|}t|| j\}}t|t|  kr't|kr7n ntdt| dt| d}t|d&}t|||D ]\}	}
}|	|
|d}|	t
j|ddd  qEW d	   n1 sgw   Y  | j|||d
}|d	ur}| | g }t|D ]\}}|| j|| ||| d q||fS )a  
        Replaces heteronyms, supported by the model, with the phoneme form (if wordid_to_phonemes_file)
        or with predicted wordids.

        Args:
            sentences: Sentences to use for inference
            batch_size: batch size to use during inference.
                Bigger will result in better throughput performance but would use more memory.
            num_workers: number of workers for DataLoader
            wordid_to_phonemes_file: (Optional) file with mapping between wordid predicted by the model to phonemes

        Returns:
            preds: model predictions
            output: sentences with heteronym replaced with phonemes (if wordid_to_phonemes_file specified)
        zLNumber of sentences should match the lengths of provided start-end indices, z != z/tmp/manifest.jsonr   )text_graphemesr   heteronym_spanFensure_asciir   N)manifestr   r   r   r   r   )rT   strminr:   r   r/   
ValueErrorr8   zipr;   jsondumps_disambiguater   r   rq   r   )rH   r   r   r   rF   r   
heteronymstmp_manifestrJ   cur_sentencecur_start_endscur_heteronymsitem	all_predsoutputsent_idxsent_start_endr   r   r   disambiguate   s<   
$
z)HeteronymClassificationModel.disambiguater   r   c              	      s  g } j }zztj rdnd}    |  j||||d}|D ]S}	|	d }
|	d ||	d |d}	 |	\}}ttj	|dd	|
d
k }dd t|
D }d
}|D ]}||||  } fdd|D }|
| ||7 }qYq#W  j|d |S  j|d w )Ncudacpu)grapheme_fieldr   r   rf   rO   rP   )rO   rP   rh   ri   r   c                 S   s   g | ]}t d d |D qS )c                 S   s   g | ]}|d kr|qS )   r   )r   p_r   r   r   r}   9  r   zIHeteronymClassificationModel._disambiguate.<locals>.<listcomp>.<listcomp>)r:   r   r   r   r   r}   9  s    z>HeteronymClassificationModel._disambiguate.<locals>.<listcomp>c                    s   g | ]} j | qS r   )r2   r   rd   r   r   r}   >  r   )r*   )trainingrZ   r   is_availableevalto_setup_infer_dataloaderr]   r
   ro   rq   train)rH   r   r   r   r   r   r*   deviceinfer_datalayerr\   rf   rv   rV   preds	preds_numr   numpreds_r   rd   r   r   !  s8   


z*HeteronymClassificationModel._disambiguateoutput_manifestr   c              	   C   s  | j ||||d}| | t|dddc}t|dddL}	t|D ]?\}
}t|}|d }t|dkr?t|d tr?|g}| j	|| |||
 d}||d	< ||
 |d
< |	
tj|ddd  q"W d    n1 slw   Y  W d    n1 s{w   Y  td|  |S )N)r   r   r   r   rzutf-8)encodingr   r   r   r   	pred_textpred_wordidFr   r   zPredictions save at )r   r   r8   r   r   loadsr:   rT   intr   r;   r   r   r   )rH   r   r   r   r   r   rF   r   f_inf_predsrK   liner   r   r   r   r   disambiguate_manifestF  s,   

 
 z2HeteronymClassificationModel.disambiguate_manifesttrain_data_configc                 C   8   |r|j jd u rtd d | _d S | j|dd| _d S )Nz`Dataloader config or file_path for the train is missing, so no data loader for train is created!r   r   
data_split)datasetr   r   r   	_train_dl_setup_dataloader_from_config)rH   r   r   r   r   setup_training_datai     z0HeteronymClassificationModel.setup_training_dataval_data_configc                 C   r   )NzjDataloader config or file_path for the validation is missing, so no data loader for validation is created!re   r   )r   r   r   r   _validation_dlr   )rH   r   r   r   r   setup_validation_datar  r   z2HeteronymClassificationModel.setup_validation_datatest_data_configc                 C   r   )Nz^Dataloader config or file_path for the test is missing, so no data loader for test is created!rn   r   )r   r   r   r   _test_dlr   )rH   r   r   r   r   setup_test_data{  r   z,HeteronymClassificationModel.setup_test_datar   c              
   C   s   d|vs
t |jtstd| d|vst |jts"td| t|j|jj|jj| j| j	| j
| jdd}tjjj|fd|ji|jS )Nr   zNo dataset for dataloader_paramszNo dataloader_params for Tr   r   	tokenizerr0   r/   max_seq_lenwith_labels
collate_fn)rT   r   r   r   r   r   r   r   r  r0   r/   r-   rZ   utilsdata
DataLoaderr  )rH   r   r   r   r   r   r   r     s   z:HeteronymClassificationModel._setup_dataloader_from_configreturnztorch.utils.data.DataLoaderc              	   C   s>   t ||| j| j| j| jjjdd}tjjj||j	|d|ddS )NFr   )r  r   shuffler   	drop_last)
r   r  r0   r/   model_max_lengthrZ   r  r  r  r  )rH   r   r   r   r   r   r   r   r   r     s"   
z4HeteronymClassificationModel._setup_infer_dataloaderc                 C   s   g S )z
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.
        Returns:
            List of available pre-trained models.
        r   )clsr   r   r   list_available_models  s   z2HeteronymClassificationModel.list_available_modelsrb   )re   )r   r   N)r   r   )r   r   r   N)#__name__
__module____qualname____doc__r   r   r=   rW   r]   ra   rc   rw   r   r   r   r   r   r   r   r   rZ   no_gradr   r   r   r   r   r   r   r   r   classmethodr   r  __classcell__r   r   rL   r   r   ,   sz    '	

%	"	9$"			
)%r   r   typingr   r   rZ   hydra.utilsr   lightning.pytorchr   	omegaconfr   nemo.collections.common.lossesr   2nemo.collections.nlp.metrics.classification_reportr   #nemo.collections.nlp.modules.commonr	   &nemo.collections.nlp.parts.utils_funcsr
   6nemo.collections.tts.g2p.data.heteronym_classificationr   nemo.collections.tts.g2p.utilsr   r   r   nemo.core.classes.commonr   
nemo.utilsr   %nemo.collections.nlp.models.nlp_modelr   NLP_AVAILABLEModuleNotFoundErrorImportError__all__r   r   r   r   r   <module>   s.   