o
    }oi<L                  
   @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlm Z  d d	l!m"Z"m#Z# d d
l$m%Z% zd dl&m'Z' W n e(y Z) zW Y dZ)[)ndZ)[)ww ee
e*f Z+G dd deZ,G dd deZ-G dd deZ.d%ddZ/d%ddZ0d&ddZ1G dd  d Z2G d!d" d"e2e"Z3G d#d$ d$e2e#Z4dS )'    )annotationsNPath)	AnyCallableDictIterableList
NamedTupleOptionalSetUnion)pad_sequence)tqdm)_speech_collate_fn)TokenizerSpec)DatasetIterableDataset)logging)
Normalizerc                   @  s&   e Zd ZU ded< ded< ded< dS )TextToTextItemtorch.Tensortts_text
transcriptintspeakerN)__name__
__module____qualname____annotations__ r    r    Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/data/text_to_text.pyr   -   s   
 r   c                   @  sD   e Zd ZU ded< ded< ded< ded< ded< edddZdS )TextToTextBatchr   	tts_textstts_text_lengthstranscriptstranscript_lengthsspeakersbatchList[TextToTextItem]
asr_pad_idr   tts_text_pad_idreturnc              	   C  sv   t tdd | D d|dtdd | D  tdd | D d|dtdd | D  tdd | D  d	S )
Nc                 S     g | ]}|j qS r    r   .0itemr    r    r!   
<listcomp>=       z.TextToTextBatch.collate_fn.<locals>.<listcomp>Tbatch_firstpadding_valuec                 S     g | ]}|j jd  qS r   r   shaper/   r    r    r!   r2   >       c                 S  r-   r    r   r/   r    r    r!   r2   ?   r3   c                 S  r7   r8   r   r:   r/   r    r    r!   r2   @   r;   c                 S  r-   r    r   r/   r    r    r!   r2   A   r3   )r#   r$   r%   r&   r'   )r"   r   torchtensorlongr(   r*   r+   r    r    r!   
collate_fn:   s   zTextToTextBatch.collate_fnN)r(   r)   r*   r   r+   r   r,   r"   r   r   r   r   staticmethodrC   r    r    r    r!   r"   3   s   
 r"   c                   @  sT   e Zd ZU ded< ded< ded< ded< ded< ded< ded< edddZdS )TextOrAudioToTextBatchr   audio_signalsaudio_signal_lengthsr#   r$   r'   r%   r&   r(   "List[Union[TextToTextItem, tuple]]r+   r   r*   r,   5Union[TextToTextBatch, TextOrAudioToTextBatch, tuple]c              	   C  s  dd | D }|st | |dS dd | D }|s tj|||dS tdd |D ddd	}td
d |D  }tdd |D d|d	}tdd |D  }tdd |D  }	tdd |D dd |D  d|d	}
tdd |D dd |D   }t|||||	|
|dS )
        Collate function for dataloader
        Can accept mixed batch of text-to-text items and audio-text items (typical for ASR)
        c                 S  s   g | ]	}t |tr|qS r    
isinstancer   r/   r    r    r!   r2   V       z5TextOrAudioToTextBatch.collate_fn.<locals>.<listcomp>)r(   pad_idc                 S  s   g | ]	}t |ts|qS r    rL   r/   r    r    r!   r2   [   rN   rB   c                 S     g | ]}|d  qS r8   r    r/   r    r    r!   r2   e       Tg        r4   c                 S  rP   )   r    r/   r    r    r!   r2   f   rQ   c                 S  r-   r    r.   r/   r    r    r!   r2   i   r3   c                 S  r7   r8   r9   r/   r    r    r!   r2   k   r;   c                 S  r-   r    r>   r/   r    r    r!   r2   l   r3   c                 S  r-   r    r<   r/   r    r    r!   r2   o   r3   c                 S  rP   )   r    r/   r    r    r!   r2   o   rQ   c                 S  r7   r8   r=   r/   r    r    r!   r2   t   r;   c                 S  rP   )   r    r/   r    r    r!   r2   t   rQ   )rG   rH   r#   r$   r'   r%   r&   )r   r"   rC   r   r?   r@   rA   rF   )r(   r+   r*   
text_items	asr_itemsrG   rH   r#   r$   r'   r%   r&   r    r    r!   rC   N   s@   z!TextOrAudioToTextBatch.collate_fnN)r(   rI   r+   r   r*   r   r,   rJ   rD   r    r    r    r!   rF   E   s   
 rF   textstrr,   
np.ndarrayc                 C  s8   t | }tdurtg| }tdur|t t|S )z
    Helper function for asr tokenization with multiprocessing pool only.
    Must be defined on the top level.
    Expects asr_tokenizer_global, asr_bos_id_global, asr_eos_id_global to exist in the current pool process
    N)asr_tokenizer_globaltext_to_idsasr_bos_id_globalasr_eos_id_globalappendnpasarray)rW   idsr    r    r!   _asr_text_to_tokens   s   



rb   c                 C  s   t t| S )z
    Helper function for asr tokenization with multiprocessing pool only.
    Must be defined on the top level.
    Expects tts_tokenizer_global to exist in the current pool process
    )r_   r`   tts_tokenizer_global)rW   r    r    r!   _tts_text_to_tokens   s   rd   filepathAnyPathIterable[Dict[str, Any]]c                 c  sP    t | ddd}|D ]
}t|}|V  qW d   dS 1 s!w   Y  dS )z-
    Helper function to iterate manifest
    rzutf-8)encodingN)openjsonloads)re   flinerecordr    r    r!   _iterate_manifest   s   
"rp   c                      s   e Zd ZU dZded< ded< dZded< dZded< d	ed
< 					d/d0 fd d!Zd1d&d'Zd2d1d)d*Z	d+d, Z
d-d. Z  ZS )3TextToTextDatasetBasezu
    Base class for loading text-to-text manifests
    Map-style and Iterable datasets should inherit this class
    r   r*   r+   NzOptional[int]
asr_bos_id
asr_eos_idzList[Dict[str, Any]]datarR   @B r   manifest_filepathUnion[AnyPath, List[AnyPath]]speakers_filepathasr_tokenizerr   asr_use_start_end_tokenbool
tts_parserr   tts_text_normalizer'Normalizer'tts_text_normalizer_call_kwargsr   	min_words	max_wordstokenizer_workers	num_partscurrent_part_indexc           "   
     sz  t    |rt|dr|j_|rt|dr|j_t|dr%|j_nd_|_	|_
|_|_|_t|trB|d}nt|trJ|g}t }|D ]'}tt| d}|tt|   W d    n1 sqw   Y  qOtt|_tdtj d t|tr|d}nt|tr|g}d	d
 |D _ d}d}g }g }d j D ]E}t!t"|D ]<}|d }t| }|	|  kr|
ksn ||7 }|d7 }q|#|d  d|v r|#|d  q|#|d  d qq rt$d |rt$d| d|  t|}|dkrC|| }|| }|| }td| d| d| d|  ||| }||| }|}dd
 t%|D _&t|dkr[t$d d S |dkrt$d t't!fdd|D t|dD ]\}}|j&| d< qvn@dd  }t(j)j*||jjf|d!%} t't!| jt+|d"d#t|dD ]\}}|j&| d< qW d    n	1 sw   Y  ~t,-  |dkrt$d t't! fd$d|D t|dD ]\}}|j&| d%< qnC rt.d&d'd( }!t(j)j*|!|f|d!%} t't!| jt/|d"d#t|dD ]\}}|j&| d%< qW d    n	1 s1w   Y  ~t,-  d S ))N	bos_token	eos_token	pad_tokenr   ,rh   zLoaded z	 speakersc                 S  s   g | ]}t |qS r    r   )r0   re   r    r    r!   r2      rQ   z2TextToTextDatasetBase.__init__.<locals>.<listcomp>FrW   rR   tts_text_normalizedr   TzLTTS normalization is extremely slow! It is recommended to normalize TTS textzSkipped z utterances with zTaking part of the dataset: z index, total z from z to c                 S  s   g | ]}t  qS r    )dict)r0   _r    r    r!   r2     r3   zText-to-text dataset is emptyzPreprocessing large text with tokenizer_workers=1 may be slow with TTS tokenizer. Prefer tokenizer_workers=(num_cpu_cores/num_gpus_per_node)c                 3  s    | ]}  |V  qd S N)rb   r0   rW   selfr    r!   	<genexpr>  s    z1TextToTextDatasetBase.__init__.<locals>.<genexpr>)totalasr_text_tokensc                 S  s"   t | at |at |ad S r   )copydeepcopyrZ   r\   r]   )	tokenizerbos_ideos_idr    r    r!   _init_asr_tokenize_process$  s   

zBTextToTextDatasetBase.__init__.<locals>._init_asr_tokenize_process)initializerinitargsmax_workersi  )	chunksizec                 3  s    | ]
}j | d V  qdS ))	normalizeN)rd   r   need_normalizationr   r    r!   r   @  s    tts_text_tokenszNormalization with tokenizer_workers > 1 is not implemented. It is not recommended to use normalization on the fly at all, since it's extremely slowc                 S  s   t | ad S r   )r   r   rc   )r   r    r    r!   _init_tts_tokenize_processM  s   zBTextToTextDatasetBase.__init__.<locals>._init_tts_tokenize_process)0super__init__hasattrr   rr   r   rs   rO   r*   ry   r|   tts_normalizertts_normalizer_kwargsr+   rM   rX   splitr   setrj   
expanduserupdatemapr   readr_   r`   sortedr'   r   infolenmanifest_pathsr   rp   r^   warningrangert   	enumerate
concurrentfuturesProcessPoolExecutorrb   gccollectNotImplementedErrorrd   )"r   rv   rx   ry   rz   r|   r+   r}   r   r   r   r   r   r   r'   re   rm   num_skipped_wordsnum_skipped_utterances	asr_textsr#   manifest_pathtmp_itemrW   	num_wordsnum_utterancesnum_utterances_partstartenditokenized_textr   poolr   	__class__r   r!   r      s   












	zTextToTextDatasetBase.__init__rW   rX   r,   rY   c                 C  sB   | j |}| jd ur| jg| }| jd ur|| j t|S r   )ry   r[   rr   rs   r^   r_   r`   )r   rW   ra   r    r    r!   rb   ]  s   


z)TextToTextDatasetBase._asr_text_to_tokensTc                 C  s.   |r| j j|fi | j}| |}t|S r   )r   r   r   r|   r_   r`   )r   rW   r   tokensr    r    r!   rd   e  s   

z)TextToTextDatasetBase._tts_text_to_tokensc                 C  s<   | j | }tt|d  t|d  t| jdS )Nr   r   )r   r   r   )rt   r   r?   
from_numpyrA   randomchoicer'   )r   indexr1   r    r    r!   __getitem__k  s   

z!TextToTextDatasetBase.__getitem__c                 C  s
   t | jS r   )r   rt   r   r    r    r!   __len__s  s   
zTextToTextDatasetBase.__len__rR   ru   rR   rR   r   rv   rw   rx   rw   ry   r   rz   r{   r|   r   r+   r   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   rW   rX   r,   rY   )T)r   r   r   __doc__r   rr   rs   r   rb   rd   r   r   __classcell__r    r    r   r!   rq      s$   
  
/rq   c                      s4   e Zd ZdZ			dd fddZd ddZ  ZS )!TextToTextDatasetz8Text-to-Text Map-style Dataset for hybrid ASR-TTS modelsrR   ru   rv   rw   rx   ry   r   rz   r{   r|   r   r+   r   r}   r~   r   r   r   r   r   c                   s(   t  j|||||||||	|
|dd d S )NrR   )rv   rx   ry   rz   r|   r+   r}   r   r   r   r   r   r   r   )r   rv   rx   ry   rz   r|   r+   r}   r   r   r   r   r   r    r!   r   z  s   
zTextToTextDataset.__init__r(   rI   r,   rJ   c                 C     t j|| j| jdS rK   rB   rF   rC   r*   r+   r   r(   r    r    r!   rC        
zTextToTextDataset.collate_fn)rR   ru   rR   )rv   rw   rx   rw   ry   r   rz   r{   r|   r   r+   r   r}   r~   r   r   r   r   r   r   r   r   r(   rI   r,   rJ   )r   r   r   r   r   rC   r   r    r    r   r!   r   w  s    r   c                      s@   e Zd ZdZ					d#d$ fddZdd Zd%d!d"Z  ZS )&TextToTextIterableDatasetz
    Text-to-Text Iterable Dataset for hybrid ASR-TTS models
    Only part necessary for current process should be loaded and stored
    rR   ru   r   rv   rw   rx   ry   r   rz   r{   r|   r   r+   r   r}   r~   r   r   r   r   r   r   r   c                   s*   t  j|||||||||	|
|||d d S )N)rv   rx   ry   rz   r|   r+   r}   r   r   r   r   r   r   r   )r   rv   rx   ry   rz   r|   r+   r}   r   r   r   r   r   r   r   r    r!   r     s   
z"TextToTextIterableDataset.__init__c                 C  s   t jj }|d u rd}t| }nttt| t|j	 }|j
}|| }t|| t| }t||}tj| t| j|S )Nr   )r?   utilsrt   get_worker_infor   r   mathceilfloatnum_workersidminr_   aranger   shuffler   r   )r   worker_infor   r   
per_worker	worker_idindicesr    r    r!   __iter__  s   
z"TextToTextIterableDataset.__iter__r(   rI   r,   rJ   c                 C  r   r   r   r   r    r    r!   rC     r   z$TextToTextIterableDataset.collate_fnr   r   r   )r   r   r   r   r   r   rC   r   r    r    r   r!   r     s     r   r   )re   rf   r,   rg   )5
__future__r   concurrent.futuresr   r   r   rk   r   r   pathlibr   typingr   r   r   r   r	   r
   r   r   r   numpyr_   r?   torch.utils.datatorch.nn.utils.rnnr   	tqdm.autor   'nemo.collections.asr.data.audio_to_textr   "nemo.collections.common.tokenizersr   nemo.core.classesr   r   
nemo.utilsr   1nemo_text_processing.text_normalization.normalizer   	ExceptionerX   rf   r   r"   rF   rb   rd   rp   rq   r   r   r    r    r    r!   <module>   sF   ,
=

	
 U,