o
    iz*                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
m
Z
 dddZdd Zddd	Zd
d ZG dd dejjZG dd de	jZdd ZdS )    N)	extension)tqdmc           	      C   s>  |dur?t j|dd |d t j|  d }t j|r>td|  t|d}|d dd |d	 d
 |d d
 fS ntd td|   t	| |}t
||d \}}|durtd|  t|d&}|jdt|ftjtjdd}||dd< ||d	< ||d< W d   n1 sw   Y  |||fS )a  Load and save HDF5 that contains a dataset and stats for LM

    Args:
        path (str): The path of an input text dataset file
        label_dict (dict[str, int]):
            dictionary that maps token label string to its ID number
        outdir (str): The path of an output dir

    Returns:
        tuple[list[np.ndarray], int, int]: Tuple of
            token IDs in np.int32 converted by `read_tokens`
            the number of tokens by `count_tokens`,
            and the number of OOVs by `count_tokens`
    NT)exist_ok/z.h5zloading binary dataset: rdatan_tokens n_oovsz;skip dump/load HDF5 because the output dir is not specifiedzreading text dataset: <unk>zsaving binary dataset: w)vlendtype)osmakedirspathbasenameexistslogginginfoh5pyFileread_tokenscount_tokenscreate_datasetlenspecial_dtypenpint32)	r   
label_dictoutdirfilenamefretr   r
   r   r	   r	   F/home/ubuntu/.local/lib/python3.10/site-packages/espnet/lm/lm_utils.pyload_dataset   s.   &



r&   c                    sR   g } d t t| dddD ]}|tj fdd| D tjd q|S )zRead tokens as a sequence of sentences

    :param str filename : The name of the input file
    :param dict label_dict : dictionary that maps token label string to its ID number
    :return list of ID sequences
    :rtype list
    r   r   zutf-8)encodingc                    s   g | ]}  |qS r	   )get).0labelr    unkr	   r%   
<listcomp>K       zread_tokens.<locals>.<listcomp>r   )r   openappendr   arraysplitr   )r"   r    r   lnr	   r+   r%   r   =   s   	r   c                 C   s@   d}d}| D ]}|t |7 }|dur|t||k7 }q||fS )zCount tokens and oovs in token ID sequences.

    Args:
        data (list[np.ndarray]): list of token ID sequences
        unk_id (int): ID of unknown token

    Returns:
        tuple: tuple of number of token occurrences and number of oov tokens

    r   N)r   r   count_nonzero)r   unk_idr   r
   sentencer	   r	   r%   r   Q   s   r   c                 C   s<   t | d | d  | d< d| v rt | d | d< dS dS )zgComputes and add the perplexity to the LogReport

    :param dict result: The current observations
    z	main/lossz
main/count
perplexityzvalidation/main/lossval_perplexityN)r   exp)resultr	   r	   r%   compute_perplexityf   s   r;   c                   @   sL   e Zd ZdZ	dddZdd Zdd	 Zed
d Zedd Z	dd Z
dS )ParallelSentenceIteratora!  Dataset iterator to create a batch of sentences.

    This iterator returns a pair of sentences, where one token is shifted
    between the sentences like '<sos> w1 w2 w3' and 'w1 w2 w3 <eos>'
    Sentence batches are made in order of longer sentences, and then
    randomly shuffled.
    r   Tc                    s   | _ || _d| _d| _|| _t }g | _|dkrmttt  fddd}	d}
|
|k rdt	|
| |}|dkrQt |	|
  }t	||
t
||| d  d }| jt|	|
|  |}
|
|k s.|rlt| j ndd tj|D | _d| _|| _|| _d	| _d S )
Nr   F   c                    s   t  |   S N)r   )idatasetr	   r%   <lambda>   s    z3ParallelSentenceIterator.__init__.<locals>.<lambda>)keyc                 S   s   g | ]}t |gqS r	   )r   r1   )r)   r?   r	   r	   r%   r-      r.   z5ParallelSentenceIterator.__init__.<locals>.<listcomp>      )rA   
batch_sizeepochis_new_epochrepeatr   batch_indicessortedrangeminmaxr0   r   r1   randomshufflesixmoves	iterationsoseos_previous_epoch_detail)selfrA   rE   
max_lengthrS   rT   rH   rO   lengthindicesbsbesent_lengthr	   r@   r%   __init__z   s8   
z!ParallelSentenceIterator.__init__c                 C   s   t | j}| js| j|krtg }| j| j|  D ]}|t| jg| j| t| j| | j	gf q| j
| _|  jd7  _| j| }| j|k | _| jrQ|| _|S )Nr=   )r   rI   rH   rR   StopIterationr0   r   rS   rA   rT   epoch_detailrU   rF   rG   )rV   	n_batchesbatchidxrF   r	   r	   r%   __next__   s"   

z!ParallelSentenceIterator.__next__c                 C   s   t | j d S r>   )rN   rO   rI   rV   r	   r	   r%   start_shuffle   s   z&ParallelSentenceIterator.start_shufflec                 C   s   | j t| j S r>   )rR   r   rI   rd   r	   r	   r%   r_      s   z%ParallelSentenceIterator.epoch_detailc                 C   s   | j dk rd S | j S )Nr   )rU   rd   r	   r	   r%   previous_epoch_detail   s   
z.ParallelSentenceIterator.previous_epoch_detailc                 C   s   |d| j | _ |d| j| _z
|d| j| _W d S  tyB   | j| jd t| j  | _| jdkr<t| jd| _Y d S d| _Y d S w )NrR   rF   rf   r=   r           rD   )	rR   rF   rU   KeyErrorcurrent_positionr   rI   r_   rM   rV   
serializerr	   r	   r%   	serialize   s   
z"ParallelSentenceIterator.serializeN)r   r   r   TT)__name__
__module____qualname____doc__r]   rc   re   propertyr_   rf   rl   r	   r	   r	   r%   r<   q   s    	
+

r<   c                       s2   e Zd ZdZd
 fdd	Zdd Zdd	 Z  ZS )MakeSymlinkToBestModelzExtension that makes a symbolic link to the best model

    :param str key: Key of value
    :param str prefix: Prefix of model files and link target
    :param str suffix: Suffix of link target
    modelbestc                    s0   t t|   d| _d| _|| _|| _|| _d S )Nrg   )superrr   r]   
best_modelmin_lossrC   prefixsuffix)rV   rC   ry   rz   	__class__r	   r%   r]      s   
zMakeSymlinkToBestModel.__init__c                 C   s   |j }| j|v rO|| j }| jdks|| jk rQ|| _|jj| _d| j| jf }tj	|j
d| j| jf }tj|r@t| t|| td|  d S d S d S )Nru   z%s.%dz%s.%szbest model is )observationrC   rw   rx   updaterrF   ry   r   r   joinoutrz   lexistsremovesymlinkr   r   )rV   trainerr}   losssrcdestr	   r	   r%   __call__   s   



zMakeSymlinkToBestModel.__call__c                 C   s   t |tjjr'|d| j |d| j |d| j |d| j |d| j d S |dd| _|dd| _|dd| _|dd	| _|dd
| _d S )N_best_model	_min_loss_key_prefix_suffixru   rg    rs   rt   )	
isinstancechainerrk   
Serializerrw   rx   rC   ry   rz   rj   r	   r	   r%   rl     s   z MakeSymlinkToBestModel.serialize)rs   rt   )rm   rn   ro   rp   r]   r   rl   __classcell__r	   r	   r{   r%   rr      s
    rr   c                    s   i ddg}|   D ]g\}}|dkrp||krpd fdd|D v r!q	|d }t|D ]F\}} | }	|	|vrAi d|d |fg||	< n||	 d }
t|
d |d t|
d |f||	 d< |t|d kri|||	 d< ||	 d }q)q	|S )	z7Make a lexical tree to compute word-level probabilitiesru   Nr   Tc                    s   g | ]}| vqS r	   r	   )r)   csubword_dictr	   r%   r-     s    z%make_lexical_tree.<locals>.<listcomp>r=      )items	enumeraterL   rM   r   )	word_dictr   word_unkrootr   widsuccr?   r   cidprevr	   r   r%   make_lexical_tree  s"   
(r   r>   )r   r   rN   r   r   numpyr   rP   chainer.trainingr   r   r&   r   r   r;   rA   Iteratorr<   	Extensionrr   r   r	   r	   r	   r%   <module>   s    
(
r/