o
    i!                     @   s   d dl mZ d dlmZmZmZmZmZ d dlZd dl	m
Z d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ eej edkrjd dl!m"Z" nedddZ"G dd deZ#dS )    )contextmanager)DictListOptionalTupleUnionN)parse)check_argument_types)
AbsEncoder)AbsFrontend)AbsPreEncoder)
AbsSpecAug)HubertPretrainLoss)AbsNormalize)force_gatherable)AbsESPnetModel)ErrorCalculatorz1.6.0)autocastTc                 c   s    d V  d S )N )enabledr   r   O/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/hubert/espnet_model.pyr      s   
r   c                #       s  e Zd ZdZ										d0ded	eeed
f ee f de	e
 de	e de	e de	e dededededededededededef" fddZdejdejdejdejd eejeeejf ejf f
d!d"Zdejdejdejdejd eeejf f
d#d$Zdejdejd%ejd&ejd eejejf f
d'd(Zdejdejd eejejf fd)d*Zd+d, Zd-eeejf fd.d/Z  ZS )1HubertPretrainModelzHubert Pretrain model        F<space><blank>      ?
vocab_size
token_list.frontendspecaug	normalize
preencoderencoder	ignore_id
lsm_weightlength_normalized_loss
report_cer
report_wer	sym_space	sym_blankpred_masked_weightpred_nomask_weightloss_weightsc                    s   t  sJ t   |d | _|d | _|| _|| _| | _|| _	|| _
|| _|| _|| _t|||| _|| _|| _|| _|sB|rMt|||||| _d S d | _d S N   )r	   super__init__soseosr   r$   copyr   r   r    r!   r"   r#   r   criterion_attr+   r,   r-   r   error_calculator)selfr   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   	__class__r   r   r1   &   s2   







zHubertPretrainModel.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s   |  dksJ |j|jd |jd   kr&|jd   kr&|jd ks3n J |j|j|j|jf|jd }|ddd| f }| ||||}| |\}}	}
t| |	|
|	d}t|||f|j\}}}|||fS )zFrontend + Encoder + Calc loss

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
            text: (Batch, Length)
            text_lengths: (Batch,)
            kwargs: "utt_id" is among the input.
        r/   r   N)lossacc_mask
acc_unmaskacc)	dimshapemaxencode_calc_hubert_lossdictdetachr   device)r7   r:   r;   r<   r=   kwargs
batch_sizeencoder_outr?   r@   rA   statsweightr   r   r   forwardY   s0   



zHubertPretrainModel.forwardc                 K   s   |  ||\}}||dS )N)featsfeats_lengths)_extract_feats)r7   r:   r;   r<   r=   rK   rQ   rR   r   r   r   collect_feats   s   
z!HubertPretrainModel.collect_featsy_pady_pad_lengthc           
      C   s   t d- | ||\}}| jdur| jr| ||\}}| jdur*| ||\}}W d   n1 s4w   Y  | jdurF| ||\}}| ||||}t| jdr~| jj|d}| j	dksit
|dksiJ | jj|d}	| jdks~t
|	dks~J |S )zFrontend + Encoder. Note that this method is used by asr_inference.py

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
            y_pad: (Batch, Length, ...)
            y_pad_length: (Batch, )
        FNr#   Tr   )r   rS   r    trainingr!   r"   r#   hasattr
get_logitsr+   lenr,   )
r7   r:   r;   rU   rV   rQ   rR   rM   logp_m_listlogp_u_listr   r   r   rF      s"   


zHubertPretrainModel.encodec                 C   sb   |  dksJ |j|d d d | f }| jd ur(| ||\}}||fS ||}}||fS r.   )rC   rD   rE   r   )r7   r:   r;   rQ   rR   r   r   r   rS      s   

z"HubertPretrainModel._extract_featsc                 C   sz   |  dkrdS | dksJ |j|ddk}|ddk}||@ }|   |    }|  }||fS )Nr   r   r   r/   r   )numelrC   rD   argmaxargminlongsumitem)r7   logitsrE   minbothcorrcountr   r   r   compute_correct   s    z#HubertPretrainModel.compute_correctrM   c                 C   s   |  | jj|\}}}d\}}d\}}t 8 t|D ]\}	}
| |
\}}||7 }||7 }qt|D ]\}	}| |\}}||7 }||7 }q4W d    n1 sRw   Y  ||d  }||d  }|||fS )Nr]   g|=)r5   r#   torchno_grad	enumerateri   )r7   rM   loss_attr[   r\   corr_maskedcount_maskedcorr_unmaskcount_unmaskilogp_mcorr_mcount_mlogp_ucorr_ucount_u	acc_att_m	acc_att_ur   r   r   rG      s&   





z%HubertPretrainModel._calc_hubert_loss)
r   r   FFFr   r   r   r   r   )__name__
__module____qualname____doc__intr   r   strr   r   r   r   r   r   r
   floatboolr1   rj   Tensorr   rP   rT   rF   rS   ri   rG   __classcell__r   r   r8   r   r   #   s    	
3
1

-
r   )T)$
contextlibr   typingr   r   r   r   r   rj   packaging.versionr   V	typeguardr	   espnet2.asr.encoder.abs_encoderr
   !espnet2.asr.frontend.abs_frontendr   %espnet2.asr.preencoder.abs_preencoderr   espnet2.asr.specaug.abs_specaugr   espnet2.hubert.hubert_lossr   espnet2.layers.abs_normalizer    espnet2.torch_utils.device_funcsr   espnet2.train.abs_espnet_modelr   espnet.nets.e2e_asr_commonr   __version__torch.cuda.ampr   r   r   r   r   r   <module>   s&   