o
    ih                     @   s  d dl Z d dlmZmZmZmZmZ d dlZd dlZd dl	Z
d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dl m!Z! e!"ddG dd dej#Z$e!"ddG dd dej#Z%dS )    N)UnionDictListTupleOptional)autocast)LabelSmoothingLoss)CTC)add_sos_eos)th_accuracy)force_gatherable)load_audio_text_image_videoextract_fbank)postprocess_utils)DatadirWriter)tablesmodel_classesOpenAIWhisperModelc                3       s  e Zd ZdZ																					
					
d?dedededededededededededededededededed ed!ed"ed#ed$ed%ed&ef2 fd'd(Z	d)e
jd*e
jd+e
jd,e
jd-ee
jeee
jf e
jf f
d.d/Zd)e
jd*e
jd-ee
je
jf fd0d1Zd2e
jd3e
jd4e
jd5e
jfd6d7Zd2e
jd3e
jd4e
jd5e
jfd8d9Zd:d; Z				d@d<efd=d>Z  ZS )Ar   z*CTC-attention hybrid Encoder-Decoder modelN      ?        P   r         FT<space><blank>specaugspecaug_conf	normalizenormalize_confencoderencoder_confdecoderdecoder_confctcctc_conf
ctc_weightinterctc_weight
input_size
vocab_size	ignore_idblank_idsoseos
lsm_weightlength_normalized_loss
report_cer
report_wer	sym_space	sym_blankshare_embeddingc                     s  t    |d urtj|}|d	i |}|d ur'tj|}|d	i |}tj|}|d	d|i|}| }|d urHtj|}||}|dkr\|
d u rRi }
t	d	||d|
}	|| _
|d ure|n|d | _|d urp|n|d | _|| _|| _|| _|| _|| _|| _t| jdsd| j_| jjrtj|| j | j_|| _|dkrd | _n|| _t||||d| _d | _|dkrd | _n|	| _|| _| jrd | j_ || _!d | _"d S )
Nr(   r   )odimencoder_output_sizer   interctc_use_conditioningF      ?sizepadding_idx	smoothingnormalize_length )#super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizedecoder_classesr	   r+   r,   r-   r)   r*   r&   r   r   r    hasattrr7   torchnnLinearconditioning_layerr'   r"   r   criterion_atterror_calculatorr$   r4   embedr/   beam_search) selfr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   kwargsspecaug_classnormalize_classencoder_classr6   decoder_class	__class__r>   S/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/whisper_lid/model.pyr@      sf   
!

zOpenAIWhisperModel.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s,  t | dkr|dddf }t | dkr |dddf }|jd }| ||\}}d}	t|tr<|d }	|d }d\}
}}}d\}}t }| jdkrh| ||||\}}|dur`|	 nd|d< ||d< d}| j
dkr|	dur|	D ](\}}| ||||\}}|| }|dur|	 nd|d	|< ||d
|< qu|t |	 }d| j
 | | j
|  }| ||||\}
}}}| jdkr|
}n| jdkr|}n| j| d| j |
  }|
dur|
	 nd|d< ||d< ||d< ||d< t|	 |d< | jrt|d  }t|||f|j\}}}|||fS )zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r   Nr   NNNNNNr   loss_ctccer_ctczloss_interctc_layer{}zcer_interctc_layer{}r8   loss_attacccerwerloss)lenr:   shapeencode
isinstancetupledictr&   _calc_ctc_lossdetachr'   format_calc_att_lossrH   cloner/   intsumr   device)rP   rY   rZ   r[   r\   rQ   
batch_sizeencoder_outencoder_out_lensintermediate_outsrb   acc_attcer_attwer_attr`   ra   statsloss_interctc	layer_idxintermediate_outloss_iccer_icrf   weightr>   r>   rX   forward   s`   





zOpenAIWhisperModel.forwardc                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | jjrB| j||| jd\}}}n	| ||\}}}d}t|trZ|d }|d }|durd||f|fS ||fS )zFrontend + Encoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr$   r   r   )	r   r   trainingr   r    r7   r$   rj   rk   )rP   rY   rZ   rQ   rv   rw   _rx   r>   r>   rX   ri      s"   


zOpenAIWhisperModel.encoderv   rw   ys_padys_pad_lensc                 C   s   t || j| j| j\}}|d }| ||||\}}	| ||}
t|d| j|| jd}| j	s5| j
d u r:d\}}n|jdd}| 
| | \}}|
|||fS )Nr   r   )ignore_labelr_   )dim)r
   r,   r-   r*   r"   rL   r   viewr)   r   rM   argmaxcpu)rP   rv   rw   r   r   	ys_in_pad
ys_out_pad
ys_in_lensdecoder_outr   rb   ry   rz   r{   ys_hatr>   r>   rX   rp     s   
z!OpenAIWhisperModel._calc_att_lossc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r$   r   rM   r   datar   )rP   rv   rw   r   r   r`   ra   r   r>   r>   rX   rm   "  s   z!OpenAIWhisperModel._calc_ctc_lossc              
   K   s   ddl m} ddlm} ddlm} i }| jd kr'|| j| jd}|j|d |	d}|j| j
|t|d d }||d	< td
|	dd |	dd|	dd|	dd|	ddd}	||	dd|	|| j| jt||| jd
krtd ndd}
|
| _d S )Nr   )
BeamSearch)CTCPrefixScorer)LengthBonus)r$   r-   r   
token_list)r"   length_bonusngramr8   decoding_ctc_weightr   	lm_weightr   ngram_weightpenalty)r"   r$   lmr   r   	beam_size
   full)r   weightsscorersr,   r-   r)   r   pre_beam_score_key) funasr.models.transformer.searchr   %funasr.models.transformer.scorers.ctcr   .funasr.models.transformer.scorers.length_bonusr   r$   r-   updaterB   r"   rg   rl   r,   r&   rO   )rP   rQ   r   r   r   r   r$   r   r   r   rO   r>   r>   rX   init_beam_search3  s@   








z#OpenAIWhisperModel.init_beam_searchkeyc                     s  | dddkrtd jd u r%td  jd$i | | dd _i }t|tj	rV| dddkrV||}}	t
|jd	k rL|d d d d d f }|	d u rU|jd }	nKt }
t||j| d
d| dd|d}t }||
 d|d< t|| dd|d\}}	t }|| d|d< |	  |j |j d |d< |j|d d}|	j|d d}	 ||	\}}t|tr|d } j|d | dd| ddd}|d  j }g }| \}}}t|D ]}t|D ]\}}d }| dd urt ds
t| d _ j|d  d }d}t|jtr%|jd| }n	|jd|   }tt! fdd|}|"|}|#|}t$%|\}}|| ||d }|&| |d urod!'||d" || < ||d# || < qq||fS )%Nru   r   !batch decoding is not implementedzenable beam_searchnbest	data_typesoundfbank   fs>  r   audio_fsr   	tokenizer0.3f	load_datar   frontendextract_feat  batch_data_timert   rt   r   maxlenratior   minlenratio)xr   r   
output_dirwriter
best_recogr   c                    s   |  j ko|  jko|  jkS )N)r-   r,   r+   )r   rP   r>   rX   <lambda>  s    z.OpenAIWhisperModel.inference.<locals>.<lambda>)r   tokenr[    r   r[   r>   )(rB   NotImplementedErrorrO   logginginfor   r   rj   rH   Tensorrg   rh   timeperf_counterr   r   r   rs   itemframe_shiftlfr_ntori   rk   r:   range	enumeraterG   r   r   yseqlisttolistfilter
ids2tokenstokens2textr   sentence_postprocessappendjoin) rP   data_indata_lengthsr   r   r   rQ   	meta_datarY   rZ   time1audio_sample_listtime2time3rv   rw   
nbest_hypsresultsbndi	nbest_idxhypibest_writerlast_pos	token_intr   r[   text_postprocessedr   result_ir>   r   rX   	inference`  s   















!zOpenAIWhisperModel.inference)NNNNNNNNNNr   r   r   r   r   r   r   r   r   FTTr   r   Fr^   )__name__
__module____qualname____doc__strrl   floatrr   boolr@   rH   r   r   r   r   ri   rp   rm   r   r   r   __classcell__r>   r>   rV   rX   r      s    	
g
]
'

0OpenAIWhisperLIDModelc                       s   e Zd ZdZ									ddededededed	ed
edededef fddZde	j
de	j
de	j
de	j
fddZde	j
de	j
dee	j
e	j
f fddZ				ddefddZ  ZS )r   z*WhisperEncoder and EResNet based LID ModelNFr)   r   r   r    r!   lid_predictorlid_predictor_confproj_dimclip_framesrandom_clipc                    s   t    |d urtj|}|di |}tj|}|di |}tj|}|di |}| |krAtj	
| || _nd | _tj	
| || _t|dddd| _|| _|| _|| _|	| _|
| _d | _d | _t| jdsxd| j_d S d S )Nr   r   Fr9   r7   r>   )r?   r@   r   rA   rB   rD   lid_predictor_classesrE   rH   rI   rJ   
proj_layeroutput_layerr   criterion_lidr   r    r   r   r   r   rO   rG   r7   )rP   r)   r   r   r    r!   r   r   r   r   r   rQ   rR   rT   lid_predictor_classrV   r>   rX   r@     s8   
zOpenAIWhisperLIDModel.__init__rY   rZ   lidlid_lengthsc                 C   s  |j d dks	J |j d }| ||\}}| jd u rGt|| |j d |j|j}t	|D ]\}	}
||	d |
f ||	d |
f< q3nt|| j|j d |j|j}| j
rt	|D ]C\}	}
|
| jkr}||	d |
f ||	d |
f< |
||	< qa|
 | j }tjd|d }||	||| j f ||	d | jf< | j||	< qan%t	|D ] \}	}
|
| jkr| jn|
}
||	d |
f ||	d |
f< |
||	< q| jd ur| |}| ||}| |}| |d d d d d f |}t ' t|d\}}||d d df k  }|d |j d  }W d    n	1 sw   Y  t }||d< t| |d< ||d< | |d< t|||f|j\}}}|||fS )	Nr   r   r   r8   ru   rf   rc   token_length)rh   ri   r   rH   zerosmaxr   dtypert   r   r   r   nprandomrandintr  r   r  r  no_gradrs   rl   rq   rn   r   )rP   rY   rZ   r  r  ru   rv   rw   reduced_encoder_outr   
enc_lengthmax_start_indexstart_index
lid_output
lid_logitsrf   r   predicted_lidcorrectlid_accr|   r   r>   r>   rX   r     s`   









zOpenAIWhisperLIDModel.forwardr]   c                 C   s   t d= | jdur-| jr-|ddd}t||jd  }| ||\}}|ddd}| jdur:| ||\}}W d   n1 sDw   Y  | jj	rZ| j||| j
d\}}}n	| ||\}}}d}t|trr|d }|d }|dur|||f|fS ||fS )zFrontend + Encoder. Note that this method is used by asr_inference.py
        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
        FNr   r   r   r   )r   r   r   permuterH   	ones_likerh   r   r    r7   r$   rj   rk   )rP   rY   rZ   padded_speech_lengthsrv   rw   r   rx   r>   r>   rX   ri   5  s(   


zOpenAIWhisperLIDModel.encoder   c                 K   sj  | dddkrtdi }t|tjr=| dddkr=||}}	t|jdk r3|d d d d d f }|	d u r<|jd }	nKt }
t	||j
| dd	| dd|d
}t }||
 d|d< t|| dd|d\}}	t }|| d|d< |	  |j |j d |d< |j|d d}|	j|d d}	| ||	\}}| dd }| jd ur/|d u rt|jd | j|jd |j|j}t|D ] \}}|| jkr| jn|}||d |f ||d |f< |||< qnr|dksJ dt|jd ||jd |j|j}t|D ] \}}||kr|n|}||d |f ||d |f< |||< qn/t|jd | |jd |j|j}t|D ]\}}||d |f ||d |f< qJ| jd uri| |}| ||}| |}t|d\}}||d  gd }| dd urt| dst| d| _| jd }|||d < |d |dg}||fS )Nru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rt   r   inference_clip_lengthr   r   z+inference_clip_length must be larger than 0r   r   r  )r   r  ) rB   r   rj   rH   r   rg   rh   r   r   r   r   r   rs   r   r   r   r   ri   r   r	  r  rt   r   r
  r  r   r  r   r   rG   r   r   )rP   r   r   r   r   r   rQ   r   rY   rZ   r   r   r   r   encenc_out_lensr  reduced_encr   r  r  r  r   predicted_lid_indexr  
lid_writerr   r>   r>   rX   r   \  s   







 


zOpenAIWhisperLIDModel.inference)	NNNNNNNNFr^   )r   r   r   r   rr   r   rl   r   r@   rH   r   r   r   ri   r   r   r   r>   r>   rV   rX   r     sn    	
,
<
*)&r   typingr   r   r   r   r   r   rH   numpyr  torch.nnrI   torch.cuda.ampr   "funasr.losses.label_smoothing_lossr   funasr.models.ctc.ctcr	   +funasr.models.transformer.utils.add_sos_eosr
   funasr.metrics.compute_accr   funasr.train_utils.device_funcsr   funasr.utils.load_utilsr   r   funasr.utilsr   funasr.utils.datadir_writerr   funasr.registerr   registerModuler   r   r>   r>   r>   rX   <module>   s.    
   
6