o
    it                     @   s*  d dl Z d dlmZmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlm
  mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' e'(ddG dd de
j)Z*e'(ddG dd de
j)Z+dS )    N)UnionDictListTupleOptional)autocast)sequence_mask)LabelSmoothingLoss)CTC)add_sos_eos)th_accuracycompute_accuracy)force_gatherable)load_audio_text_image_videoextract_fbank)postprocess_utils)mae_loss)DatadirWriter)tablesmodel_classes	LLMASRNARc                9       sV  e Zd ZdZ																								
					
d=dedededededededededededededededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ef8 fd*d+Z	d,e
jd-e
jd.e
jd/e
jd0e
jd1e
jd2e
jd3e
jd4e
jd5ee
jeee
jf e
jf fd6d7Zd,e
jd-e
jfd8d9Z				d>d:efd;d<Z  ZS )?r    N      ?P   r                 FT<space><blank>specaugspecaug_conf	normalizenormalize_confencoderencoder_confdecoderdecoder_confctcctc_conf
ctc_weightllmllm_confadaptoradaptor_conf
input_size
vocab_size	ignore_idblank_idsoseos
lsm_weightlength_normalized_loss
report_cer
report_wer	sym_space	sym_blankshare_embeddingc           -         s  t    |d urtj|}|di |}|d ur'tj|}|di |}|dd } | dkrLddlm}! |dd}"|!|"dd}#d |#j_	|#j| _
n| d	krQntj|}$|$dd
|i|}| }%|dd	} d | _| d	krddlm}&m}'m}( |dd}"|&j|"d d d d}#|dd})|)r|# D ]\}*}+d|+_q|#  |#| _tj|},|,di |}|| _|| _|d ur|n|d | _|d ur|n|d | _|| _|| _|| _|| _|| _t ||||d| _!d | _"|| _#d | _$d S )Nhubfunasrr   	AutoModelinit_param_pathFiic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorchmastermodelmodel_revisionhfr/   AutoModelForCausalLMAutoTokenizer
AutoConfigvicuna-7b-v1.5load_in_8bit
device_map	use_cachefreezeTFr   sizepadding_idx	smoothingnormalize_length )%super__init__r   specaug_classesgetnormalize_classesr=   r?   rD   r&   audio_encoderencoder_classesoutput_sizer+   transformersrH   rI   rJ   from_pretrainednamed_parametersrequires_gradevaladaptor_classesr-   r2   r3   r4   r0   r1   r    r"   r$   r	   criterion_atterror_calculatorr6   beam_search)-selfr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   kwargsspecaug_classnormalize_classr<   r?   r@   rD   encoder_classencoder_output_sizerH   rI   rJ   rP   nameparamadaptor_class	__class__rV   S/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/llm_asr_nar/model.pyrX      sv   
$

zLLMASRNAR.__init__speechspeech_lengthstexttext_lengths	input_idsattention_mask
labels_ids
label_mask
audio_maskreturnc
                 K   sN  t | dkr|dddf }t | dkr |dddf }|jd }| j|||	d\}}| |}|durd||dk< d||dk< t| jjdrS| jj|}nt| jjjdrd| jjj|}n	| jjjj|}|	dur|j\}}}|j\}}}t	j
|dd|| d dddfdd	}||	dddddf  |d
|	dddddf    }t	j
|ddddddf ddd	}| j|||d}|j}i }t * t|jd}t|ddddf |ddddf dd}||d< W d   n1 sw   Y  t| |d< | jrt|d  }t|||f|j\}}}|||fS )Encoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r   Nr   r|   r   embed_tokensr   value      ?r   r   r   r   r   r   inputs_embedsry   labelsignore_labelaccloss)lenrR   shapeencoder-   hasattrr+   rD   r   Fpadr   torchno_gradargmaxlogitsr   clonedetachr6   intsumr   device)rh   rt   ru   rv   rw   rx   ry   rz   r{   r|   ri   
batch_sizeencoder_outencoder_out_lensr   	token_numdims_lencoder_outs_padmodel_outputsr   statspredsacc_attweightrV   rV   rs   forward   sL   

$&
.

zLLMASRNAR.forwardc                 K   s   | dd }|d ur|dnd }| dd }|d u r&tjt|gtjd}||d}| jjd
i |\}}	td, t	|	|
d|jdd d d d d f }
| jj||
|d	\}}}}W d    ||fS 1 siw   Y  ||fS )Nr|   r   text_token_intdtypert   ru   Fr   r   masktarget_label_lengthrV   )rZ   r   r   tensorr   int64r\   r   r   r   rR   r   	predictor)rh   rt   ru   ri   r|   audio_token_lengthsr   batchencenc_lensenc_maskpre_acoustic_embedspre_token_lengthr   rV   rV   rs   r      s$   

(
zLLMASRNAR.encodekeyc                 K   s0  | dd}| dddkrtdi }t|tjrC| dddkrC||}	}
t|	jd	k r9|	d d d d d f }	|
d u rB|	jd }
nit }t	||j
| d
d| ddd d}t| dg dkru|\}}|d dd}||}nd }t }|| d|d< t|| dd|d\}	}
t }|| d|d< |
  |j |j d |d< |	j|d d}	|
j|d d}
| j|	|
|d\}}| |}d|}||}t|}tj|tjd|d }t| jjdr| jj|}nt| jjjdr| jjj|}n	| jjjj|}tj|d d d d d f |fdd}tj| d d tjd|d }| j||d d}t|j d}|j!|dd d!}|d "d"d }|# }d }| d#d uryt| d$sqt$| d#| _%| j%d d% }g }|d |d&}|&| |d ur||d' |d < ||fS )(NpromptTranscribe speech to text.r   r   !batch decoding is not implemented	data_typesoundfbank   fs>  r   audio_fsr   	tokenizerr   r    0.3f	load_datar   frontendextract_feat  batch_data_timer   r   r   USER: 
INSTRUCTION: {}
INPUT: r   r   dimr   r   FTadd_special_tokensskip_special_tokensz: 
output_dirwriter
best_recogr   rv   rv   )'rZ   NotImplementedError
isinstancer   Tensorr   r   timeperf_counterr   r   replacer   r   r   itemframe_shiftlfr_ntor-   formatr   r   r   r+   rD   r   catonesrR   longr   r   batch_decodesplitstripr   r   append)rh   data_indata_lengthsr   r   r   ri   r   	meta_datart   ru   time1audio_sample_listtext_token_int_listr   time2time3r   r   
prompt_pre
prompt_idsprompt_lengthr   ry   r   r   rv   ibest_writerresultsresult_irV   rV   rs   	inference   s   












zLLMASRNAR.inference)NNNNNNNNNNr   NNNNr   r   r   r   r   r   r   FTTr   r   FNNNN)__name__
__module____qualname____doc__strdictfloatr   boolrX   r   r   r   r   r   r   listr   __classcell__rV   rV   rq   rs   r      s    	
x	

G
LLMASRNARPromptc                ;       s~  e Zd ZdZ																									
						dDdedededededededededededededededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*ef: fd+d,Z	d-e
jd.e
jd/e
jd0e
jd1e
jd2e
jd3e
jd4e
jd5e
jd6ee
jeee
jf e
jf fd7d8Zd-e
jd.e
jfd9d:Zd;e
jd<e
jd=e
jd>e
jfd?d@Z				dEdAefdBdCZ  ZS )Fr  r   Nr   r   r   r   r   r   Fr   Tr   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   predictor_weightr7   r8   r9   r:   r;   c           .         s0  t    |d urtj|}|di |}|d ur'tj|} | di |}|dd }!|!dkrOddlm}" |dd}#|"|#dd}$d |$j_	|$j| _
|| _n|!d	krTntj|}%|%dd
|i|}| }&|dd	}!d | _|!d	krddlm}'m}(m}) |dd}#|'j|#d d d d}$|dd}*|*r|$ D ]\}+},d|,_q|$  |$| _tj|}-|-di |}|| _|| _|d ur|n|d | _|d ur|n|d | _|| _|| _|| _|| _|| _ t!||||d| _"t#|d| _$d | _%|| _&d | _'|dkr|
d u ri }
t(d||d d|
}	|| _)|	| _*d S )Nr<   r=   r   r>   r@   rA   rB   rC   rF   r/   rG   rK   rL   rP   TFr   rQ   )rU   r   encoder_dim)odimrm   rV   )+rW   rX   r   rY   rZ   r[   r=   r?   rD   r&   r\   r	  r]   r^   r+   r_   rH   rI   rJ   r`   ra   rb   rc   rd   r-   r2   r3   r4   r0   r1   r    r"   r$   r	   re   r   criterion_prerf   r6   rg   r
   r*   r(   ).rh   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r	  r7   r8   r9   r:   r;   ri   rj   rk   r<   r?   r@   rD   rl   rm   rH   rI   rJ   rP   rn   ro   rp   rq   rV   rs   rX   v  s   
%


zLLMASRNARPrompt.__init__rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   c
           "      K   s2  t | dkr|dddf }t | dkr |dddf }|jd }i }| j|||	d}|d |d }}|d |d |d }}}| jdkrd| ||||\}}|dur`t| nd|d	< | 	|}|durd||d
k< d||dk< t
| jjdr| jj|}nt
| jjjdr| jjj|}n	| jjjj|}|	dur|
dd}|dusJ |d  }|j\}}}|j\}}}tj|dd||| | ddfdd}||	dddddf  |d|	dddddf    }tj|ddddddf ddd}| j|||d}|j}t| |d< | jdkr(| j| | }||| j  }t * t|jd
}t|dddd
f |ddddf dd} | |d< W d   n	1 saw   Y  t| |d< t| |d< ||d< | jrt|d  }t|||f|j\}}}!|||!fS )r~   r   Nr   r   r   r      r   loss_ctcr   r   r   prompt_bos_lengthr   r   r   r   loss_llmr   r   loss_prer   r   )r   rR   r   r   r*   _calc_ctc_lossr   r   r   r-   r   r+   rD   r   rZ   r   r   r   r   r	  r   r   r   r   r6   r   r   r   r   )"rh   rt   ru   rv   rw   rx   ry   rz   r{   r|   ri   r   r   outsr   r   r   r   r  r  cer_ctcr   r  r   r   r   r   r   r   r  r   r   r   r   rV   rV   rs   r     st   





.

zLLMASRNARPrompt.forwardc                 K   s
  | dd }|d ur|dnd }| dd }|d u r*|d ur*tjt|gtjd}||d}| jjdi |\}}	td8 t	|	|
d|jdd d d d d f }
| jj||
|d	\}}}}d
}|d uro| |||}W d    n1 syw   Y  ||	|||fS )Nr|   r   r   r   r   Fr   r   r   r   rV   )rZ   r   r   r   r   r   r\   r   r   r   rR   r   r   r  type_as)rh   rt   ru   ri   r|   r   r   r   r   r   r   r   r   r   r  rV   rV   rs   r   _  s,   

(
zLLMASRNARPrompt.encoder   r   ys_padys_pad_lensc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r(   trainingrf   r   datacpu)rh   r   r   r  r  r  r  ys_hatrV   rV   rs   r  }  s   zLLMASRNARPrompt._calc_ctc_lossr   c                 K   s  | dd}| dddkrtdi }t|tjrC| dddkrC||}	}
t|	jd	k r9|	d d d d d f }	|
d u rB|	jd }
nrt }t	||j
| d
d| ddd d}t| dg dkr~|\}}|d }||}|d |jkr}|dd  }nd }t }|| d|d< t|| dd|d\}	}
t }|| d|d< |
  |j |j d |d< |	j|d d}	|
j|d d}
| j|	|
|d}|d }| |}d|}||}|d |jkr|dd  }t|}tj|tjd|d }tj|jgtjd|d }t| jjdr$| jj|}| jj|}nt| jjjdr6| jjj|}n	| jjjj|}tj|d d d d d f |fdd}tj| d d tjd|d }| j||d d}t |j!d}|j"|ddd}|d #d d }|$ }|%d!r|&d!d"}|$ }d }| d#d urt| d$st'| d#| _(| j(d d% }g }|d |d&}|)| |d ur||d' |d < ||fS )(Nr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   FTr   :zPlease
 r   r   r   r   r   rv   )*rZ   r   r   r   r   r   r   r   r   r   r   r   bos_token_idr   r   r   r   r   r   r-   r   r   r   pad_token_idr   r+   rD   r   r   r   rR   r   r   r   r   r   r   
startswithr   r   r   r   )rh   r   r   r   r   r   ri   r   r   rt   ru   r   r   r   r   r   r   resr   r   r   r   r   r   ry   r   r   rv   r   r   r   rV   rV   rs   r     s   












zLLMASRNARPrompt.inference)NNNNNNNNNNr   NNNNr   r   r   r   r   r   r   Fr   TTr   r   Fr   )r   r   r   r  r  r  r  r   r  rX   r   r   r   r   r   r   r  r  r   r  rV   rV   rq   rs   r  r  s
   	
 	

g

),loggingtypingr   r   r   r   r   r   r   torch.nnnntorch.nn.functional
functionalr   torch.cuda.ampr   funasr.models.scama.utilsr   "funasr.losses.label_smoothing_lossr	   funasr.models.ctc.ctcr
   +funasr.models.transformer.utils.add_sos_eosr   funasr.metrics.compute_accr   r   funasr.train_utils.device_funcsr   funasr.utils.load_utilsr   r   funasr.utilsr   &funasr.models.paraformer.cif_predictorr   funasr.utils.datadir_writerr   funasr.registerr   registerModuler   r  rV   rV   rV   rs   <module>   s0    
  
Z