o
    it                  	   @   s.  d dl mZ d dlmZ d dlmZmZ d dlZd dlZd dl	Z	d dl
m  mZ d dl	mZ d dl	mZ d dlZd dlmZmZ d dlmZ ed	d
ed	ded	ded	ded	dG dd dejZed	ded	ded	ded	ded	dG dd dejZdS )    )	dataclass)Dict)IterableOptionalN)Tensor)nn)load_audio_text_image_videoextract_fbank)tablesmodel_classeszQwen/Qwen-Audioz
Qwen-AudiozQwen/QwenAudio	QwenAudioQwenAudioWarpc                       s@   e Zd ZdZ fddZdd Z				d
defdd	Z  ZS )r   z
    Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models
    https://arxiv.org/abs/2311.07919
    Modified from https://github.com/QwenLM/Qwen-Audio
    c           	         s`   t    ddlm}m} ddlm} |dd}|j|ddd}|j|dd	}|| _	|| _
d S )
Nr   AutoModelForCausalLMAutoTokenizerGenerationConfig
model_pathr   cpuT)
device_maptrust_remote_coder   super__init__transformersr   r   transformers.generationr   getfrom_pretrainedmodel	tokenizer)	selfargskwargsr   r   r   model_or_pathr   r    	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/qwen_audio/model.pyr      s   

zQwenAudioWarp.__init__c                 C      d S Nr'   r!   r'   r'   r(   forward+      zQwenAudioWarp.forwardNkeyc                 K   s   | dddkrtdi }| dd}d|d  d| }	| j|	}
| j|	d	|
d
}|| jj}| jjdi |d|
i}| jj|	 d d|
d}g }|d |d}|
| ||fS )N
batch_size   !batch decoding is not implementedpromptzL<|startoftranscription|><|en|><|transcribe|><|en|><|notimestamps|><|wo_itn|>z<audio>r   z</audio>pt)return_tensors
audio_infor5   F)skip_special_tokensr5   r.   textr'   )r   NotImplementedErrorr    process_audiotor   devicegeneratedecoder   append)r!   data_indata_lengthsr.   r    frontendr#   	meta_datar2   queryr5   inputspredresponseresultsresult_ir'   r'   r(   	inference0   s$   	
zQwenAudioWarp.inferenceNNNN)	__name__
__module____qualname____doc__r   r,   listrJ   __classcell__r'   r'   r%   r(   r      s    zQwen/Qwen-Audio-ChatzQwen/QwenAudioChatzQwen-Audio-ChatQwenAudioChatQwenAudioChatWarpc                       s<   e Zd Z fddZdd Z				d	defddZ  ZS )
rS   c                    s|   t    ddlm}m} ddlm} |dd}|dd}|dd}|j|d	||d
d}	|j|d
d}
|	| _	|
| _
dS )z
        Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models
        https://arxiv.org/abs/2311.07919
        Modified from https://github.com/QwenLM/Qwen-Audio
        r   r   r   r   r   bf16Ffp16r   T)r   rT   rU   r   r   Nr   )r!   r"   r#   r   r   r   r$   rT   rU   r   r    r%   r'   r(   r   X   s   


zQwenAudioChatWarp.__init__c                 C   r)   r*   r'   r+   r'   r'   r(   r,   m   r-   zQwenAudioChatWarp.forwardNr.   c                 K   s   | dddkrtdi }| dd}| di }	|	 dd }
|d d ur5| jd	|d id
|ig}n|}| jj| j||
d\}}
|
|	d< g }|d |d}|| ||fS )Nr/   r0   r1   r2   zwhat does the person say?cachehistoryr   audior8   )rD   rW   r7   )r   r9   r    from_list_formatr   chatr?   )r!   r@   rA   r.   r    rB   r#   rC   r2   rV   rW   rD   rG   rH   rI   r'   r'   r(   rJ   r   s&   	

zQwenAudioChatWarp.inferencerK   )rL   rM   rN   r   r,   rP   rJ   rQ   r'   r'   r%   r(   rS   R   s    )dataclassesr   typingr   r   r   timenumpynptorchtorch.nn.functionalr   
functionalFr   whisperfunasr.utils.load_utilsr   r	   funasr.registerr
   registerModuler   rS   r'   r'   r'   r(   <module>   s0    





<



 