o
    㥵it                     @   s   d dl Z d dlZd dlmZ d dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZmZ d d
lmZ G dd de
eZdS )    N)	Generator)logger)ReferenceLoader)InferenceResultwav_chunk_header)	VQManager)DAC)GenerateRequestGenerateResponseWrappedGenerateResponse)autocast_exclude_mpsset_seed)ServeTTSRequestc                
       s   e Zd Zdejdedejdeddf
 fddZ	e
 d	edeeddf fd
dZd	edededejfddZdedejfddZ  ZS )TTSInferenceEnginellama_queuedecoder_model	precisioncompilereturnNc                    s&   t    || _|| _|| _|| _d S )N)super__init__r   r   r   r   )selfr   r   r   r   	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/fish_speech/inference_engine/__init__.pyr      s
   

zTTSInferenceEngine.__init__reqc                 c   s   |j }g g }}|dur| ||j\}}n|jr$| |j|j\}}|jdur7t|j td|j  | 	|||}t
| jdrJ| jjj}n| jj}|jratd|tt|dfddV  g }	 | }|jdkrtddt|jtry|jntd	dV  n,t|jtstd
|j}	|	jdkr| |	}
|jrtd||
fddV  ||
 nnqdtj rtj   t!"  t#|dkrtddt$ddV  dS tj%|dd}td||fddV  dS )z
        Main inference function:
        - Loads the reference audio and text.
        - Calls the LLAMA model for inference.
        - Decodes the VQ tokens to audio.
        Nz
set seed: spec_transformheader)sample_rate)codeaudioerrorTr"   zUnknown errorzGExpected GenerateResponse, got {type(wrapped_result.response).__name__}nextsegmentr   z0No audio generated, please check the input text.)axisfinal)&reference_id
load_by_iduse_memory_cache
referencesload_by_hashseedr   r   warningsend_Llama_requesthasattrr   r   r   	streamingr   nparrayr   getstatus
isinstanceresponse	Exceptionr
   	TypeErroractionget_audio_segmentappendtorchcudais_availableempty_cachegccollectlenRuntimeErrorconcatenate)r   r   ref_idprompt_tokensprompt_textsresponse_queuer   segmentswrapped_resultresultr$   r!   r   r   r   	inference'   s   	


	

	


$
zTTSInferenceEngine.inferencerF   rG   c                 C   sV   t | jj|j|j|j|j|j| j|j	dk|j	||d}t
 }| jt||d |S )zT
        Send a request to the LLAMA model to generate the symbolic tokens.
        r   )devicemax_new_tokenstexttop_prepetition_penaltytemperaturer   iterative_promptchunk_lengthrF   prompt_text)requestrH   )dictr   rM   rN   rO   rP   rQ   rR   r   rT   queueQueuer   putr	   )r   r   rF   rG   rV   rH   r   r   r   r.      s*   z%TTSInferenceEngine.send_Llama_requestrK   c                 C   sR   t | jjj| jd | j|jd}W d   n1 sw   Y  |  	 S )z0
        Decode the VQ tokens to audio.
        )device_typedtype)codesN)
r   r   rM   typer   decode_vq_tokensr]   floatcpunumpy)r   rK   r$   r   r   r   r:      s   z$TTSInferenceEngine.get_audio_segment)__name__
__module____qualname__rX   rY   r   r<   r\   boolr   inference_moder   r   r   rL   listr.   r
   r1   ndarrayr:   __classcell__r   r   r   r   r      s0    h
#r   )r@   rX   typingr   rb   r1   r<   logurur   -fish_speech.inference_engine.reference_loaderr   "fish_speech.inference_engine.utilsr   r   'fish_speech.inference_engine.vq_managerr   !fish_speech.models.dac.modded_dacr   *fish_speech.models.text2semantic.inferencer	   r
   r   fish_speech.utilsr   r   fish_speech.utils.schemar   r   r   r   r   r   <module>   s    