o
    㥵i                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZmZ d dl	Z	d dl
Z
d dlmZ d dlmZ d dlmZmZmZmZ d dlmZ G d	d
 d
ZdS )    N)sha256)Path)CallableLiteralTuple)logger)DAC)AUDIO_EXTENSIONSaudio_to_bytes
list_filesread_ref_text)ServeReferenceAudioc                   @   sV   e Zd ZdddZdeded defdd	Zd
ee	 ded defddZ
dd ZdS )ReferenceLoaderreturnNc                 C   s8   i | _ i | _|  |  t }d|v rd| _dS d| _dS )z
        Component of the TTSInferenceEngine class.
        Loads and manages the cache for the reference audio and text.
        ffmpeg	soundfileN)	ref_by_idref_by_hash
torchaudiolist_audio_backendsbackend)selfbackends r   a/home/ubuntu/.local/lib/python3.10/site-packages/fish_speech/inference_engine/reference_loader.py__init__   s   

zReferenceLoader.__init__id	use_cache)onoffc                    s   t d| }|jddd t|tddd}|dks| jvr9 fdd|D }d	d |D }||f j|< ||fS td
  j| \}}||fS )N
referencesT)parentsexist_okF)	recursivesortr   c                    s"   g | ]} j tt|d dqS )Treference_audioenable_reference_audio)encode_referencer
   str.0	ref_audior   r   r   
<listcomp>8   s    
z.ReferenceLoader.load_by_id.<locals>.<listcomp>c                 S   s   g | ]}t t|d qS )z.lab)r   r)   with_suffixr*   r   r   r   r.   @   s    Use same references)r   mkdirr   r	   r   r   info)r   r   r   
ref_folder
ref_audiosprompt_tokensprompt_textsr   r-   r   
load_by_id)   s"   

zReferenceLoader.load_by_idr    c                 C   s   dd |D }d}g g }}t |D ]B\}}|dks!|| | jvr?|| j|jdd ||j |d |jf| j|| < q| j||  \}	}
||	 ||
 d}q|r\td ||fS )	Nc                 S   s   g | ]	}t |j qS r   )r   audio	hexdigest)r+   refr   r   r   r.   T   s    z0ReferenceLoader.load_by_hash.<locals>.<listcomp>Fr   Tr%   r0   )	enumerater   appendr(   r8   textr   r2   )r   r    r   audio_hashes
cache_usedr5   r6   ir:   cached_tokencached_textr   r   r   load_by_hashM   s(   



zReferenceLoader.load_by_hashc                 C   s   t |dkst| s|}t|}tj|| jd\}}|jd dkr,t	j
|ddd}||kr<tjj||d}||}|  }|S )z;
        Load the audio data from a file or bytes.
           )r   r      T)dimkeepdim)	orig_freqnew_freq)lenr   existsioBytesIOr   loadr   shapetorchmean
transformsResamplesqueezenumpy)r   r&   sr
audio_datawaveformoriginal_sr	resamplerr8   r   r   r   
load_audiop   s   
zReferenceLoader.load_audio)r   N)__name__
__module____qualname__r   r)   r   r   r7   listr   rD   r\   r   r   r   r   r      s"    

$
#r   )rM   hashlibr   pathlibr   typingr   r   r   rQ   r   logurur   !fish_speech.models.dac.modded_dacr   fish_speech.utils.filer	   r
   r   r   fish_speech.utils.schemar   r   r   r   r   r   <module>   s    