o
    xi                     @  s^  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlZd dlZd dlmZ d dlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" dddZ#dddZ$dddZ%dddZ&dddZ'dd!d"Z(dd$d%Z)dd(d)Z*dd+d,Z+dd1d2Z,	3	4	5	6ddd;d<Z-ed=d>G d?d@ d@Z.eG dAdB dBZ/eG dCdD dDZ0ddIdJZ1ddMdNZ2ddUdVZ3ddZd[Z4d\Z5d]d^hZ6dd`daZ7dbdcddidjZ8ddkdlZ9ddodpZ:ddqdrZ;ddsdtZ<G dudv dvZ=e> Z?da@dweAdx< daBdyeAdz< dd}d~ZCdddZDdddZEdddZFdS )    )annotationsN)Callable)	dataclass)Path)	safe_open)	load_file   )DACVAECodecpatchify_latentunpatchify_latent)ModelConfig)TextToLatentRFDiT)sample_euler_rf_cfg)normalize_text)PretrainedTextTokenizerreturnboolc                  C  s2   t tdd } | d u st| dsdS ttjj S )NbackendsmpsF)getattrtorchhasattrr   r   r   is_available)r    r   9/home/ubuntu/Irodori-TTS/irodori_tts/inference_runtime.py_is_mps_available   s   r   devicestr | torch.devicetorch.devicec                 C  s|   t | }|jdkr|S |jdkrt j std|S |jdkr6|jd ur*tdt s1tdt dS td|d)	Ncpucudaz=CUDA device requested but torch.cuda.is_available() is False.r   z-MPS device index is not supported. Use 'mps'.zDMPS device requested but torch.backends.mps.is_available() is False.zUnsupported inference device=z". Expected one of: cpu, cuda, mps.)r   r   typer    r   
ValueErrorindexr   r   resolvedr   r   r   resolve_runtime_device!   s   






r&   	list[str]c                  C  s6   g } t j r| d t r| d | d | S )Nr    r   r   )r   r    r   appendr   )devicesr   r   r   list_available_runtime_devices2   s   



r*   strc                   C  s
   t  d S )Nr   )r*   r   r   r   r   default_runtime_device<   s   
r,   c                 C  s    t | }|jdkrddgS dgS )Nr    fp32bf16)r&   r!   r$   r   r   r   !list_available_runtime_precisions@   s   
r/   Nonec                 C  sZ   | j dkrtj|  d S | j dkr'ttdd }|d ur)t|dr+|  d S d S d S d S )Nr    r   synchronize)r!   r   r    r1   r   r   )r   r   r   r   r   _sync_deviceG   s   

r2   r)   c                  G  s<   t  }| D ]}|j|jf}||v rqt| || qd S N)setr!   r#   r2   add)r)   seenr   keyr   r   r   _sync_devicesP   s   r8   extra_devicesfloatc                 G  s   t | g|R   t S r3   r8   timeperf_counter)r   r9   r   r   r   _measure_startZ   s   r>   t0c                 G  s   t | g|R   t | S r3   r;   )r   r?   r9   r   r   r   _measure_end_   s   r@   latenttorch.Tensor
latent_dimintc                 C  s   | j dkr| jd dkr| d } | j dkrtdt| j | jd |kr(| S | jd |kr7| dd S tdt| j d| )N   r   r      zUnsupported latent shape: z(Could not infer latent layout for shape=z and latent_dim=)ndimshaper"   tuple	transpose
contiguous)rA   rC   r   r   r   _coerce_latent_shaped   s   
rL              皙?皙?target_valuewindow_sizestd_thresholdmean_thresholdc                 C  s   | j dkrtdt| j t| jd }|dks|dkr |S tj|| jd f| j| jd}tj	| |gdd}t
|jd | D ]'}||||  }	|	jdd}
|	 }|
|k rht|| |k rht|  S qA|S )	z
    Echo-style heuristic: find first index where a trailing window becomes near-flat and near-zero.

    Args:
      latent: (T, D) latent sequence.
    Returns:
      Flattening index in [0, T].
    rF   z"Expected latent shape (T, D), got r   r   r   dtype)dimF)unbiased)rG   r"   rI   rH   rD   r   zerosr   rV   catrangestdmeanabs)rA   rQ   rR   rS   rT   total_stepspadpaddediwindow
window_stdwindow_meanr   r   r   find_flattening_pointr   s&   
rf   T)frozenc                   @  sr   e Zd ZU ded< ded< dZded< dZded< dZded	< dZded
< dZded< dZ	ded< dZ
ded< dS )
RuntimeKeyr+   
checkpointmodel_devicezfacebook/dacvae-watermarked
codec_repor-   model_precisionr   codec_devicecodec_precisionFr   enable_watermarkcompile_modelcompile_dynamicN)__name__
__module____qualname____annotations__rk   rl   rm   rn   ro   rp   rq   r   r   r   r   rh      s   
 rh   c                   @  sr  e Zd ZU ded< dZded< dZded< dZded	< dZd
ed< dZded< dZ	ded< dZ
ded< dZded< dZd
ed< dZded< dZded< dZded< dZded< dZded< dZd
ed < d!Zded"< d#Zded$< dZd
ed%< dZd
ed&< dZd
ed'< d(Zded)< dZd
ed*< dZd
ed+< dZded,< dZded-< d(Zded.< d/Zded0< d1Zded2< d3Z ded4< dS )5SamplingRequestr+   textN
str | Noneref_wav
ref_latentFr   no_reffloat | Noneref_normalize_dbref_ensure_maxr   rD   num_candidates
sequentialdecode_modeg      >@r:   secondsmax_ref_secondsz
int | Nonemax_text_len(   	num_stepsg      @cfg_scale_textg      @cfg_scale_speakerindependentcfg_guidance_mode	cfg_scaleg      ?	cfg_min_t      ?	cfg_max_ttruncation_factor	rescale_krescale_sigmaTcontext_kv_cachespeaker_kv_scalespeaker_kv_min_tspeaker_kv_max_layersseed	trim_tailrN   tail_window_sizerO   tail_std_thresholdrP   tail_mean_threshold)!rr   rs   rt   ru   ry   rz   r{   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rv      s>   
 rv   c                   @  sF   e Zd ZU ded< ded< ded< ded< d	ed
< ded< ded< dS )SamplingResultrB   audiozlist[torch.Tensor]audiosrD   sample_ratezlist[tuple[str, float]]stage_timingsr:   total_to_decode	used_seedr'   messagesN)rr   rs   rt   ru   r   r   r   r   r      s   
 r   modelr   enableddynamicc                C  sl   |s| S t tdstddt|i}tj| jfi || _tj| jfi || _tj| jfi || _| S )Ncompilez7compile_model=True requires torch.compile (PyTorch 2+).r   )r   r   RuntimeErrorr   r   encode_conditionsbuild_context_kv_cacheforward_with_encoded_conditions)r   r   r   compile_kwargsr   r   r   _maybe_compile_inference_model   s   
r   	precisiontorch.dtypec                 C  sN   t |   }|dkrtjS |dkr|jdkrtdtjS td| d)Nr-   r.   r    z0precision='bf16' currently requires CUDA device.zUnsupported precision=z. Expected one of: fp32, bf16.)r+   striplowerr   float32r!   r"   bfloat16)r   r   moder   r   r   resolve_runtime_dtype   s   
r   r   r   r   r   r|   tuple[float, float, list[str]]c                 C  sv   g }t |}t |}|durt |}t |}t|   }|dkr6|dkr6|dkr6t|| dkr6td|||fS )z0Normalize/validate CFG scales for guidance mode.NjointrM   gư>z\cfg_guidance_mode='joint' requires equal cfg_scale_text/cfg_scale_speaker, or set cfg_scale.)r:   r+   r   r   r^   r"   )r   r   r   r   r   text_valspeaker_valr   r   r   r   resolve_cfg_scales   s   
r   pathr   dictc                 C  s0   t j| ddd}t|tstdt||S )Nr   Tmap_locationweights_onlyz%Unsupported checkpoint payload type: )r   load
isinstancer   r"   r!   )r   payloadr   r   r   _load_torch_checkpoint_payload  s   
r   config_jsonr   fixed_target_latent_steps1tuple[dict[str, torch.Tensor], dict, dict | None]c                 C  s   t | }|d}|d}|d}t|tstd|  t|ts+td|  |d ur;t|ts;td|  ||t|fS )Nr   model_configtrain_configz-Checkpoint missing model weights dictionary: z,Checkpoint missing model_config dictionary: z;Checkpoint train_config must be a dictionary when present: )r   getr   r   r"   _extract_inference_train_config)r   ckptmodel_state	model_cfg	train_cfgr   r   r   _load_checkpoint_from_pt  s   




r   F)requiredrawrx   fieldr   dict | Nonec             
   C  s   | d u r|rt d| d| d S zt| }W n tjy1 } zt d| d| |d }~ww t|tsAt d| d| |S )Nz!Missing required metadata field 'z' in checkpoint: zInvalid JSON in 'z' metadata for checkpoint: zMetadata field 'z' must decode to an object: )r"   jsonloadsJSONDecodeErrorr   r   )r   r   r   r   r   excr   r   r   _parse_json_mapping)  s   
r   c                 C  sf   | d u rd S i }t D ]$}| |}|d u rq
t|ts(td| dt|dt|||< q
|p2d S )NInference config key 'z' must be int, got .)_INFERENCE_CONFIG_KEYSr   r   rD   r"   r!   )r   inference_cfgr7   valuer   r   r   r   =  s   

r   flat_configtuple[dict, dict | None]c                 C  sb   i }i }|  D ]"\}}|tv r&t|tstd| d|  t|||< q|||< q||p/d fS )Nr   z&' must be int in checkpoint metadata: )itemsr   r   rD   r"   )r   r   r   r   r7   r   r   r   r   _split_flat_checkpoint_configM  s   

r   c                 C  s   t t| dd}t|tr|std|  tt| ddd}| p%i }W d    n1 s0w   Y  t|t	t	| dd}t
| |d\}}|||fS )	Nr   )r   z-Safetensors checkpoint has no model weights: pt)	frameworkr   T)r   r   r   )r   r   )load_safetensors_filer+   r   r   r"   r   metadatar   r   _CONFIG_META_KEYr   )r   r   handler   r   r   r   r   r   r   !_load_checkpoint_from_safetensors\  s   
r   c                 C  s   | j  dkrt| S t| S )Nz.safetensors)suffixr   r   r   )r   r   r   r   _load_checkpoint_for_inferencep  s   r   c                   @  sH   e Zd Zd&ddZed'ddZd(ddZddd)d"d#Zd*d$d%ZdS )+InferenceRuntimer7   rh   r   r   r   r   r   r   	tokenizerr   codecr	   default_text_max_lenrD   r   r0   c                C  sP   || _ t|j| _t|j| _|| _|| _|| _|| _|| _|| _	t
 | _d S r3   )r7   r&   rj   rm   r   r   r   r   r   r   	threadingLock_infer_lock)selfr7   r   r   r   r   r   r   r   r   r   __init__w  s   zInferenceRuntime.__init__c              	   C  sx  t |j}t |j}t|j|d}t|j|d}tt|j\}}}t	di |}	t
|	|}
|
| |
j|d}
|
  t|
t|jt|jd}
tj|	jt|	jdd}|j|	jkrotd|	j d|	j d|j d	d
}t|tr|d}t|tr|dkrt|}tj|jt ||t|j!d}|	j"|j"krtd|	j" d|j" d| ||	t|tr|nd |
|||dS )N)r   r   rV   )r   r   F)repo_idadd_boslocal_files_onlyz5text_vocab_size mismatch: checkpoint text_vocab_size=z but tokenizer (z) vocab_size=r      r   r   )r   r   rV   ro   z1Latent dimension mismatch: checkpoint latent_dim=z but codec latent_dim=z). Use a compatible codec/checkpoint pair.)r7   r   r   r   r   r   r   r   )#r&   rj   rm   r   rl   rn   r   r   ri   r   r   toload_state_dictevalr   r   rp   rq   r   from_pretrainedtext_tokenizer_repotext_add_bos
vocab_sizetext_vocab_sizer"   r   r   r   rD   r	   r   rk   r+   ro   rC   )clsr7   rj   rm   model_dtypecodec_dtyper   model_cfg_dictr   r   r   r   r   ckpt_text_max_lenr   r   r   r   from_key  sx   






zInferenceRuntime.from_keyreqrv   
batch_sizer   r'   !tuple[torch.Tensor, torch.Tensor]c             
   C  s  t | j j}|jr6tdt| jj}t	j
||| jj| jj f| j|d}t	j
||ft	j| jd}||fS |jd u rD|jd u rDtdd }|jd urj|jdkrjtdtt|jt| jj tt| jjj }|jd urt	j|jddd}	t|	| jjd	d}
|
j|d
}
nt|j\}}|jd ur|jdkrtdtt|jt| }|jd |kr|d|j dt|jd t| ddt|t| dd |d d d |f }|j d ur|dt|j dd |j!r|d | jj"|dt||j t|j!d# }
|d ur0|
jd |kr0|d|
jd  d| d |
d d d |f }
t$|
| jj| j}|jd dkrGtd|dkrS|%|dd}t	j&||jd ft	j| jd}||fS )Nr   rU   )rV   r   z6Specify either ref_wav/ref_latent, or set no_ref=True.r   r   Tr   )rC   r   z2warning: reference audio exceeds max_ref_seconds (zs). Trimming from z.2fzs to zs.z6info: reference loudness normalize enabled (target_db=z).z>info: reference peak safety scaling enabled (ensure_max=True).)r   normalize_db
ensure_maxz!warning: reference latent steps (z ) exceed max_ref_seconds bound (z# steps). Trimming reference latent.zOReference latent length became zero after patchify. Use longer reference audio.)'nextr   
parametersrV   r{   maxrD   r   speaker_patch_sizer   rY   rC   latent_patch_sizerj   r   ry   rz   r"   r   mathceilr:   r   r   
hop_lengthr   rL   	unsqueezer   _load_audiorH   r(   r}   r~   encode_waveformr   r
   repeatones)r   r  r  r   runtime_dtyperef_lenref_latent_patchedref_maskmax_ref_latent_steps
latent_rawrz   wavsrmax_ref_samplesr   r   r   _load_reference_latent  s   	

	



z'InferenceRuntime._load_reference_latentNlog_fnr#  Callable[[str], None] | Noner   c          -        st  dU fdd}g }|d | jj| jj| jj| jj| jj|j|j	|j
|jd u r(dnt|j|j|j |j	d	kr@td
|j	 t|j}|d	krPtd| t|j  }|dvrftd|jdt|j}t| }|dkrytd|jd u r| jnt|j}	|	d	krtd|	 |jd u rd nt|j}
|jd u rd nt|j}|jd u rd nt|j}|
d ur|
d	krtd|
 |d u |d u krtd|d ur|d	krtd| |d ur|d	krtd| |jd u rd nt|j}d }|jd u rd nt|j}|d urL|d	krtd| |jd u r"dnt|j}d|  kr3dks;n td| |d urL|d	k rLtd| t|j  }|dvrctd|jdt||j|j |j!d\}}}|"| |D ]}|| qxg }|jd u rtt#$d }d!| d"}|%| || nt|j}|d#|  t&| j| j}| j'f t() P t&| j}| j*j+|g| |	d$\}}t,| j|}|%d%|f |d&|d' d(d) |-| j}|-| j}tt|j	| jj. }t/0|t| jj1j2 }t/0|| j3j4 }t5| j6t7rB| j68d*}t5|trB|d	krB||krBd+| d,| d-}|%| || t&| j| j}t9|} | j:|||d.\}!}"t,| j|| j}|%d/|f || d  D ]}|| ql|d0|d' d(d) t&| j}t;dVi d1| j1d2|d3|d4|!d5|"d6|d7t|j
d8|d9|d:|d;t|j<d<t|j=d=|d>|
d?|d@|dAt>|j?dB|dC|dD|}#t,| j|}|%dE|f |dF|d' d(d) t&| j}t@|#| j3j4| j3jAdG}$t,| j|}|%dH|f |dI|d' d(d) |$d d d |f }$t&| j| j}g }%|dJkr| jB|$C }&tD|D ]G}'|&|' }(|})t>|jErotF|$|' tGdKt|jHt|jIt|jJdL}*t|*t| jj1j2 }+|+d	krotK|)|+})|%%|(d d d |)f  q6nXtD|D ]S}'| jB|$|'|'dK  C d	 }(|})t>|jErtF|$|' tGdKt|jHt|jIt|jJdL}*t|*t| jj1j2 }+|+d	krtK|)|+})|%%|(d d d |)f  qt,| j|| j}|%dM|f |dN| dO|d' d(d) t,| j|| j},|dP|,dQdR W d    n	1 sw   Y  W d    n	1 s w   Y  |dS tL|%d	 |%t| jj.||,||dTS )WNmsgr+   r   r0   c                   s    d ur
 |  d S d S r3   r   )r%  r"  r   r   _log,  s   z)InferenceRuntime.synthesize.<locals>._logz[runtime] start synthesize model_device={} model_precision={} codec_device={} codec_precision={} watermark={} mode={} seconds={} steps={} seed={} candidates={} decode_mode={}randomr   zseconds must be > 0, got z num_candidates must be > 0, got >   batchr   zUnsupported decode_mode=z%. Expected one of: sequential, batch. z&text became empty after normalization.zmax_text_len must be > 0, got z#truncation_factor must be > 0, got z1rescale_k and rescale_sigma must be set together.zrescale_k must be > 0, got zrescale_sigma must be > 0, got z"speaker_kv_scale must be > 0, got g?rM   r   z(speaker_kv_min_t must be in [0, 1], got z7speaker_kv_max_layers must be >= 0 when specified, got >   r   alternatingr   zUnsupported cfg_guidance_mode=z3. Expected one of: independent, joint, alternating.)r   r   r   r   ?   z,info: seed not specified; using random seed r   z[runtime] using seed: )
max_lengthtokenize_textz[runtime] tokenize_text: g     @@z.1fz msr   z"warning: requested latent length (z%) exceeds fixed_target_latent_steps (z4) used in training. Long-tail stability may degrade.)r  r  r   prepare_referencez[runtime] prepare_reference: r   text_input_ids	text_maskrz   r  sequence_lengthr   r   r   r   r   r   r   r   r   r   use_context_kv_cacher   r   r   	sample_rfz[runtime] sample_rf: )
patch_sizerC   r   z[runtime] unpatchify_latent: r(  r   )rR   rS   rT   decode_latentz[runtime] decode_latent (z): z[runtime] total_to_decode: z.3fz sz[runtime] done synthesize)r   r   r   r   r   r   r   )r%  r+   r   r0   r   )Mformatr7   rj   rl   rm   rn   r   ro   r   r   r   r   rD   r   r   r"   r+   r   r   rw   r   r   r   r   r:   r   r   r   r   r   r   r   r   r   extendsecretsrandbitsr(   r>   r   r   inference_moder   batch_encoder@   r   r   r  r  r   r  r   r  r   r   r   r   lenr!  r   r   r   r   r   r   rC   r5  r   r[   r   rf   r  r   r   r   minr   )-r   r  r#  r&  r   r   r   raw_textnormalized_texttext_max_lenr   r   r   r   r   r   cfg_moder   r   scale_messagesr%  r   r   post_load_t0r?   text_idsr0  	stage_sectarget_sampleslatent_stepspatched_stepsfixed_stepsmsg_count_before_refrz   r  	z_patchedztrimmed_audiosaudio_batchrb   audio_imax_samplesflattening_pointflattening_samplesr   r   r"  r   
synthesize&  s  











 



	







 

 u
zInferenceRuntime.synthesizec                 C  sp   | ` | `| `t  | j| jfD ]%}|jdkrtj	
  q|jdkr5ttdd }|d ur5t|dr5|
  qd S )Nr    r   empty_cache)r   r   r   gccollectrj   rm   r!   r   r    rT  r   r   )r   r   r   r   r   r   unload  s   

zInferenceRuntime.unload)r7   rh   r   r   r   r   r   r   r   r   r   r	   r   rD   r   r0   )r7   rh   r   r   )r  rv   r  rD   r   r'   r   r  )r  rv   r#  r$  r   r   r   r0   )	rr   rs   rt   r   classmethodr  r!  rS  rW  r   r   r   r   r   v  s    

BZ nr   zRuntimeKey | None_RUNTIME_CACHE_KEYzInferenceRuntime | None_RUNTIME_CACHE_VALUEr7   tuple[InferenceRuntime, bool]c                 C  sx   t & td urt| krtdfW  d    S t}t| }| a|aW d    n1 s+w   Y  |d ur8|  |dfS )NFT)_RUNTIME_CACHE_LOCKr[  rZ  r   r  rW  )r7   old_runtimeruntimer   r   r   get_cached_runtime&  s   
	r`  c                  C  sH   t  t} d ad aW d    n1 sw   Y  | d ur"|   d S d S r3   )r]  r[  rZ  rW  )r_  r   r   r   clear_cached_runtime7  s   ra  
str | Pathtuple[torch.Tensor, int]c                 C  sp   zt t| W S  ty7   dd l}|jt| dd\}}t|}|jdkr.|	d}n|j
}||f Y S w )Nr   r   r   r   )
torchaudior   r+   r   	soundfilereadr   
from_numpyrG   r  T)r   sfdatar  r  r   r   r   r  B  s   

r  r   r   c              	   C  sj   t | }|jjddd ztt||| W |S  ty4   dd l}|t||	d
 | Y |S w )NT)parentsexist_okr   )r   parentmkdirrd  saver+   r   re  writesqueezenumpy)r   r   r   out_pathri  r   r   r   save_wavQ  s   rt  )r   r   )r   r   r   r   )r   r'   )r   r+   )r   r   r   r'   )r   r   r   r0   )r)   r   r   r0   )r   r   r9   r   r   r:   )r   r   r?   r:   r9   r   r   r:   )rA   rB   rC   rD   r   rB   )rM   rN   rO   rP   )rA   rB   rQ   r:   rR   rD   rS   r:   rT   r:   r   rD   )r   r   r   r   r   r   r   r   )r   r+   r   r   r   r   )
r   r+   r   r:   r   r:   r   r|   r   r   )r   r   r   r   )r   r   r   r   )
r   rx   r   r+   r   r   r   r   r   r   )r   r   r   r   )r   r   r   r   r   r   )r7   rh   r   r\  rX  )r   rb  r   rc  )r   rb  r   rB   r   rD   r   r   )G
__future__r   rU  r   r  r8  r   r<   collections.abcr   dataclassesr   pathlibr   r   rd  safetensorsr   safetensors.torchr   r   r   r	   r
   r   configr   r   r   rfr   text_normalizationr   r   r   r   r&   r*   r,   r/   r2   r8   r>   r@   rL   rf   rh   rv   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r]  rZ  ru   r[  r`  ra  r  rt  r   r   r   r   <module>   s    







	



$!









   .


