o
    i/2                     @   s^   d dl Z d dlZd dlZd dlZd dlmZmZ d dlm	Z	 ddl
mZmZ G dd dZdS )    N)	GeneratorOptional)snapshot_download   )VoxCPMModel
LoRAConfigc                    @   sd  e Zd Z					d:dededededee d	ee fd
dZe								d;dededededededee d	ee fddZ	de
jfddZdee
jddf fddZ												d<d ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,edee
jddf fd-d.Zd	edefd/d0Zd1d2 Zd3efd4d5Zdefd6d7Zedefd8d9ZdS )=VoxCPM-iic/speech_zipenhancer_ans_multiloss_16k_baseTNvoxcpm_model_pathzipenhancer_model_pathenable_denoiseroptimizelora_configlora_weights_pathc           
      C   s   t d| d| d|  |dur#|du r#tdddd}t d|  tj|||d	| _|durMt d
|  | j|\}}t dt| dt|  d| _|rb|durbddlm	}	 |	|| _
nd| _
t d | jjddd dS )a  Initialize VoxCPM TTS pipeline.

        Args:
            voxcpm_model_path: Local filesystem path to the VoxCPM model assets
                (weights, configs, etc.). Typically the directory returned by
                a prior download step.
            zipenhancer_model_path: ModelScope acoustic noise suppression model
                id or local path. If None, denoiser will not be initialized.
            enable_denoiser: Whether to initialize the denoiser pipeline.
            optimize: Whether to optimize the model with torch.compile. True by default, but can be disabled for debugging.
            lora_config: LoRA configuration for fine-tuning. If lora_weights_path is 
                provided without lora_config, a default config will be created.
            lora_weights_path: Path to pre-trained LoRA weights (.pth file or directory
                containing lora_weights.ckpt). If provided, LoRA weights will be loaded.
        zvoxcpm_model_path: z, zipenhancer_model_path: z, enable_denoiser: NTF)	enable_lm
enable_ditenable_projz:Auto-created default LoRAConfig for loading weights from: )r   r   zLoading LoRA weights from: zLoaded z LoRA parameters, skipped r   )ZipEnhancerzWarm up VoxCPMModel...z'Hello, this is the first test sentence.
   )target_textmax_len)printr   r   
from_local	tts_modelload_lora_weightslentext_normalizerzipenhancerr   denoisergenerate)
selfr
   r   r   r   r   r   loaded_keysskipped_keysr    r#   ?/home/ubuntu/.local/lib/python3.10/site-packages/voxcpm/core.py__init__
   s.   
zVoxCPM.__init__openbmb/VoxCPM1.5Fhf_model_idload_denoiserzipenhancer_model_id	cache_dirlocal_files_onlyc	              	   K   sT   |}
|
st dtj|
r|
}nt|
||d}| d||r|nd||||d|	S )a  Instantiate ``VoxCPM`` from a Hugging Face Hub snapshot.

        Args:
            hf_model_id: Explicit Hugging Face repository id (e.g. "org/repo") or local path.
            load_denoiser: Whether to initialize the denoiser pipeline.
            optimize: Whether to optimize the model with torch.compile. True by default, but can be disabled for debugging.
            zipenhancer_model_id: Denoiser model id or path for ModelScope
                acoustic noise suppression.
            cache_dir: Custom cache directory for the snapshot.
            local_files_only: If True, only use local files and do not attempt
                to download.
            lora_config: LoRA configuration for fine-tuning. If lora_weights_path is 
                provided without lora_config, a default config will be created with
                enable_lm=True and enable_dit=True.
            lora_weights_path: Path to pre-trained LoRA weights (.pth file or directory
                containing lora_weights.ckpt). If provided, LoRA weights will be loaded
                after model initialization.
        Kwargs:
            Additional keyword arguments passed to the ``VoxCPM`` constructor.

        Returns:
            VoxCPM: Initialized instance whose ``voxcpm_model_path`` points to
            the downloaded snapshot directory.

        Raises:
            ValueError: If neither a valid ``hf_model_id`` nor a resolvable
                ``hf_model_id`` is provided.
        zYou must provide hf_model_id)repo_idr*   r+   N)r
   r   r   r   r   r   r#   )
ValueErrorospathisdirr   )clsr'   r(   r)   r*   r+   r   r   r   kwargsr,   
local_pathr#   r#   r$   from_pretrained@   s(   (
zVoxCPM.from_pretrainedreturnc                 O   s   t | j|ddi|S )N	streamingF)next	_generater    argsr2   r#   r#   r$   r      s   zVoxCPM.generatec                 O   s   | j |ddi|S )Nr6   T)r8   r9   r#   r#   r$   generate_streaming   s   zVoxCPM.generate_streaming       @r                  @textprompt_wav_pathprompt_text	cfg_valueinference_timestepsmin_lenr   	normalizedenoiseretry_badcaseretry_badcase_max_timesretry_badcase_ratio_thresholdr6   c                 c   s   |  r
t|tstd|durtj|std| |du |du kr+td|dd}t	
dd|}d}z|durw|durw|	rn| jdurntjdd	d
}|j}W d   n1 s_w   Y  | jj||d |}| jj||d}nd}|r| jdu rddlm} | | _| j|}| jj|||||||
|||d
}|D ]\}}}|d  V  qW |rtj|rzt| W dS  ty   Y dS w dS dS |rtj|rzt| W w  ty   Y w w w w )a  Synthesize speech for the given text and return a single waveform.

        This method optionally builds and reuses a prompt cache. If an external
        prompt (``prompt_wav_path`` + ``prompt_text``) is provided, it will be
        used for all sub-sentences. Otherwise, the prompt cache is built from
        the first generated result and reused for the remaining text chunks.

        Args:
            text: Input text. Can include newlines; each non-empty line is
                treated as a sub-sentence.
            prompt_wav_path: Path to a reference audio file for prompting.
            prompt_text: Text content corresponding to the prompt audio.
            cfg_value: Guidance scale for the generation model.
            inference_timesteps: Number of inference steps.
            max_len: Maximum token length during generation.
            normalize: Whether to run text normalization before generation.
            denoise: Whether to denoise the prompt audio if a denoiser is
                available.
            retry_badcase: Whether to retry badcase.
            retry_badcase_max_times: Maximum number of times to retry badcase.
            retry_badcase_ratio_threshold: Threshold for audio-to-text ratio.
            streaming: Whether to return a generator of audio chunks.
        Returns:
            Generator of numpy.ndarray: 1D waveform array (float32) on CPU. 
            Yields audio chunks for each generations step if ``streaming=True``,
            otherwise yields a single array containing the final audio.
        z&target text must be a non-empty stringNz prompt_wav_path does not exist: zEprompt_wav_path and prompt_text must both be provided or both be None
 z\s+Fz.wav)deletesuffix)output_path)rB   rC   r   )TextNormalizer)
r   prompt_cacherF   r   rE   rD   rI   rJ   rK   r6   r   )strip
isinstancestrr-   r.   r/   existsFileNotFoundErrorreplaceresubr   tempfileNamedTemporaryFilenameenhancer   build_prompt_cacher   utils.text_normalizerQ   rG   _generate_with_prompt_cachesqueezecpunumpyunlinkOSError)r    rA   rB   rC   rD   rE   rF   r   rG   rH   rI   rJ   rK   r6   temp_prompt_wav_pathtmp_filefixed_prompt_cacherQ   generate_resultwav_r#   r#   r$   r8      sr   *
zVoxCPM._generatec                 C   s    | j jdu r
td| j |S )a  Load LoRA weights from a checkpoint file.
        
        Args:
            lora_weights_path: Path to LoRA weights (.pth file or directory
                containing lora_weights.ckpt).
        
        Returns:
            tuple: (loaded_keys, skipped_keys) - lists of loaded and skipped parameter names.
        
        Raises:
            RuntimeError: If model was not initialized with LoRA config.
        NzCannot load LoRA weights: model was not initialized with LoRA config. Please reinitialize with lora_config or lora_weights_path parameter.)r   r   RuntimeErrorr   )r    r   r#   r#   r$   	load_lora   s
   zVoxCPM.load_lorac                 C   s   | j   dS )zXUnload LoRA by resetting all LoRA weights to initial state (effectively disabling LoRA).N)r   reset_lora_weightsr    r#   r#   r$   unload_lora  s   zVoxCPM.unload_loraenabledc                 C   s   | j | dS )zEnable or disable LoRA layers without unloading weights.
        
        Args:
            enabled: If True, LoRA layers are active; if False, only base model is used.
        N)r   set_lora_enabled)r    rr   r#   r#   r$   rs     s   zVoxCPM.set_lora_enabledc                 C   s
   | j  S )zGet current LoRA parameters state dict.
        
        Returns:
            dict: State dict containing all LoRA parameters (lora_A, lora_B).
        )r   get_lora_state_dictrp   r#   r#   r$   rt     s   
zVoxCPM.get_lora_state_dictc                 C   s   | j jduS )z&Check if LoRA is currently configured.N)r   r   rp   r#   r#   r$   lora_enabled  s   zVoxCPM.lora_enabled)r	   TTNN)r&   Tr	   NFTNN)NNr<   r   r=   r>   FFTr?   r@   F)__name__
__module____qualname__rU   boolr   r   r%   classmethodr4   npndarrayr   r   r;   floatintr8   tuplern   rq   rs   dictrt   propertyru   r#   r#   r#   r$   r   	   s    
6@	

fr   )r.   rY   r[   rd   r{   typingr   r   huggingface_hubr   model.voxcpmr   r   r   r#   r#   r#   r$   <module>   s    