o
    oiK6                     @   sr   d Z ddlmZmZmZ ddlmZ ddlZddl	Z	ddl
Z
ddlZddlmZmZmZ dd ZG d	d
 d
ZdS )zSimple API for Kani-TTS.    )TupleOptionalUnion)PathN   )	TTSConfigNemoAudioPlayer	KaniModelc                  C   s   t d zddl} | j  | j  W n	 ty   Y nw tdtj	 tdtj	 tdtj	 tdtj	 tdtj	 td	tj	 td
tj	 t tj	 dS )z
    Suppress all logging output from transformers, NeMo, PyTorch, and other libraries.
    Only print() statements from user code will be visible.
    ignorer   Nnemonemo_loggertorchpytorchnumba
matplotlibPIL)
warningsfilterwarningstransformersloggingset_verbosity_errordisable_progress_barImportError	getLoggersetLevelERROR)r    r   @/home/ubuntu/.local/lib/python3.10/site-packages/kani_tts/api.pysuppress_all_logs   s    

r   c                   @   sn  e Zd ZdZ												d1deded	ed
edededee dee dee dee dee dee dee fddZ	dd Z
					d2dedee deeejeef  deded ed!eejef fd"d#Z					d2dedee deeejeef  deded ed!eejef fd$d%Zd&eeef d!ejfd'd(Zd)ejd*efd+d,Zd-d. Zd3d/d0ZdS )4KaniTTSz
    Simple interface for Kani text-to-speech model.

    Example:
        >>> model = KaniTTS('your-model-name')
        >>> audio, text = model("Hello, world!")
    auto    TN
model_name
device_mapmax_new_tokenstokeniser_lengthsuppress_logs	show_infotext_vocab_sizetokens_per_frame
audio_stepuse_learnable_rope	alpha_min	alpha_maxspeaker_emb_dimc                 C   s   |rt   t||||||	|
|||d
| _|| _t| j| _t| j|| j| _| jj| _| jj	| _	| jj
| _
|   |rA|   dS dS )a  
        Initialize Kani-TTS model.

        Args:
            model_name: Hugging Face model ID or path to local model
            device_map: Device mapping for model (default: "auto")
            max_new_tokens: Maximum number of tokens to generate (default: 3000)
            tokeniser_length: Length of text tokenizer vocabulary (default: 64400)
            suppress_logs: Whether to suppress library logs (default: True)
            show_info: Whether to display model info on initialization (default: True)
            text_vocab_size: Text vocabulary size for position encoding.
                           If None, reads from model config. (default: None)
            tokens_per_frame: Number of audio tokens per frame.
                            If None, reads from model config. (default: None)
            audio_step: Position step size per audio frame.
                       If None, reads from model config. (default: None)
            use_learnable_rope: Enable learnable RoPE with per-layer alpha.
                              If None, reads from model config. (default: None)
            alpha_min: Minimum alpha value for learnable RoPE.
                      If None, reads from model config. (default: None)
            alpha_max: Maximum alpha value for learnable RoPE.
                      If None, reads from model config. (default: None)
            speaker_emb_dim: Dimension of speaker embeddings.
                           If None, reads from model config. (default: None)
        )
r$   r&   r%   r)   r*   r+   r,   r-   r.   r/   N)r   r   configr#   r   playerr	   modelstatuslanguage_tags_listsample_rate_sync_config_from_modelshow_model_info)selfr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r   r   r   __init__5   s0   )


zKaniTTS.__init__c                 C   s   | j j }| jjdu rt|dd| j_| jjdu r t|dd| j_| jjdu r.t|dd| j_| jjdu r<t|dd| j_| jjdu rJt|dd| j_| jjdu rXt|dd| j_| jj	du rht|dd| j_	dS dS )	z
        Synchronize config with actual values from loaded model.
        This ensures display shows the correct values that were loaded.
        Nr)   r*   r+   r,   r-   r.   r/   )
r2   r0   r)   getattrr*   r+   r,   r-   r.   r/   )r8   loaded_modelr   r   r   r6   {   s    zKaniTTS._sync_config_from_model      ?ffffff?皙?textlanguage_tagspeaker_embtemperaturetop_prepetition_penaltyreturnc                 C   s   |  ||||||S )  
        Generate audio from text.

        Args:
            text: Input text to convert to speech
            language_tag: Optional language tag if model supports different languages or accents
            speaker_emb: Optional speaker embedding. Can be:
                - torch.Tensor: [1, speaker_emb_dim] or [speaker_emb_dim]
                - str/Path: Path to .pt file containing speaker embedding
            temperature: Sampling temperature (default: 1.0)
            top_p: Top-p sampling parameter (default: 0.95)
            repetition_penalty: Repetition penalty (default: 1.1)

        Returns:
            Tuple of (audio_waveform, text) where audio_waveform is a numpy array
            containing the audio samples and text is the input text.
        )generater8   r?   r@   rA   rB   rC   rD   r   r   r   __call__   s   zKaniTTS.__call__c                 C   sP   |durt |tjs| |}|dur|jdkr|d}| j||||||S )rF   Nr   r   )
isinstancer   Tensorload_speaker_embeddingndim	unsqueezer2   	run_modelrH   r   r   r   rG      s
   

zKaniTTS.generatepathc                 C   s   t |}| std| |jdkrtd|j t|}|jdkr@| jj	}|j
d |kr>td| d|j
d  |S |jdkr]|j
d | jj	kr[td	| jj	 d
|j
 |S td|j
 )z
        Load speaker embedding from a .pt file.

        Args:
            path: Path to .pt file containing speaker embedding

        Returns:
            Speaker embedding tensor [speaker_emb_dim] or [1, speaker_emb_dim]
        z"Speaker embedding file not found: z.ptz+Speaker embedding must be a .pt file, got: r   r   z0Speaker embedding has wrong dimension: expected z, got    z6Speaker embedding has wrong dimension: expected [..., z], got z/Speaker embedding must be 1D or 2D, got shape: )r   existsFileNotFoundErrorsuffix
ValueErrorr   loadrM   r0   r/   shape)r8   rP   rA   expected_dimr   r   r   rL      s0   




zKaniTTS.load_speaker_embeddingaudiooutput_pathc                 C   s6   zddl }|||| j W dS  ty   tdw )z
        Save audio waveform to file.

        Args:
            audio: Audio waveform as numpy array
            output_path: Path to save audio file (e.g., "output.wav")
        r   NzKsoundfile is required to save audio. Install it with: pip install soundfile)	soundfilewriter5   r   )r8   rY   rZ   sfr   r   r   
save_audio   s   zKaniTTS.save_audioc                 C   s  t   t d t d t d t d t d t   t d t d t d t   t d | j}t|d	kr>d
|dd  }t d|  tj rLdnd}t d|  | jdkrt dt| j d | jr}t| jdkr}d| j}t d|  n| jrt d| jd  d| jd  d nt d t   t d t d| j	 d t d| j
j  t d| j
jpd   t d!| j
jpd   t d"| j
jpd   t d#| j
jpd   | j
jrt d$ t d%| j
jpd  d| j
jpd  d& nt d' t d t   t d( t   dS ))z=
        Display beautiful model information banner.
        u   ╔════════════════════════════════════════════════════════════╗uB   ║                                                            ║uD   ║                   N I N E N I N E S I X  😼                ║u   ╚════════════════════════════════════════════════════════════╝z              /\_/\  z             ( o.o )z              > ^ <u   ──────────────────────────────────────────────────────────────2   z...iNz	  Model: z
GPU (CUDA)CPUz
  Device: available_language_tagsz!  Mode: Available language tags (z language tags)   z, z  Tags: r   r   z,, ... (use .show_language_tags() to see all)z  Mode: No language tagsz  Configuration:u       • Sample Rate: z Hzu       • Max Tokens: u       • Speaker Embedding Dim: Unknownu       • Text Vocab Size: u       • Tokens per Frame: u       • Audio Step: u=       • Learnable RoPE: Enabled (per-layer frequency scaling)u       • Alpha Range: []u0       • Learnable RoPE: Disabled (standard RoPE)u      Ready to generate speech! 🎵)printr#   lenr   cudais_availabler3   r4   joinr5   r0   r%   r/   r)   r*   r+   r,   r-   r.   )r8   model_displaydevicelang_strr   r   r   r7     sV   
"(
zKaniTTS.show_model_infoc                 C   sr   t d | jdkr/t d t d | jr*t| jdD ]\}}t d| d|  qn	t d nt d	 t d d S )
Nz2==================================================ra   zAvailable language tags:z2--------------------------------------------------r   z  z. z  No tags configuredz3This model does not support language tag selection.)re   r3   r4   	enumerate)r8   itagr   r   r   show_language_tagsH  s   

zKaniTTS.show_language_tags)r    r!   r"   TTNNNNNNN)NNr<   r=   r>   )rE   N)__name__
__module____qualname____doc__strintboolr   floatr9   r6   r   r   rK   r   r   npndarrayrI   rG   rL   r^   r7   rp   r   r   r   r   r   ,   s    	

F 

$&9r   )rt   typingr   r   r   pathlibr   numpyry   r   r   r   corer   r   r	   r   r   r   r   r   r   <module>   s    !