o
    oiL+                     @   s   d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
 ddlZddlZddlmZ eG d	d
 d
ZG dd dZG dd dZdS )z0Core components for Kani-TTS-2 audio generation.    N)AudioCodecModel)AutoTokenizer)	dataclass)OptionalTuple   )KaniTTS2ForCausalLMc                   @   s   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< dZeed< dZdZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dS )	TTSConfigzConfiguration for TTS model.auto
device_mapi  tokeniser_lengthr   start_of_text   end_of_texti  max_new_tokensz,nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fpsnanocodec_modeli"V  Ntext_vocab_sizetokens_per_frame
audio_stepuse_learnable_rope	alpha_min	alpha_maxspeaker_emb_dim)__name__
__module____qualname____doc__r   str__annotations__r   intr   r   r   r   sample_rater   r   r   r   floatr   boolr   r   r    r#   r#   A/home/ubuntu/.local/lib/python3.10/site-packages/kani_tts/core.pyr	      s    
 r	   c                   @   s   e Zd ZdZddedee ddfddZdej	ddfd	d
Z
dej	deej	ej	f fddZdej	defddZdej	deejee f fddZdS )NemoAudioPlayerz1Handles audio codec operations using NVIDIA NeMo.Nconfigtext_tokenizer_namereturnc                 C   s   || _ t| j j | _tj rdnd| _	| j
| j	 || _| jr+t| j| _| j j| _| j j| _| j j| _| jd | _| jd | _| jd | _| jd | _| jd | _| jd | _| jd	 | _| jd
 | _d| _d S )Ncudacpur   r                  
   i  )confr   from_pretrainedr   evalnemo_codec_modeltorchr)   is_availabledevicetor'   r   	tokenizerr   r   r   start_of_speechend_of_speechstart_of_humanend_of_humanstart_of_ai	end_of_ai	pad_tokenaudio_tokens_startcodebook_size)selfr&   r'   r#   r#   r$   __init__*   s*   



zNemoAudioPlayer.__init__out_idsc                 C   s(   | j |v }| j|v }|r|stddS )z6Validate that output contains required speech markers.z Special speech tokens not exist!N)r:   r;   
ValueError)rC   rE   start_of_speech_flagend_of_speech_flagr#   r#   r$   output_validationA   s
   

z!NemoAudioPlayer.output_validationc                    s   | j kjddd  }| jkjddd  }||kr"td||d | }t|d r4td|dd}|t fd	d
t	dD  }| j
 }|dk   dkr]td|jd}t|jd g}||fS )z2Extract and process audio codes from model output.Tas_tupler   zInvalid audio codes sequence!r   r,   z3The length of the sequence must be a multiple of 4!c                    s   g | ]} j | qS r#   )rB   ).0irC   r#   r$   
<listcomp>S   s    z2NemoAudioPlayer.get_nano_codes.<locals>.<listcomp>zInvalid audio tokens!)r:   nonzeroitemr;   rF   lenreshaper5   tensorrangerA   sumT	unsqueezeshape)rC   rE   start_a_idx	end_a_idxaudio_codeslen_r#   rO   r$   get_nano_codesH   s    
zNemoAudioPlayer.get_nano_codesc                 C   sX   || j kjddd  }|| jkjddd  }|||d  }| jj|dd}|S )z!Extract text from token sequence.TrJ   r   r   )skip_special_tokens)r   rQ   rR   r   r9   decode)rC   rE   start_t_idx	end_t_idx
txt_tokenstextr#   r#   r$   get_text\   s
   zNemoAudioPlayer.get_textc                 C   s   |  }| | | |\}}|| j|| j}}t  | jj||d\}}|	 
   }W d   n1 s@w   Y  | jrQ| |}||fS |dfS )z.Convert model output tokens to audio waveform.)tokens
tokens_lenN)flattenrI   r_   r8   r7   r5   inference_moder4   ra   r*   detachnumpysqueezer'   rf   )rC   rE   r]   r^   reconstructed_audio_output_audiore   r#   r#   r$   get_waveformd   s   


zNemoAudioPlayer.get_waveformN)r   r   r   r   r	   r   r   rD   r5   TensorrI   r   r_   rf   npndarrayrq   r#   r#   r#   r$   r%   '   s     &r%   c                   @   s   e Zd ZdZdedededdfddZdd	ed
edee	j
e	j
f fddZ				dde	j
de	j
dee	j
 dededede	j
fddZ					dded
edee	j
 dedededeejef fddZdS )	KaniModelz1Text-to-speech model using causal language model.r&   
model_nameplayerr(   Nc                 C   s   || _ || _tj rdnd| _tj|| jj| j j	| j j
| j j| j j| j j| j jtj| j jd
| _t|| _t| jjdd | _d| _g | _| jd ur\| jd| _| jdg | _d S d S )Nr)   r*   )	rA   r   r   r   r   r   r   torch_dtyper   language_settingsno_language_tagsstatuslanguage_tags_list)r1   rx   r5   r)   r6   r7   r   r2   rA   r   r   r   r   r   r   bfloat16r   modelr   r9   getattrr&   rz   r|   r}   get)rC   r&   rw   rx   r#   r#   r$   rD   x   s.   
zKaniModel.__init__text_promptlanguage_tagc                 C   s   |dur|   d| }| jj}| jj}| jj}| j|ddj}tj|ggtj	d}tj||ggtj	d}tj
|||gdd}	tjd|	jd tj	d}
|	|
fS )z*Prepare input tokens with special markers.Nz: pt)return_tensors)dtyper   )dim)striprx   r<   r   r=   r9   	input_idsr5   rU   int64catonesrZ   )rC   r   r   START_OF_HUMANEND_OF_TEXTEND_OF_HUMANr   start_token
end_tokensmodified_input_idsattention_maskr#   r#   r$   get_input_ids   s   zKaniModel.get_input_ids      ?ffffff?皙?r   r   speaker_embtemperaturetop_prepetition_penaltyc           	   
   C   s   | | j}| | j}||| jjd|||d| jjd	}|dur*| | j}||d< t  | jj	di |}W d   n1 sBw   Y  | dS )a  
        Generate audio tokens from text tokens.

        Args:
            input_ids: Text token IDs
            attention_mask: Attention mask
            speaker_emb: Optional speaker embedding [batch_size, speaker_emb_dim]
            temperature: Sampling temperature (default: 1.0)
            top_p: Top-p sampling parameter (default: 0.95)
            repetition_penalty: Repetition penalty (default: 1.1)
        Tr   )	r   r   r   	do_sampler   r   r   num_return_sequenceseos_token_idNr   r*   r#   )
r8   r7   r1   r   rx   r;   r5   no_gradr   generate)	rC   r   r   r   r   r   r   
gen_kwargsgenerated_idsr#   r#   r$   model_request   s&   

zKaniModel.model_requestre   c                 C   s   | j dkr"|du r"td td td t| jddi td n| j dkr7|dur7td td	 td | ||\}}| j||||||d
}	| j|	\}
}|
|fS )a  
        Generate audio from text.

        Args:
            text: Input text to convert to speech
            language_tag: Optional language ID (for models that support language or accent tags)
            speaker_emb: Optional speaker embedding tensor [1, speaker_emb_dim]
            temperature: Sampling temperature (default: 1.0)
            top_p: Top-p sampling parameter (default: 0.95)
            repetition_penalty: Repetition penalty (default: 1.1)
        available_language_tagsNz(========================================z+!!! YOU NEED TO SELECT THE LANGUAGE TAG !!!zLanguages available:sep
r{   z:!!! This model does not support language tag selection !!!)r   r   r   r   )r|   printr}   r   r   rx   rq   )rC   re   r   r   r   r   r   r   r   model_outputaudioro   r#   r#   r$   	run_model   s&   
zKaniModel.run_modelrr   )Nr   r   r   )NNr   r   r   )r   r   r   r   r	   r   r%   rD   r   r5   rs   r   r   r!   r   rt   ru   r   r#   r#   r#   r$   rv   u   sN    $

,rv   )r   r5   nemo.collections.tts.modelsr   transformersr   dataclassesr   typingr   r   rl   rt   osr   r   r	   r%   rv   r#   r#   r#   r$   <module>   s    N