o
    ib                     @   s  d Z ddlZddlmZmZmZmZmZ ddlZddl	m
Z
 ddlm
  mZ ddlZddlZddlmZ ddlmZ z
ddlmZ dZW n eyP   dZY nw dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z- G dd deZ.G dd deZ/G dd deZ0G dd deZ1e02  G dd de
j3Z4dS )a  
VoxCPM: A Tokenizer-free speech generation model

This module contains the main VoxCPM model implementation, including configuration classes
and the core VoxCPMModel for text-to-speech generation.

Copyright 2025 OpenBMB
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)TupleUnion	GeneratorListOptional)	rearrange)	BaseModel)	load_fileTF)tqdm)LlamaTokenizerFast   )AudioVAEAudioVAEConfig)ScalarQuantizationLayer)"apply_lora_to_named_linear_modules)	CfmConfig
UnifiedCFMVoxCPMLocDiT)VoxCPMLocEnc)MiniCPM4ConfigMiniCPMModel   )	get_dtypemask_multichar_chinese_tokensc                   @   sJ   e Zd ZU dZeed< dZeed< dZeed< dZeed< d	Z	eed
< d	S )VoxCPMEncoderConfig   
hidden_dim   ffn_dim   	num_heads   
num_layersNkv_channels)
__name__
__module____qualname__r   int__annotations__r   r    r"   r#    r)   r)   G/home/ubuntu/.local/lib/python3.10/site-packages/voxcpm/model/voxcpm.pyr   1   s   
 r   c                   @   sR   e Zd ZU dZeed< dZeed< dZeed< dZeed< d	Z	eed
< e
ed< d	S )VoxCPMDitConfigr   r   r   r   r   r    r!   r"   Nr#   
cfm_config)r$   r%   r&   r   r'   r(   r   r    r"   r#   r   r)   r)   r)   r*   r+   9   s   
 r+   c                   @   s   e Zd ZU eed< dZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< eed< eed< dZee ed< dZeed< dZeed< dZeed< dZeed< dS )VoxCPMConfig	lm_configr   
patch_size@   feat_dim   residual_lm_num_layers   scalar_quantization_latent_dim	   scalar_quantization_scaleencoder_config
dit_configNaudio_vae_configr   
max_lengthcudadevicebfloat16dtypeFdit_mean_mode)r$   r%   r&   r   r(   r/   r'   r1   r3   r5   r7   r   r+   r:   r   r   r;   r=   strr?   r@   boolr)   r)   r)   r*   r-   C   s   
 r-   c                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZe	ed< dZ
e	ed< d	Zeed
< g dZee ed< g dZee ed< g dZee ed< dS )
LoRAConfigF	enable_lm
enable_ditenable_proj   rr   alpha        dropout)q_projv_projk_projo_projtarget_modules_lmtarget_modules_dit)enc_to_lm_projlm_to_dit_projres_to_dit_projtarget_proj_modulesN)r$   r%   r&   rD   rB   r(   rE   rF   rH   r'   rI   rK   floatrP   listrA   rQ   rU   r)   r)   r)   r*   rC   U   s   
 rC   c                       s2  e Zd Z	dadedededef fddZdd	 Zdbde	fddZ
dd
ddejdejdejdejdejdejdejdede	fddZdd ZdejfddZdeejddf fd d!Ze 	"	"	#	$	%	&	
	'	(	
dcd)ed*ed+ed,ed-ed.ed/ed0e	d1ed2ed3e	deejddf fd4d5Ze d*ed+efd6d7Zd8ed9ed:ejfd;d<Zdeejejejf fd=d>Zdeeejejeej f ddf fd?d@Ze 	#	$	%	&	
	'	(	
ddd)edAed,ed-ed.ed/ed0e	d1ed2ed3e	deeejejeejeej f f ddf fdBdCZdeejejf fdDdEZ deeejeej f ddf fdFdGZ!e 	#	$	%	&	
	'dedHejdejdIejdJejd,ed-ed.ed/ed3e	dKedeeejeejeej f f ddf fdLdMZ"e#dfdOedPe	dQe	defdRdSZ$dTdU Z%dadVedWefdXdYZ&dZe	fd[d\Z'd]d^ Z(defd_d`Z)  Z*S )gVoxCPMModelNconfig	tokenizer	audio_vaelora_configc                    s  t    || _|| _|j| _|j| _|j| _tj	 s)tj
j	 r&d| _nd| _td| j d| jj  t|j| _| jd|j| jt| jj t|| _d| _d| _|jjdd	}|j|_d
|_t|| _| jd|j| jt| jj |jjdd	}|jj|_|jj |_!|jj"|_#|jj$|_|jj%|_%d
|_t&||jd| _'|jjdd	}|j(j|_|j(j |_!|j(j"|_#|j(j$|_|j(j%|_%d
|_t)|j|j(j*t+||jd|j,d| _-t.|jj|jj|j/|j0| _1t23|jj|jj| _4t23|jj|j(j| _5t23|jj|j(j| _6t23|jj|jj| _7t28 | _9t2j3|jjddd| _:t2j;dd| _<|| _=|j>| _>|j?| _?| jd ur@| @  d S d S )NmpscpuzRunning on device: z	, dtype: r   e   f   T)deepr   )	input_dim)in_channels)rc   
cfm_params	estimator	mean_moder   F)biasnone)	reduction)Asuper__init__rY   r\   r1   r/   r=   torchr<   is_availablebackendsr]   printr?   r   r.   base_lmsetup_cacher;   r   r   text_tokenizeraudio_start_tokenaudio_end_token
model_copyr3   num_hidden_layers
vocab_sizeresidual_lmr8   r   hidden_sizer   intermediate_sizer    num_attention_headsr"   r#   r   feat_encoderr9   r   r,   r   r@   feat_decoderr   r5   r7   	fsq_layernnLinearrR   rS   rT   	stop_projSiLU	stop_actn	stop_headCrossEntropyLoss	stop_lossr[   
chunk_sizesample_rate_apply_lora)selfrY   rZ   r[   r\   residual_lm_configr8   decoder_config	__class__r)   r*   rk   i   sz   














zVoxCPMModel.__init__c                 C   s   | j }t|j|j|jd}|jr$| j| jfD ]}t|fd|j	i| q|j
r4t| jjfd|ji| |jr\ddlm} |jD ]}t| |d}t|tjr[t| ||dd|i| q@dS dS )u$   注入 LoRA 到 LM / DiT / 投影层)rH   rI   rK   target_submodule_namesr   
LoRALinearNbaser)   )r\   dictrH   rI   rK   rD   rp   rx   r   rP   rE   r}   re   rQ   rF   modules.layers.lorar   rU   getattr
isinstancer   r   setattr)r   cfglora_kwargslmr   	attr_namemoduler)   r)   r*   r      s8   
zVoxCPMModel._apply_loraFdisablec              
   C   s   |r| S zH| j dkrtdzdd l}W n   tdtj| jjddd| j_tj| jjddd| j_tj| jddd| _tj| j	j
ddd| j	_
W | S  tyf } ztd|  W Y d }~| S d }~ww )	Nr<   z0VoxCPMModel can only be optimized on CUDA devicer   ztriton is not installedzreduce-overheadT)mode	fullgraphz"Warning: torch.compile disabled - )r=   
ValueErrortritonrl   compilerp   forward_steprx   r|   r}   re   	Exceptionro   )r   r   r   er)   r)   r*   optimize   s&   
zVoxCPMModel.optimizerJ   )progresssample_generatetext_tokens	text_maskaudio_feats
audio_mask	loss_maskposition_idslabelsr   r   c          &      C   s  ~|j | jtjd}|j | j|  d}|j | j|  d}|j | j|  d}|j | j|  d}|j | jtjd}|j\}
}}}| |}| |}t| j	j
dd}t| j	j
dds^d}| j|| }|d| |d|  }| j|dd\}}| |  }| ||d ||d  }tjt|d d d	d
d d f |d d d dd d f fd
d}||d|  }| j|dd\}}| |  }tjt|d d d	d
d d f |d d d dd d f fd
d}| || | }t|d}|  }t| |d}tjt|d d d	d
df |d d d ddf fd
d}t| |d}|dd
d
| j}t|d |}| jj|d
d ||d
d |d
d |d}| | | |}| |d
d|}tj|  dd} ||   |  }!d }"|	r|d
d }#| j|| j|#t!| j	j"j#dr| j	j"j#j$ndd}$t|$d
dd|
| jd}"t|d|
| jd}%||!|%|"dS )Nr?   	scale_emb      ?use_mupFTinputs_embeds	is_causalr   r   dimzb t c -> (b t) czb t p d -> (b t) p d.zb t p -> (b t) p 1r   )condtgt_maskr   )mininference_cfg_rate
   )mur/   r   n_timestepsz(b t) d p -> b d (t p)bpz(b t) p d -> b d (t p))z	loss/diffz	loss/stopfeat_gt	feat_pred)%tor=   rl   long_dtypeshaper|   rR   r   rY   r.   rp   embed_tokens	unsqueezer~   cat
zeros_likerx   rS   rT   r   repeatr/   r}   compute_loss	transpose
contiguousr   r   r   r   clampsumhasattrr9   r,   r   )&r   r   r   r   r   r   r   r   r   r   BTPD
feat_embedr   
text_embedcombined_embedenc_outputs_	lm_hiddenresidual_inputsresidual_outputsresidual_hidden
dit_hiddentarget_dtyper   	feat_condloss_seq_mask	diff_lossstop_logitsstop_lossesdenomr   r   feat_cond_for_samplefeat_pred_seqfeat_gt_tensorr)   r)   r*   forward   s   

"D8
0zVoxCPMModel.forwardc                 C   s   t | jjS N)r   rY   r?   r   r)   r)   r*   r   J  s   zVoxCPMModel._dtypereturnc                 O      t | j|ddi|S N	streamingF)next	_generater   argskwargsr)   r)   r*   generateN     zVoxCPMModel.generatec                 O      | j |ddi|S Nr   T)r   r   r)   r)   r*   generate_streamingQ     zVoxCPMModel.generate_streaming r     r          @         @target_textprompt_textprompt_wav_pathmin_lenmax_leninference_timesteps	cfg_valueretry_badcaseretry_badcase_max_timesretry_badcase_ratio_thresholdr   c                  c   s   |r|rt d d}t|dkr_|}t| |}tj|tj| jgtj	|j
dgdd}|jd }tj|| j| jjftj|j
d}t|tj	|j
}t|tj	|j
}n|| }t| |}tj|tj| jgtj	|j
dgdd}|jd }t|\}}|ddkr|jddd	}|| jkrtj||| j}| j| j }|d| dkr||d|  }tjj||df}| j|| j
| j }| | jjd| j!dd
d}|d}tj|tj	|j
d}t||g}tj|| j| jjftj|j
d}tj||gdd}tt|t|gtj	|j
}tt|t|gtj	|j
}|"d| j
}|"d| j
}|"d| j
t#| j$j%}|"d| j
}t| |}d}||	k r| j&|||||t't(||
 d ||||d	}|r| j| j }|D ] \}}| j)|tj}|d| d f *d }|V  qn(t+|\}}|r|jd ||
 krt,d|jd |  d |d7 }qrn	 |s| j)|tj*d }|V  d S d S )NSRetry on bad cases is not supported in streaming mode, setting retry_badcase=False.Fr   r?   r=   r   r   r   Tr   keepdimr   r   r  r  r	  r
  r   .%  Badcase detected, audio_text_ratio=, retrying...)-warningswarnlenrl   
LongTensorrr   r   tensorrs   int32r=   r   zerosr/   r[   
latent_dimfloat32onestyper   
torchaudioloadsizemeanr   
functionalresampler   r   padencoder^   viewpermuter   r   rY   r?   
_inferencer   r'   decodesqueezer   ro   ) r   r  r  r  r  r  r	  r
  r  r  r  r   text
text_tokentext_length
audio_featr   r   audiosr	patch_lenpadding_sizeaudio_lengthtext_pad_tokenaudio_pad_feattarget_text_lengthretry_badcase_timesinference_resultlatent_predr   decode_audiopred_audio_featr)   r)   r*   r   T  s   





(( 


zVoxCPMModel._generatec           	      C   s   |r|st dt|\}}|ddkr|jddd}|| jkr+tj||| j}| j| j	 }|d| dkrM||d|  }t
jj||df}| j|| j| j }|| jjd| jddd}||d}|S )	a  
        Build prompt cache for subsequent fast generation.
        
        Args:
            prompt_text: prompt text (required)
            prompt_wav_path: prompt audio path (required)
            
        Returns:
            prompt_cache: dict with prompt_text (raw text) and audio features.
                         Text tokenization will be done during generation for consistency.
        z,prompt_text and prompt_wav_path are requiredr   r   Tr  r   r   r  r0  )r   r   r!  r"  r#  r   r$  r%  r/   r   rl   r   r&  r[   r'  r   r=   r^   r(  r  r)  )	r   r  r  r1  r2  r3  r4  r0  prompt_cacher)   r)   r*   build_prompt_cache  s.   

zVoxCPMModel.build_prompt_cacheoriginal_cachenew_textnew_audio_featc           	      C   sJ   |du r	||dS |d }|d }|| }t j||gdd}||d}|S )as  
        Merge original prompt cache with newly generated content to stabilize voice.
        
        Args:
            original_cache: original prompt cache
            new_text: newly generated text 
            new_audio_feat: newly generated audio features
            
        Returns:
            merged_cache: merged cache with prompt_text and audio_feat
        Nr>  r  r0  r   r   )rl   r   )	r   rA  rB  rC  original_prompt_textoriginal_audio_featmerged_prompt_textmerged_audio_featmerged_cacher)   r)   r*   merge_prompt_cache  s   zVoxCPMModel.merge_prompt_cachec                 O   r   r   )r   _generate_with_prompt_cacher   r)   r)   r*   generate_with_prompt_cache3  r   z&VoxCPMModel.generate_with_prompt_cachec                 O   r   r   )rJ  r   r)   r)   r*   $generate_with_prompt_cache_streaming7  s   z0VoxCPMModel.generate_with_prompt_cache_streamingr?  c                 c   s   |r|
rt d d}|du r!tjd| j| jjftjd}|}n|d }|d }|| }t| 	|}tj
|tj| jgtj|jdgd	d
}t| 	|}|d}|jd }tj|tj|jd}tj|jd | j| jjftj|jd}t
||g}tj
||gdd
}t
t|t|gtj|j}t
t|t|gtj|j}|d| j}|d| j}|d| jt| jj}|d| j}t| 	|}d}||k rX| j|||||tt||	 d ||||
d	}|
r1| j| j }|D ]#\}}| j|tj}|d| df d  }|||fV  qn't!|\}}|rW|jd ||	 krVt"d|jd |  d |d7 }qn	 |
sr| j|tjd  }|||fV  dS dS )a$  
        Generate audio using pre-built prompt cache.
        
        Args:
            target_text: Text to convert to speech
            prompt_cache: Cache built by build_prompt_cache (can be None)
            min_len: Minimum audio length to avoid very short audio
            max_len: Maximum audio length
            inference_timesteps: Number of diffusion sampling steps
            cfg_value: Classifier-free guidance value
            retry_badcase: Whether to retry on bad cases
            retry_badcase_max_times: Maximum retry attempts
            retry_badcase_ratio_threshold: Threshold for audio-to-text ratio
            streaming: Whether to return a generator of audio chunks
            
        Returns:
            Generator of Tuple containing:
                - Decoded audio tensor for the current step if ``streaming=True``, else final decoded audio tensor
                - Tensor of new text tokens
                - New audio features up to the current step as a List if ``streaming=True``, else as a concatenated Tensor
        r  FNr   r   r0  r  r  r   r   r   r  .r   r  r  )#r  r  rl   emptyr/   r[   r  r  r  rr   r   r  rs   r  r=   r"  r   r  r  r  r   r   r   rY   r?   r  r*  r   r'   r   r+  r,  r^   r   ro   )r   r  r?  r  r  r	  r
  r  r  r  r   prompt_audio_featr-  r  r.  target_text_tokenr5  r/  r6  r7  r0  r   r   r8  r9  r:  r3  r;  r=  r<  r)   r)   r*   rJ  =  s   #


** 


z'VoxCPMModel._generate_with_prompt_cachec                 O   r   r   )r   r*  r   r)   r)   r*   	inference  r   zVoxCPMModel.inferencec                 O   r   r   )r*  r   r)   r)   r*   inference_streaming  r   zVoxCPMModel.inference_streamingr-  feat	feat_maskstreaming_prefix_lenc           $   	   c   s   |j \}}}}| |}| |}| jjjr| jjj}nd}| j|| }|	d| |	d|  }|ddddf }g }d}| j|dd\}}| jj
| | ||	d ||	d  }|dddddf }| j||	d|  dd\}}| jj
| |dddddf }tt|D ]}| |}| |}|| }| j|| j|dd ||d	dd} | | 	d}| |}|| 	d | }|	rtj||
 d dd
}!t|!d|| jd}"|"|fV  | | | |jdd
d   }#||kr|#dkr nB| j |dddddf tj!| jj
" g|j#d$ }| |}| j ||dddddf  tj!| jj
" g|j#d$ }q|	sptj|dd
}t|d|| jd}"|"|%d fV  dS dS )a  Core inference method for audio generation.
        
        This is the main inference loop that generates audio features
        using the language model and diffusion transformer.
        
        Args:
            text: Input text tokens
            text_mask: Mask for text tokens
            feat: Input audio features
            feat_mask: Mask for audio features
            min_len: Minimum generation length
            max_len: Maximum generation length
            inference_timesteps: Number of diffusion steps
            cfg_value: Classifier-free guidance value
            streaming: Whether to yield each step latent feature or just the final result
            
        Returns:
            Generator of Tuple containing:
                - Predicted latent feature at the current step if ``streaming=True``, else final latent features
                - Predicted audio feature sequence so far as a List if ``streaming=True``, else as a concatenated Tensor
        r   r   N.Tr   r   r   )r   r/   r   r   r
  r   zb t p d -> b d (t p)r   r   r=   )&r   r|   rR   rY   r.   r   r   rp   r   r   kv_cachefill_cachesr~   rx   r
   rangerS   rT   r}   r/   r   r   appendrl   r   r   r   r   r   argmaxr^   itemr   r  stepr=   cloner,  )$r   r-  r   rR  rS  r  r  r	  r
  r   rT  r   r   r   r   r   r   r   r   prefix_feat_condpred_feat_seq
curr_embedr   kv_cache_tupler   residual_enc_outputsresidual_kv_cache_tupler   idit_hidden_1dit_hidden_2r   	pred_featpred_feat_chunkr   	stop_flagr)   r)   r*   r*    s   #



"





*,
0zVoxCPMModel._inferenceTpathr   trainingc                 C   s  t ttj|d }t|}t	|dd }|r t
|dnt
 }tjtj|ddddd }	| ||||}
|sGt|
jj}|
|}
n|
 D ]\}}d	|v rWd
|_qK|d urbd|vrbd
|_qK|
jtj|
_tj|d}tj|d}tj|rtrtd|  t|}n&tj|rtd|  tj|ddd}|d|}n
td| d| |	 D ]\}}||d| < q|
j|d
d |r|
S |
|
j j| dS )Nzconfig.jsonr:   )rY   zaudiovae.pthr^   Tmap_locationweights_only
state_dictr[   Florazmodel.safetensorszpytorch_model.binz Loading model from safetensors: z&Loading model from pytorch_model.bin: z&Model file not found. Expected either  or z
audio_vae.)strict)r   ) r-   model_validate_jsonopenosrj  joinreadr   from_pretrainedr   r   rl   r!  r   rY   r?   r   named_parametersrequires_gradr[   r  existsSAFETENSORS_AVAILABLEro   r	   getFileNotFoundErroritemsload_state_dictr=   evalr   )clsrj  r   rk  r\   rY   rZ   r:   r[   vae_state_dictmodellm_dtypenameparamsafetensors_pathpytorch_model_pathmodel_state_dict
checkpointkwvalr)   r)   r*   
from_local7  s\   

zVoxCPMModel.from_localc                 c   s0    ddl m} |  D ]
}t||r|V  qdS )zIterate over all LoRA modules.r   r   N)r   r   modulesr   )r   r   r   r)   r)   r*   _iter_lora_modulesq  s   
zVoxCPMModel._iter_lora_modules	lora_pathr=   c                 C   s@  ddl m} |p
| j}||}| r|d }|d }n|jdkr#|nd}|jdv r,|nd}|r?| r?tr?tt||d}n|rT| rTt	j
||d	d
}|d|}n
td| d| t|  }dd |D }	g g }
}| D ]'\}}||v r~|n|	|}|r|| j|| |
| qt|| qt|
|fS )a  
        Load LoRA weights from file, supports calling after torch.compile.
        Uses named_parameters() to handle compile's _orig_mod wrapper.
        Supports both safetensors and pytorch formats.
        
        Args:
            lora_path: Checkpoint path (directory or .safetensors/.ckpt file)
            device: Target device, defaults to model's current device
        Returns:
            tuple: (loaded_keys, skipped_keys)
        r   )Pathzlora_weights.safetensorszlora_weights.ckptz.safetensorsN)z.ckptz.pthrU  Frl  ro  z+LoRA checkpoint not found. Expected either rq  c                 S   s"   i | ]}d |v r| d d|qS )z._orig_mod..)replace).0kr)   r)   r*   
<dictcomp>  s   " z1VoxCPMModel.load_lora_weights.<locals>.<dictcomp>)pathlibr  r=   is_dirsuffixr{  r|  r	   rA   rl   r!  r}  r~  r   ry  r  datacopy_r   rY  )r   r  r=   r  safetensors_file	ckpt_filero  ckptmodel_paramskey_mappingloaded_keysskipped_keyskeyvalue
target_keyr)   r)   r*   load_lora_weightsx  s4   


zVoxCPMModel.load_lora_weightsenabledc                 C   s   |   D ]}|| qdS )zEnable/disable all LoRA layers.N)r  set_enabled)r   r  r   r)   r)   r*   set_lora_enabled  s   zVoxCPMModel.set_lora_enabledc                 C   s   |   D ]}|  qdS )zJReset all LoRA weights (A: kaiming, B: zeros), effectively unloading LoRA.N)r  reset_lora_parameters)r   r   r)   r)   r*   reset_lora_weights  s   
zVoxCPMModel.reset_lora_weightsc                 C   s   dd |   D S )z(Get all LoRA parameters (lora_A/lora_B).c                 S   s$   i | ]\}}d |v r||j  qS )lora_)r  r]  )r  r  r  r)   r)   r*   r    s
    
z3VoxCPMModel.get_lora_state_dict.<locals>.<dictcomp>)ry  r   r)   r)   r*   get_lora_state_dict  s   zVoxCPMModel.get_lora_state_dictr   )F)
r   r   r   r   r   r  Fr  r  F)r   r   r   r  Fr  r  F)r   r   r   r  Fr  )TFN)+r$   r%   r&   r-   r   r   rC   rk   r   rB   r   rl   TensorrV   r   r   r   r   r   inference_moderA   r'   r   r@  r   rI  r   rK  r   rL  r   rJ  rP  rQ  r*  classmethodr  r  r  r  r  r  __classcell__r)   r)   r   r*   rX   h   sD   U

`	
 3
% 
	
*}(	
&u93rX   )5__doc__ru  typingr   r   r   r   r   rl   torch.nnr   torch.nn.functionalr$  Fr   r  einopsr   pydanticr   safetensors.torchr	   r|  ImportErrorr
   transformersr   modules.audiovaer   r   modules.layersr   r   r   modules.locditr   r   r   modules.locencr   modules.minicpm4r   r   utilsr   r   r   r+   r-   rC   model_rebuildModulerX   r)   r)   r)   r*   <module>   s>    
