o
    پim.                    @   s  d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
mZ ddlZddlZddlm  mZ ddlm  m  mZ ddlZddlmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	l m!Z! dd
l"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z>m?Z? ddl@mAZA ddlBmCZC zddlmDZD ddlEmFZF dZGW n   dZDdZGY 					dGdejHdejHdejHdeId eIf
d!d"ZJeG d#d$ d$e'ZK	%	&	'		dHd(ejHd)eId*ejHd+eId,eId-eId.eId/eLd0ejHfd1d2ZMG d3d4 d4ejNZOG d5d6 d6ejNZPG d7d8 d8ejNZQG d9d: d:ejNZRG d;d< d<ZSG d=d> d>eZTG d?d@ d@ejNZUG dAdB dBe+ZVG dCdD dDejNZWG dEdF dFe>ZXeXgZYdS )IzCInference-only MiniCPM-o model compatible with HuggingFace weights.    N)	dataclass)AnyIterableListLiteralOptionalTupleUnion)nn)parametrizations)tqdm)LlamaConfig
LlamaModelPretrainedConfigPreTrainedModel)ACT2FN)DynamicCacheEncoderDecoderCache)BaseModelOutputWithPastModelOutput)WhisperAttentionWhisperConfigWhisperEncoder)QuantizationConfig))MultiModalityDataPaddingPatternTokenPairsgeneral_mm_embed_routine)MultimodalDataItemMultimodalInputsflatten_nested_list)ForwardBatch)set_default_torch_dtype)default_weight_loader)Idefics2VisionTransformer)MiniCPMBaseModelResampler2_5Qwen2ForCausalLM)logger)LogitsWarper)GroupedResidualFSQTF   	input_idsspk_embinput_embedsspk_emb_token_idnum_spk_embsc                 C   s|   | j d }t|D ]2}| | }|| }||k}	|	jdd}
|
j d |ks&J |
 }|
 }|||||d ddf< q	dS )a  
    Replace consecutive `num_spk_embs` speaker embedding placeholders in input_embeds with pre-prepared speaker embeddings. This is an in-place replacement, no new tensor is created, so no value is returned.

    Args:
        input_ids (torch.Tensor): Input ID tensor, shape [batch_size, seq_len_max]
        spk_emb (torch.Tensor): Speaker embedding tensor, shape [batch_size, num_spk_emb, hidden_dim]
        input_embeds (torch.Tensor): Input embedding tensor, shape [batch_size, seq_len_max, hidden_dim]
        spk_emb_token_id (int): ID of the speaker embedding token
        num_spk_embs (int): Number of speaker embeddings

    Returns:
        None
    r   F)as_tupler*   N)shaperangenonzerominmax)r+   r,   r-   r.   r/   
batch_sizeidx
input_ids_spk_emb_mask_nonzero_position_idx	begin_idxend_idx r>   N/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/minicpmo.pyapply_spk_embD   s   
r@   c                   @   sT   e Zd ZU dZdZejed< dZejed< dZ	e
eeej   ed< dZeed< dS )"ConditionalChatTTSGenerationOutputa  
    Output class for ConditionalChatTTS generation.

    Args:
        new_ids (torch.LongTensor): Newly generated audio code sequence, shape (batch_size, sequence_length, num_vq).
        audio_input_ids (torch.LongTensor): Updated input IDs including condition and generated audio codes, shape (batch_size, full_sequence_length, num_vq).
        past_key_values (Tuple[Tuple[torch.FloatTensor]]): Tuple containing pre-computed keys and values used for attention mechanism. Each element has shape (batch_size, num_heads, sequence_length, embed_size_per_head).
        finished (bool): Boolean indicating whether generation is complete.

    Nnew_idsaudio_input_idspast_key_valuesfinished)__name__
__module____qualname____doc__rB   torch
LongTensor__annotations__rC   rD   r   r   FloatTensorrE   boolr>   r>   r>   r?   rA   h   s   
 rA   ,  2   
   inputs_embedspast_seen_tokensstreaming_tts_text_maskstreaming_reserved_lengthstreaming_audio_chunk_sizestreaming_text_chunk_sizenum_spk_embuse_spk_embreturnc                 C   s   | j d dks	J | j}| j}	t|j}
tjd|| j d  fd||	d}tt|| | | |d ||  }|d ||  d }|
|d||f< |ddd||  | d f 	|dk|
 |
d
d}|S )a  
    In streaming audio generation, determine which `text` positions the TTS model can attend to when generating each chunk of `audio` tokens.

    This function creates a mask that allows the model to attend to a specific chunk of text
    tokens when generating each chunk of audio tokens, enabling streaming TTS generation.

    Args:
        inputs_embeds (torch.Tensor): Input embeddings tensor.
        past_seen_tokens (int): Number of tokens already seen by the model.
        streaming_tts_text_mask (torch.Tensor): Mask for the text tokens.
        streaming_reserved_length (int, optional): Number of reserved tokens for streaming. Defaults to 300.
        streaming_text_chunk_size (int, optional): Size of each text chunk. Defaults to 7.

    Returns:
        torch.Tensor: Causal mask for streaming TTS generation, shape is [batch_size=1, 1, seq_len=1, past_seen_tokens+1]

    Raises:
        AssertionError: If the batch size is not 1 (only supports batch size of 1 for inference).
    r   r*   )
fill_valuedtypedevice)r1   r\   r]   rJ   finfor4   fullmathceilmasked_fill_	unsqueeze)rR   rS   rT   rU   rV   rW   rX   rY   r\   r]   	min_dtypecausal_maskinvisible_text_tokens_startinvisible_text_tokens_endr>   r>   r?   $make_streaming_chunk_mask_generation{   sD   		rh   c                       sN   e Zd Z	ddededededef
 fddZdd
ejdejfddZ  Z	S )ConvNeXtBlockư>dimintermediate_dimkerneldilationlayer_scale_init_valuec                    s   t    tj|||||d  ||d| _tj|dd| _t||| _t	 | _
t||| _|dkrCtj|t| dd| _d S d | _d S )N   )kernel_sizepaddingrn   groupsrj   )epsr   T)requires_grad)super__init__r
   Conv1ddwconv	LayerNormnormLinearpwconv1GELUactpwconv2	ParameterrJ   onescoef)selfrk   rl   rm   rn   ro   	__class__r>   r?   rw      s$   
	
	
zConvNeXtBlock.__init__NxrZ   c                 C   sx   |}|  |}|dd | |}~| |}~| |}~| |}~| jd ur/|| j9 }|dd || }~|S )Nr*   rp   )ry   
transpose_r{   r}   r   r   r   )r   r   condresidualyr>   r>   r?   forward   s"   






zConvNeXtBlock.forward)rj   N)
rF   rG   rH   intfloatrw   rJ   Tensorr   __classcell__r>   r>   r   r?   ri      s     ri   c                       sL   e Zd Z						ddedef fd	d
ZddejdejfddZ  ZS )DVAEDecoder   @         rp   Fidimodimc	           	         sz   t    || _tt||dddt t|ddd| _t fddt	|D | _
tj|ddd| _d S )N   r*   c                    s   g | ]}t d   qS )   )ri   .0_rn   hiddenrm   r>   r?   
<listcomp>  s    z(DVAEDecoder.__init__.<locals>.<listcomp>F)rq   bias)rv   rw   upr
   
Sequentialrx   r~   conv_in
ModuleListr2   decoder_blockconv_out)	r   r   r   n_layerbn_dimr   rm   rn   r   r   r   r?   rw      s   
zDVAEDecoder.__init__Nr   rZ   c                 C   s2   |  |}~| jD ]}|||}q	| |}~|S r   )r   r   r   )r   r   conditioningr   fr>   r>   r?   r     s   


zDVAEDecoder.forward)r   r   r   r   rp   Fr   )	rF   rG   rH   r   rw   rJ   r   r   r   r>   r>   r   r?   r      s     r   c                	       sx   e Zd Z		ddedee dedef fddZd	ejfd
dZd	ejdejf fddZ	d	ejdejfddZ
  ZS )GFSQh㈵>Trk   levelsGRc                    sL   t t|   t|t|||d| _t|| _|| _	|| _
|| _|| _d S )N)rk   r   num_quantizersrs   )rv   r   rw   r)   list	quantizerr`   prodn_indrt   	transposer   r   )r   rk   r   r   r   rt   r   r   r>   r?   rw   +  s   	
zGFSQ.__init__r   c                 C   s`   | j r	| dd}||d|d| j| jdddd}| j|}| j r.|ddS |S )Nr*   rp   r   r   )	r   viewsizer   r   permuter   get_output_from_indicesr   )r   r   featr>   r>   r?   _embedA  s
   ,zGFSQ._embedrZ   c                    s   t  |S r   )rv   __call__)r   r   r   r>   r?   r   H  s   zGFSQ.__call__c                 C   sd   | j r	|dd | |\}}|dddd }||d|dd}| j r0|ddS |S )Nr*   rp   r   r   )r   r   r   r   
contiguousr   r   )r   r   r   indr>   r>   r?   r   K  s   zGFSQ.forward)r   T)rF   rG   rH   r   r   rw   rJ   r   r   r   r   r   r>   r>   r   r?   r   *  s    r   c                       sF   e Zd Z fddZe 	d
dejded dejfdd	Z  Z	S )DVAEc                    s   t    td}t|dd| _t	t
dddddt t
dddddt | _tddd	d
dd| _tddd	d
dd| _tj
ddddddd| _tddddd| _d S )Nd   r   rp   i   r   r*   r   i   r   r      )r   r   r   r   r   Fr   )   r   r   r   )rk   r   r   r   )rv   rw   rJ   randr
   r   rc   
unsqueeze_r   r   rx   r~   downsample_convr   encoderdecoderout_convr   vq_layer)r   r   r   r>   r?   rw   V  s<   

zDVAE.__init__decodeinpmode)encoder   rZ   c                 C   s   |dkr7t | dr7| jd ur7| }| tj|| jdd|j	|d
d}~| |}| |}~|S | jd urC| j|}n|}||dd|dd |dfddddd}| | j|d	}~tj|| j|dS )
Nr   r   r   r*   )outr   rp   r   )r   )hasattrr   cloner   rJ   divr   r   expandr1   r   r   r   r   r   flattenr   r   mul)r   r   r   melr   r   vq_featsdec_outr>   r>   r?   r   ~  s8    


 zDVAE.forward)r   )
rF   rG   rH   rw   rJ   inference_moder   r   r   r   r>   r>   r   r?   r   U  s    (r   c                   @   s>   e Zd ZdededefddZdejdejdejfd	d
Z	dS ),CustomRepetitionPenaltyLogitsProcessorRepeatpenaltymax_input_idspast_windowc                 C   s6   t |tr	|dkstd| || _|| _|| _d S )Nr   z6`penalty` has to be a strictly positive float, but is )
isinstancer   
ValueErrorr   r   r   )r   r   r   r   r>   r>   r?   rw     s   
z5CustomRepetitionPenaltyLogitsProcessorRepeat.__init__r+   scoresrZ   c           	      C   s   | d| jkr|d| j | j}t|| dd}| d| jkr6|d| j| d| j   t	| j
|}| }||}||}|dk }t|||}~~~~~|S )Nr*   r   )r   r   narrowFone_hotsumr   zero_rJ   powr   r   multiplydividewhere)	r   r+   r   freqalphar   othconr   r>   r>   r?   r     s    


z5CustomRepetitionPenaltyLogitsProcessorRepeat.__call__N)
rF   rG   rH   r   r   rw   rJ   rK   rM   r   r>   r>   r>   r?   r     s    
r   c                       sN  e Zd ZdZeZg Zdef fddZe	 	ddej
deej
 fdd	Ze	 	ddej
d
ejdeeej
ej
f  deej
 fddZe	 		ddej
deeej
ej
f  defddZe	 							d dej
deeej
ej
f  dej
deeej
f deee  deee  fddZe	 deej
 fddZ  ZS )!ConditionalChatTTSad  A conditional text-to-speech model that can generate speech from text with speaker conditioning.

    This model extends PreTrainedModel to provide text-to-speech capabilities with:
    - LLM hidden state conditioning
    - Streaming generation

    The model uses a transformer architecture with LLM hidden states and can operate in both
    streaming and non-streaming modes for flexible deployment.

    The model process sequence in the following format:
    | text bos token | LLM embedding projected to tts embedding space | text tokens (fixed length, reserved for future tokens) | audio bos token | audio tokens (audio token length is not fixed)| audio eos token |

    The format is designed to support LLM-conditioned streaming audio generation.

    Usage:
    To support streaming generation, two global variables should be maintained outside of the model.
        1. `audio_input_ids`: stores *discrete* audio codes. It is a tensor with shape [1, sequence length+1, num_vq].
        2. `past_key_values`: stores the KV cache for both text tokens and audio codes. It is a list of tuples, each tuple contains two tensors with shape [1, num_attention_heads, sequence length, hidden_size // num_attention_heads]

    where `num_vq` is the number of audio codebooks, in default setting, it is `4`.

    1. Create an empty `past_key_values` with
    ```python
    initial_kv_cache_length = 1 + model.num_spk_embs + model.streaming_text_reserved_len # where `1` denotes the `bos` token
    dtype = model.emb_text.weight.dtype
    device = model.emb_text.weight.device
    past_key_values = [
        (
            torch.zeros(1, model.config.num_attention_heads, initial_kv_cache_length, model.config.hidden_size // model.config.num_attention_heads, dtype=dtype, device=device),
            torch.zeros(1, model.config.num_attention_heads, initial_kv_cache_length, model.config.hidden_size // model.config.num_attention_heads, dtype=dtype, device=device)
        )
        for _ in range(model.config.num_hidden_layers)
    ]

    2. At the same time, create an empty `audio_input_ids` with shape [1, sequence length, num_vq], `num_vq` denotes multiple layer audio codebooks. But here we also include text tokens in the sequence, but they will be zeros, and will not be used, just a placeholder.

    ```python
    initial_audio_input_ids_length = 1 + model.num_spk_embs + model.streaming_text_reserved_len + 1
    # [bos token, speaker embeddings, text tokens, audio bos token]
    audio_input_ids = torch.zeros(batch_size=1, initial_audio_input_ids_length, model.num_vq)
    ```

    2. Prefill some text tokens to TTS model (for example, 10 tokens) using `prefill_text` method.

    ```python
    outputs = llm.generate(**kwargs)
    llm_tokens = some_function_to_extract_llm_tokens(outputs)
    lm_spk_emb_last_hidden_states = some_function_to_extract_lm_spk_emb_last_hidden_states(outputs)
    tts_text_input_ids = tts_tokenizer.encode(llm_tokenizer.decode(llm_tokens))
    # here assume we are prefilling text token 0 to text token 9 (included), totally 10 tokens.
    begin = 0
    end = 9+1
    position_ids = torch.arange(begin, end, dtype=torch.long, device=device)

    past_key_values = model.prefill_text(
        input_ids=tts_text_input_ids,
        position_ids=position_ids,
        past_key_values=past_key_values,
        lm_spk_emb_last_hidden_states=lm_spk_emb_last_hidden_states,
    )
    ```

    3. Make a `streaming_tts_text_mask` to denote which position contains valid text tokens, similar to `attention_mask` in standard causal attention.

    ```python
    streaming_tts_text_mask = torch.zeros(model.streaming_reserved_length)
    streaming_tts_text_mask[0:end] = 1 # denotes these post
    ```

    3. Generate audio codes using `generate` method.

    ```python
    outputs = model.generate(
        input_ids=audio_input_ids,
        past_key_values=past_key_values,
        streaming_tts_text_mask=streaming_tts_text_mask,
        max_new_token=50,
    )

    # update past_key_values and input_ids
    past_key_values = outputs.past_key_values
    audio_input_ids = outputs.input_ids
    ```

    The `past_key_values` is extended by `max_new_token=50`, and `audio_input_ids` is also extended by `max_new_token=50` after `generate` calling.

    4. Notice that after prefilling `10` text tokens, the model can generate up to `50` audio tokens, if you want to generate more audio tokens, you need to prefill next `10` text tokens. And it is okay to only generate `25` audio tokens for faster initial response.

    5. Repeat steps `2,3,4` as needed in your streaming audio generation cases, but ensure usage complies with the following guidelines discussed above.
    configc                    sL  t     j| _ j| _ j| _ j| _ j| _ j| _ j| _ j	| _	 j
| _
 j| _ j| _ j| _ j| _ j| _ j| _ j| _| jjrSt j j| _ntj j jdd| _t fddt jD | _t j j| _t fddt jD | _ t! }|| _"t# j j$ j% j& j' j(d}t)|}|| _*d S )NFr   c                    s   g | ]
}t  j jqS r>   )r
   	Embeddingnum_audio_tokenshidden_sizer   r   r>   r?   r   ?  s    z/ConditionalChatTTS.__init__.<locals>.<listcomp>c                    s*   g | ]}t jtj j jd dddqS )Fr   weight)name)r   weight_normr
   r|   r   r   r   r   r>   r?   r   F  s    )r   intermediate_sizenum_attention_headsnum_hidden_layersmax_position_embeddingsattn_implementation)+rv   rw   use_speaker_embeddinguse_llm_hidden_stater/   r.   use_text	streamingrW   rV   streaming_text_reserved_lenaudio_bos_token_idnum_mel_binsnum_vqr   top_ptop_krepetition_penaltyr   use_mlpMultiModalProjectorllm_dimr   	projectorr
   r|   r   r2   emb_coder   num_text_tokensemb_text	head_coder   dvaer   r   r   r   r   r   r   model)r   r   r  model_configr  r   r   r?   rw   $  sV   


	
zConditionalChatTTS.__init__Nr+   lm_spk_emb_last_hidden_statesc                 C   s   |j d dks	J | |}| jrC|| jk}| rA|dus J || jjjj	}| |}t
j|ddd}t|||| j| jd |S t)a  Merge `input_ids` and `lm_spk_emb_last_hidden_states` to `inputs_embeds`.

        Args:
            input_ids (torch.Tensor): Input token IDs.
            lm_spk_emb_last_hidden_states (Optional[torch.Tensor], optional): Last hidden states of speaker embeddings from the language model. Defaults to None.

        Raises:
            NotImplementedError: If speaker embedding is not used and language model hidden states are not implemented.

        Returns:
            torch.Tensor: Prepared input embeddings for the model.
        r   r*   Nrp   r   )prk   )r+   r,   r-   r.   r/   )r1   r  r   r.   anytor  linear1r   r\   r   	normalizer@   r/   NotImplementedError)r   r+   r  rR   spk_emb_maskprojected_spk_embr>   r>   r?   merge_inputs_embeds^  s(   




z&ConditionalChatTTS.merge_inputs_embedsposition_idsrD   c                 C   s  |j d dks	J |dusJ | j||d}g }tt|D ]=}||| d ddddd|dddf ddf  || d ddddd|dddf ddf  f q| jd|||dd|d}|j}	tt|D ]}
|	|
 d dddd|dddf |dddf d f  ||
 d dddd|dddf |dddf d ddf< |	|
 d dddd|dddf |dddf d f  ||
 d dddd|dddf |dddf d ddf< qq|S )	a  Prefill a chunk of new text tokens in streaming setting.
        Specifically speaking, update `past_key_values` using new text tokens, then the model will read the new text tokens.

        Args:
            input_ids (Tensor): Tensor of shape [batch_size, seq_len]
            position_ids (LongTensor): Tensor of shape [batch_size, seq_len]
            past_key_values (List[Tuple[Tensor]]): KV Cache of all layers, each layer is a tuple (Tensor, Tensor) denoting keys and values. Each tensor is of seq_len = `self.streaming_text_reserved_len`. `past_key_values` will be updated.
            lm_spk_emb_last_hidden_states (Tensor, optional): Tensor of shape [batch_size, num_spk_emb, llm_dim]. Defaults to None.

        Note that all `batch_size` should be `1`.
        r   r*   N)r+   r  TFattention_maskr  rD   rR   	use_cacheoutput_attentionscache_positionr   )r1   r  r2   lenappendr   r  rD   )r   r+   r  rD   r  rR   past_key_values_for_prefillioutputs_prefill#past_key_values_for_prefill_updated	layer_idxr>   r>   r?   prefill_text  sN   66
0
6
0
6	zConditionalChatTTS.prefill_textTadd_audio_bosc              	      s   j d dks	J |dusJ  fddtjD }t|dd} j d }|rKtjjggtjj	d}
|}	tj|	|gdd}|d7 }|d d j d	 }
tj|
|
| tjj	dd}| }t||d d j d	 |jjd
}j||||dd|d}|j}|S )a  Prefill a chunk of audio ids to the model. Used in sliding-window long audio generation.
        Specifically, prefill many audio ids (typically from last window) to the model in the new window.

        Args:
            input_ids (torch.Tensor): (1, seq_len, num_vq) Audio input token ids.
            past_key_values (List[Tuple[torch.Tensor, torch.Tensor]]): Past key values for attention mechanism.
        r   r*   Nc                    ,   g | ]}j |  d d d d |f qS r   r  r   r'  r+   r   r>   r?   r     s   , z8ConditionalChatTTS.prefill_audio_ids.<locals>.<listcomp>r   r\   r]   rk   rp   rR   rS   rT   rU   rW   TFr  )r1   r2   r  rJ   stackr   tensorr  longr]   r  catarangerc   r   rh   r  rW   r  rD   )r   r+   rD   rT   r,  code_embrR   	input_lennarrowed_input_idsbos_inputs_embedspast_key_values_lengthr  r#  re   outputsr>   r0  r?   prefill_audio_ids  sP   

		z$ConditionalChatTTS.prefill_audio_idsFrQ   rP   temperature	eos_tokenlogits_warperslogits_processorsc           &   
      s  |j d dks	J |dusJ |	pg }	|
pg }
djj  j d }tj|j d |jd }|d	|j d d
 dd}|j d }tj|j d || |j d |j|jd}|dd|| ~|dd|}d}|ryt|dd	d
}djj  j d }t|D ]}d}||krd}||d d j d d ksJ |rtjjggtjjd  } n#|jd|j d d dd  fddtjD }t|dd}tj|d d j d gtjjdd}| }t||d d j d |jjd}j||||dd|d}~~~~|j}|j}t 4 tj |!d|!dj"jtj#jd}tjD ]}j$| |}||d|f< ~q=W d   n	1 sZw   Y  ~|ddd%d# }|&ddd}|'d|!d}|d||!d| &ddd}|'|!d|!d d(j}~|| }|s|
D ]} | ||}q|s|	D ]}!|!||}q~||k rtj) |dd|f< |rtj) |dd|f< t*j+|dd}"~tj,|"dd}#~"|#dj}#|#-|.d}$|/|$ ~$|d|d|#0d |dkr|. r n~#|d7 }|dd|}|1 r. n|dur8|2d q|durB|3  |1 sR|rRt45d|  ~|1 rf|dd|dddf }%n|dd|dddf }%t6|%|||1 dS )a  Generate audio codes in streaming setting or non-streaming setting.
        Specifically speaking, generate audio codes when not all text tokens are prefilled.

        Always pass a valid `past_key_values` to the method. The method does not do `prefill` by itself. It relies on `prefill_text` method to provide valid `past_key_values`. Please refer to docstring of this class for more details.

        In this method, we borrowed a lot of codes from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/gpt.py`.

        Args:
            input_ids (torch.Tensor): Input token ids.
            past_key_values (List[Tuple[torch.Tensor, torch.Tensor]]): Past key values for attention mechanism.
            temperature (torch.Tensor): Temperature for sampling.
            eos_token (Union[int, torch.Tensor]): End of sequence token.
            streaming_tts_text_mask (Optional[torch.Tensor], optional): Mask for streaming TTS text. Defaults to None.
            max_new_token (int, optional): Maximum number of new tokens to generate. Defaults to 50.
            logits_warpers (List[LogitsWarper], optional): List of logits warpers. Defaults to [].
            logits_processors (List[CustomRepetitionPenaltyLogitsProcessorRepeat], optional): List of logits processors. Defaults to [].
            show_tqdm (bool, optional): Whether to show progress bar. Defaults to True.

        Returns:
            GenerationOutputs: Generation outputs.
        r   r*   Nr]   r   rp   r1  codezG{l_bar}{bar}| {n_fmt}/{total_fmt}(max) [{elapsed}, {rate_fmt}{postfix}])totaldesc
bar_formatFT)rk   startlengthc                    r-  r   r.  r/  r;  r   r>   r?   r     s    z/ConditionalChatTTS.generate.<locals>.<listcomp>r   r3  r  .r2  )num_samplesz&incomplete result. hit max_new_token: )rB   rC   rD   rE   )7r1   r/   r   r  rJ   zerosr]   rN   rc   r   r   r   r\   r   copy_r   r2   r5  r  r6  r  r  r4  r   r   rh   rW   r  last_hidden_staterD   Pcachedemptyr   r   r   r  squeeze_r   reshaper  infr   softmaxmultinomialeqr  logical_or_r   allupdatecloser'   inforA   )&r   r+   rD   r@  rA  rT   force_no_stopmin_new_tokenmax_new_tokenrB  rC  	show_tqdm	start_idxfinishprogressinput_ids_bufpbarcondition_lengthr'  	audio_bosrR   r9  r  r#  re   r>  hidden_stateslogitsnum_vq_iterr   input_ids_slicedlogits_tokenlogitsProcessorslogitsWarpersr   idx_next	finish_orgenrated_input_idsr>   rK  r?   generate  sZ  (




	










zConditionalChatTTS.generateresult_listc           	      C   s   | j }d}t|dkrtjg tjdS |D ]}|d|kr#|d}qtjt||d d|f|d j|d j	d}t
t|D ]}|| }|| dd|d|dd ~qB||}~|S )a-  Decode discrete audio codes to mel spectrograms.

        Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/core.py`

        Args:
            result_list (List[torch.Tensor]): Audio codes output from `generate`.

        Returns:
            torch.Tensor: Mel spectrograms.
        r   r   r\   r*   r1  )r  r$  nparrayfloat32r   rJ   rM  r\   r]   r2   r   rN  r   )	r   rt  r   	max_x_lenresultbatch_resultr'  src	mel_specsr>   r>   r?   decode_to_mel_specs  s(   
&z&ConditionalChatTTS.decode_to_mel_specsr   )NT)NFrQ   rP   NNF)rF   rG   rH   rI   r   config_class_no_split_modulesrw   rJ   r   r   r   r  rK   r   r   r+  rN   r?  r	   r   r(   r   rs  r~  r   r>   r>   r   r?   r     s|    [:-G;

  r   c                       sf   e Zd Zddedef fddZ			ddejdejd	ejd
ede	e
 de	e dejfddZ  ZS )MiniCPMWhisperEncoderLayerNr   r*  c                    s   t    |j| _t| j|j|j||d| _t	| j| _
|j| _t|j | _|j| _t| j|j| _t|j| j| _t	| j| _d S )N)	embed_dim	num_headsdropoutr   r*  )rv   rw   d_modelr  r   encoder_attention_headsattention_dropout	self_attnr
   rz   self_attn_layer_normr  r   activation_functionactivation_fnactivation_dropoutr|   encoder_ffn_dimfc1fc2final_layer_norm)r   r   r*  r   r>   r?   rw   D  s    
z#MiniCPMWhisperEncoderLayer.__init__Fri  r   layer_head_maskr"  rD   r!  rZ   c                 C   s  |}|  |}| j|||||d\}}tjj|| jdd}|| }|}| |}| | |}tjj|| jdd}| 	|}tjj|| jdd}|| }|j
tjkrst| sat| rst|j
jd }	tj||	 |	d}|f}
|r}|
|f7 }
|r|
|f7 }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, embed_dim)`):
                Hidden states to be fed into the encoder layer.
            attention_mask (`torch.FloatTensor` of shape `(batch_size, 1, tgt_len, src_len)`):
                Attention mask where padding elements are indicated by large negative values.
            layer_head_mask (`torch.FloatTensor` of shape `(encoder_attention_heads,)`):
                Mask to nullify selected heads of the attention modules.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attention weights.
            past_key_values (`EncoderDecoderCache`, *optional*):
                Past key-value pairs used for incremental decoding.
            use_cache (`bool`, *optional*):
                Whether or not to return updated `past_key_values` for caching.

        Returns:
            A tuple of shape `(hidden_states, optional(attn_weights), optional(past_key_values))`.
        )ri  r   r  r"  past_key_valueFr  trainingi  )r4   r5   )r  r  r
   
functionalr  r  r  r  r  r  r\   rJ   float16isinfr  isnanr^   r5   clamp)r   ri  r   r  r"  rD   r!  r   attn_weightsclamp_valuer>  r>   r>   r?   r   V  sN   





z"MiniCPMWhisperEncoderLayer.forwardr   )FNF)rF   rG   rH   r   r   rw   rJ   r   rN   r   r   r   r   r>   r>   r   r?   r  C  s(    r  c                       sL   e Zd Zdef fddZ							d	dee dee fddZ  Z	S )
MiniCPMWhisperEncoderr   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  |d qS ))r*  )r  r/  r   r>   r?   r     s    
z2MiniCPMWhisperEncoder.__init__.<locals>.<listcomp>)rv   rw   r
   r   r2   encoder_layerslayers)r   r   r   r   r?   rw     s   

zMiniCPMWhisperEncoder.__init__NrD   r!  c	              	   C   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|j| jjj| jjjd}t	j
| |}	t	j
| |	}	|	ddd}	| jj}
d}|r|du rXtt t }nt|trgtt|t }nt|trst|t }n	 |j|	jd }|	jd | |
jd krtd |
|dddf }t|tj|
dddf d|	jd |
jd  | ddf}
n|
||	jd | ddf }
n|
d|	jd ddf }
|	|
 }t	j
j|| jd	d
}|rdnd}|rdnd}|dur| d t| j ksJ dt| j  d| d  dt!| j D ]E\}}|r$||f }d	}|r,d}n||||dur8|| nd|||d}|d }|rP||rLdnd }nd}|r\||d f }q| "|}|rk||f }|szt#dd |||fD S t$||||dS )a  
        Forward pass of the Whisper encoder.

        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of log-mel features extracted from the raw audio waveform. Typically generated
                by a feature extractor (e.g., `WhisperFeatureExtractor`) that processes `.flac` or `.wav`
                files into padded 2D mel spectrogram frames. These features are projected via convolution layers
                (`conv1` and `conv2`) and then transformed into embeddings for the encoder.

            attention_mask (`torch.Tensor`, *optional*):
                Not used by Whisper for masking `input_features`, but included for API compatibility with
                other models. If provided, it is simply ignored within the model. By default, Whisper
                effectively ignores silence in the input log-mel spectrogram.

            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected attention heads. The elements should be either 1 or 0, where:
                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked** (i.e., the attention head is dropped).

            output_attentions (`bool`, *optional*):
                Whether or not to return the attention tensors of all encoder layers. If set to `True`, the
                returned tuple (or `BaseModelOutputWithPast`) will contain an additional element with
                attention weights for each encoder layer.

            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. If set to `True`, the returned
                tuple (or `BaseModelOutputWithPast`) will contain a tuple of hidden states, including the
                initial embedding output as well as the outputs of each layer.

            return_dict (`bool`, *optional*):
                Whether or not to return a `BaseModelOutputWithPast` (a subclass of `ModelOutput`) instead
                of a plain tuple. If set to `True`, the output will be a `BaseModelOutputWithPast` object,
                otherwise it will be a tuple.

            past_key_values (`EncoderDecoderCache`, *optional*):
                When using caching for faster inference, this is an object that stores the key-value pairs
                for attention states. If provided, the model will append new states to the existing cache
                and return the updated cache. This speeds up sequential decoding or chunked inference.

                - If `past_key_values` is `None`, no past states are used or returned.
                - If `past_key_values` is not `None` and `use_cache=True`, the model will use the provided
                cache and return the updated cache (as `next_encoder_cache`).

            use_cache (`bool`, *optional*):
                Whether or not the model should use caching (`past_key_values`) to speed up processing
                during inference. When set to `True`, the model will:
                - Inspect and use `past_key_values` if provided.
                - Return updated `past_key_values` (under the name `next_encoder_cache` in
                    `BaseModelOutputWithPast`).

        Returns:
            `BaseModelOutputWithPast` or `tuple` (depending on `return_dict`):
                If `return_dict=True`, a `BaseModelOutputWithPast` is returned, which contains:
                - **last_hidden_state** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                The output of the final encoder layer.
                - **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned if `output_hidden_states=True`):
                Hidden states of the model at each layer (including the initial projection).
                - **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned if `output_attentions=True`):
                Attention weights from each encoder layer.
                - **past_key_values** (an object of type `EncoderDecoderCache` or `None`, *optional*):
                Updated cache of key-value pairs if `use_cache=True`.

                If `return_dict=False`, a tuple is returned, where the format is:
                `(last_hidden_state, hidden_states, attentions)`, with `hidden_states` and `attentions`
                only present if their respective `output_*` arguments are set to `True`.

        Nr1  r   rp   r*   zHseems the audio is longer than 30s. repeating the last part of the audior   r2  Fr  r>   z&The head_mask should be specified for z layers, but it is for .)NN)r  r"  rD   r!  c                 s   s    | ]	}|d ur|V  qd S r   r>   )r   vr>   r>   r?   	<genexpr>p  s    z0MiniCPMWhisperEncoder.forward.<locals>.<genexpr>)rO  ri  
attentionsrD   )%r   r"  output_hidden_statesuse_return_dictr  conv1r   r\   r]   r
   r  geluconv2r   embed_positionsr   r   r   r   from_legacy_cacheself_attention_cacheget_usable_lengthr1   r'   warningrJ   r7  repeat_interleaverc   r  r   r$  r  	enumerate
layer_normtupler   )r   input_featuresr   	head_maskr"  r  return_dictrD   r!  rR   	embed_posr=  embed_pos_frontri  encoder_statesall_attentionsr7   encoder_layerto_droplayer_outputsnext_encoder_cacher>   r>   r?   r     s   Q





	

zMiniCPMWhisperEncoder.forward)NNNNNNN)
rF   rG   rH   r   rw   r   r   rN   r   r   r>   r>   r   r?   r    s    	r  c                       s$   e Zd Z fddZdd Z  ZS )r
  c                    s<   t    tj||dd| _t | _tj||dd| _d S )NT)in_featuresout_featuresr   )rv   rw   r
   r|   r  ReLUrelulinear2)r   in_dimout_dimr   r>   r?   rw   ~  s   

zMultiModalProjector.__init__c                 C   s   |  | |}| |}|S r   )r  r  r  )r   audio_featuresri  r>   r>   r?   r     s   
zMultiModalProjector.forward)rF   rG   rH   rw   r   r   r>   r>   r   r?   r
  }  s    r
  c                       s  e Zd Z	d:dedee ddf fddZdd Zd	d
 Z		d;dedee de	de
jfddZ	d<dedee de	fddZ		d;dededee de	de
jf
ddZdee defddZdejfddZdee fddZded d!fd"ed#ed$ed%ejd&edejfd'd(Zd=dee fd)d*Zdee dejfd+d,Z		-d>dee fd.d/Zdee dejfd0d1Zdejd2ejd3ed4e dejf
d5d6Z!d7e"e#e	ejf  fd8d9Z$  Z%S )?MiniCPMONr   quant_configrZ   c                    s   t  j||d | j||d| _| jjj| _| jjr0| j||d| _	| j	j| _
| | j| j
| _d| j_| jjr]|  | _t| jjjd }tj| jj| jjd| _t|| jd| _d| _d| j_td | jjrwtspJ d	|  | _d S d S )
N)r   r  Tr   )stride)r  r  r   FzTTS is disabled for nowzAplease make sure vector_quantize_pytorch and vocos are installed.)rv   rw   init_llmllmr   r   r  init_visioninit_vision_modulevpm
vision_diminit_resampler	resampler
init_audioinit_audio_moduleapmr   r  r
   	AvgPool1daudio_pool_stepaudio_avg_poolerr
  audio_projection_layeraudio_encoder_layerinit_ttsr'   r]  	_tts_depsinit_tts_moduletts)r   r   r  audio_output_dimr   r>   r?   rw     s6   


zMiniCPMO.__init__c                 C      t | jj}|S r   )r   r   
tts_configr   r  r>   r>   r?   r       zMiniCPMO.init_tts_modulec                 C   r  r   )r  r   audio_configr  r>   r>   r?   r    r  zMiniCPMO.init_audio_module prefixc                 C   s   t |||dS )Nr   r  r  r%   )r   r   r  r  r>   r>   r?   r    s   zMiniCPMO.init_llmc                 C   sr   | j jdkrd| j j_nd| j j_t|j||d}| j jr'|jjd d |j_t|d|jj	 t|d|jj
 |S )Nflash_attention_2eagerr  r   r  
patch_size)r   _attn_implementationvision_configr"   drop_vision_last_layerr   r  setattr
embeddingsr  r  )r   r   r  r  r  r>   r>   r?   r    s   
zMiniCPMO.init_vision_moduler  r  c              	   C   sX   t tj t| jj||d |||d}W d    n1 sw   Y  |jdt dS )Nr   )num_queriesr  r  kv_dimr  r  cudar]   r\   )r    rJ   r  r$   r   	query_numr  get_default_dtype)r   r  r  r  r  r  r>   r>   r?   r    s   zMiniCPMO.init_resamplerr+   mm_inputc           
      C   sT   |j }|j}|j}|j}||f||f|j|jfg}||jg}t||d}	|	||S )N)data_token_pairsdata_start_token_ids)im_start_id	im_end_idslice_start_idslice_end_idaudio_start_idaudio_end_idr   pad_input_tokens)
r   r+   r  r  r  r  r  r  r  patternr>   r>   r?   pad_input_ids  s   

zMiniCPMO.pad_input_idsinput_lengthsc                 C   s>   |d d d }|| j j | j j d }|jtjd}||fS )zs
        Computes the output length of the convolutional layers and the output length of the audio encoder
        r*   rp   ru  )r   r  r  rJ   int32)r   r  input_lengths_after_cnninput_lengths_after_poolingr>   r>   r?    _get_feat_extract_output_lengths  s   
z)MiniCPMO._get_feat_extract_output_lengthsitemsc              	   C   s  t dd |D }t dd |D }t|dkrt|}|j\}}}|dks)J |d d d }| jdur_| jd d jd }	| jjjjd }
|	| |
kr_t	
d|	|  d	|
 d
 d| _| j|| jdd}|j}|j| _| |}|dd}| |}|dd}| |\}}|}g }d}tt|D ](}g }tt|| D ]}|||d|| ddf  |d7 }q|| q|S g S )a  
        Extract audio embeddings in a streaming manner using cached key-value pairs.

        This method processes incoming audio features incrementally and stores/updates `past_key_values`
        for faster inference on subsequent audio frames. It only supports batch_size=1 and is intended
        for streaming scenarios.

        Returns:
            List[List[torch.Tensor]]: audio embeddings
        c                 S      g | ]}|j r|j qS r>   featurer   itemr>   r>   r?   r         z:MiniCPMO.get_audio_embedding_streaming.<locals>.<listcomp>c                 S   r  r>   audio_feature_lensr  r>   r>   r?   r     r	  r   r*   rp   Nzaudio_past_key_values length z exceed z, reset.T)rD   r!  )r   r$  rJ   hstackr1   audio_past_key_valuesr  r  r   r'   r  rO  rD   r  r   r  r  r2   r%  )r   r  wavformsaudio_feature_lens_rawr  r6   r   max_mel_seq_lenmax_seq_lencache_lengthapm_max_lenaudio_outputsaudio_statesaudio_embedsfeature_lens_after_poolingr   final_audio_embedsr7   r'  target_audio_embedsr>   r>   r?   get_audio_embedding_streaming  sV   




z&MiniCPMO.get_audio_embedding_streamingr   cpur   r   
chunk_sizenum_left_chunksr]   num_lookheadc           
      C   st   t j|||t jd}t|D ])}|dk rd}nt|| | | d}t|| d | | |}	d||||	f< q|S )a  Create mask for subsequent steps (size, size) with chunk size,
        this is for streaming encoder

        Args:
            size (int): size of mask
            chunk_size (int): size of chunk
            num_left_chunks (int): number of left chunks
                <0: use full chunk
                >=0: use num_left_chunks
            device (torch.device): "cpu" or "cuda" or torch.Tensor.device

        Returns:
            torch.Tensor: mask

        r  r   r*   T)rJ   rM  rN   r2   r5   r4   )
r   r   r  r  r]   r  retr'  rI  endingr>   r>   r?   subsequent_chunk_maskP  s   zMiniCPMO.subsequent_chunk_maskc              
   C   s  t dd |D }t dd |D }|rIt|d tjr$dd |D }n%t|d trIg }|D ]}t|tr<|| q/|| q/dd |D }g }t|tsRJ t|d tjs\J |D ]}t|dkrNg }	|D ]}t|trx|	| qk|	| qkt|	}
|j	\}}}|d d d }tj
d||
j|
jd	d||}|
d||}||k}||dd||d||}|j| jjjj| jjjjd	}|dkrt|d
 }| j||d|jd}t|t|}td||< | j|d|dj| j }| |}|dd}| |}|dd}| |
\}}|}d}tt|D ]*}g }tt|| D ]}|||d|| ddf  |d7 }q/|| q#|  S dS )as  
        Extract full audio embeddings with optional chunk-based attention.

        This method computes embeddings for all audio frames at once, either using full attention (when
        `chunk_length` is -1) or chunk-based attention (when `chunk_length` is a positive number). It does
        not use key-value caching and is suitable for non-streaming inference.

        Args:
            chunk_length (int, optional): Determines whether to use full attention (-1) or chunk-based
                attention (>0) during embedding computation.

        Returns:
            List[List[torch.Tensor]]: audio embeddings
        c                 S   r  r>   r  r  r>   r>   r?   r     r	  z0MiniCPMO.get_audio_embedding.<locals>.<listcomp>c                 S   r  r>   r
  r  r>   r>   r?   r     r	  r   c                 S   s   g | ]}|gqS r>   r>   )r   lensr>   r>   r?   r         c                 S   s    g | ]}t |ts|gn|qS r>   )r   r   r  r>   r>   r?   r         r*   rp   r1  rP   r   )r   r  r  r]   z-infT)r  r   N) r   r   rJ   r   r   extendr%  r$  r  r1   r8  r\   r]   rc   r   r   r  r  r  r   r   r!  
logical_orlogical_notr   ri  r  r  r   r  r  r2   )r   r  chunk_lengthr  r  	flattenedr  r  wavformflattened_lensr  r6   r   r  r  	seq_rangelengths_expandpadding_maskaudio_attention_mask_audio_attention_maskchunk_num_frame
chunk_maskr  r  r  r   r7   r'  r  r>   r>   r?   get_audio_embeddingq  s   









zMiniCPMO.get_audio_embeddingc                 C   s   | j || jjdd}|S )NF)r  r(  stream_input)get_omni_embeddingr   audio_chunk_length)r   r  	embeddingr>   r>   r?   get_audio_feature  s   zMiniCPMO.get_audio_featureFc                 C   s:   |r|  |}n| ||}t|}tjt|dd}|S )z
        Args:
            chunk_length: whisper use full attention or chunk attention
            stream_input: use streaming audio embedding
        Returns:
            final embeddings with audio feature
        r   r2  )r  r3  r$  rJ   r7  r   )r   r  r(  r4  audio_embeddingsbs
audio_embsr>   r>   r?   r5    s   zMiniCPMO.get_omni_embeddingc                 C   s  t dd |D }tjt dd |D dd}t||jd ks"J | jjjjj	}| jjjjj
}dd |D }|d d df |d d df    }t|tsRJ tjjjj|dd	d
}|j\}	}
}|ddd|	dd|
}tj|	d|ftj|d}| j|j	d}|d d df |d d df  }tj|d|j	dd|dk |d d dd d f< | j||||d}| ||S )Nc                 S      g | ]}|j qS r>   r  r  r>   r>   r?   r     r#  z.MiniCPMO.get_image_feature.<locals>.<listcomp>c                 S   r<  r>   )tgt_sizer  r>   r>   r?   r     r#  r   r2  c                 S   s    g | ]}|j d dd dqS )r*   )end_dimr   )r   r   r/  r>   r>   r?   r     r$  r*   Tg        )batch_firstpadding_valuerp   r   r   r1  rD  )patch_attention_mask	tgt_sizes)r   rJ   r4  r$  r1   r  r  position_embeddingr   r]   r\   r5   r  r   r   r
   utilsrnnpad_sequencer   rT  rM  rN   r   r  r8  r   rc   typer  )r   r  pixel_valuesrB  r]   r\   all_pixel_values_lstmax_patchesall_pixel_valuesBLr   patch_attn_masktgt_sizes_tensormask_shapesvision_embeddingr>   r>   r?   get_image_feature  sB   (
 zMiniCPMO.get_image_feature	positionsforward_batchkwargsc                 K   s   t ||| j| |d}|S )N)r+   rT  language_modelmultimodal_modelrS  )r   r  )r   r+   rS  rT  rU  ri  r>   r>   r?   r   4  s   zMiniCPMO.forwardweightsc                 C   s  g d}t |  }|D ]\}}d|v sd|v rqd|v s!d|v r"q| jjrSd|v rSd|v s2d|v r?|dd	}|dd
}nd|v rS||vrS|dd	}||v rS|}d|v r]|dd}| jjsfd|v rfq| jjssd|v srd|v rsq| jjs|d|v r|qd|v sd|v sd|v rd|v sd|v rd|v r|| }t|dt}||| q|D ](\}}	}
|	|vrq||	|}|	dr||vrq|| }|j
}||||
  n|	dr||vrq|| }t|dt}||| qd S )N))qkv_projq_projq)rY  k_projk)rY  v_projr  )gate_up_proj	gate_projr   )r_  up_projr*   zrotary_emb.inv_freq~r  zrotary_emb.cos_cachedzrotary_emb.sin_cachedr  z	.weight_gz	.weight_vz".parametrizations.weight.original0z".parametrizations.weight.original1z.weightr  zself_attn.out_projzself_attn.projr  audiosamplerr  ztts.model.layersz.mlpweight_loaderz.bias)dictnamed_parametersr   r  replacer  r  getattrr!   endswithrd  )r   rX  stacked_params_mappingparams_dictr   loaded_weight
param_nameparamrd  weight_nameshard_idr>   r>   r?   load_weightsE  sl   	

zMiniCPMO.load_weightsr   )Nr  )r  )r   )r   F)&rF   rG   rH   r   r   r   rw   r  r  strr
   Moduler  r  r   r  r   r   r  rJ   rK   r  r   r  r]   r   r!  r3  r8  r5  rR  r   r   r   r   r   rq  r   r>   r>   r   r?   r    s    *


G
!|
'
$r  )NNNr   r*   )rO   rP   rQ   r*   T)ZrI   r`   dataclassesr   typingr   r   r   r   r   r   r	   numpyrv  rJ   torch.nn.functionalr
   r  r   torch.nn.utils.parametrizerD  parametrizerP  torch.typestorch.nn.utilsr   r   transformersr   r   r   r   transformers.activationsr   transformers.cache_utilsr   r   transformers.modeling_outputsr   r   ,transformers.models.whisper.modeling_whisperr   r   r   sglang.srt.layers.quantizationr   sglang.srt.managers.mm_utilsr   r   "sglang.srt.managers.schedule_batchr   r   r   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.model_loader.utilsr    $sglang.srt.model_loader.weight_utilsr!   sglang.srt.models.idefics2r"   sglang.srt.models.minicpmvr#   r$   sglang.srt.models.qwen2r&   sglang.srt.utilsr'   r(   vector_quantize_pytorchr)   r  r   r   r@   rA   rN   rh   rs  ri   r   r   r   r   r   r  r  r
  r  
EntryClassr>   r>   r>   r?   <module>   s   $
$	
L6-+Q     _ \    
