o
    ci!                     @   sF   d Z ddlZddlmZ ddlmZmZ ddlmZ G dd dZ	dS )	a?  
CUDA graph capture for the code predictor's 15-step decode loop,
using transformers StaticCache.

The predictor generates 15 codebooks autoregressively:
- Step 0: prefill with 2 tokens (past_hidden + first_codebook_embed), get logits[0]
- Steps 1-14: decode 1 token at a time using previous codebook token's embedding

Strategy:
- Use transformers StaticCache for KV cache management
- Use the predictor's inner model forward (handles mask, RoPE, attention internally)
- Unroll the full 15-step loop for deterministic shapes
- Capture the entire loop as a single CUDA graph
    N)StaticCache)create_causal_mask!create_sliding_window_causal_mask   )sample_logitsc                   @   s   e Zd ZdZdejddddfddZd	d
 ZdejdejfddZ	dd Z
dd Ze dddZe dejdejfddZdS )PredictorGrapha7  
    Captures the full predictor 15-step loop as a CUDA graph,
    using the model's forward with transformers StaticCache.

    Usage:
        mpg = PredictorGraph(code_predictor, pred_config, talker_hidden_size)
        mpg.capture()
        codebook_tokens = mpg.run(pred_input)  # pred_input: [1, 2, H]
    cudaT2   g      ?g?c
                    s>   | _ t  j}
|
d ur|
ntj }
|
| _|| _|j| _|j	| _	|j
| _
| j
d | _d| j | _|| _|| _|| _|	| _|}|j| _|j| _|j| _|jj| _dt| jjdg v | _t|| jd| _tjd d| _ fddt | jd D | _!tj"dd|| d	| _#tj"| jtj$ d	| _%d | _&d
| _'d | _(d | _)d S )Nr      sliding_attentionlayer_types)configmax_cache_lendevicec                    s    g | ]}t jd | g dqS )r
   r   )torchtensor).0ir    V/home/ubuntu/vllm_env/lib/python3.10/site-packages/faster_qwen3_tts/predictor_graph.py
<listcomp>A   s    z+PredictorGraph.__init__.<locals>.<listcomp>dtyper   F)*r   r   indexr   current_devicedevice_indexr   num_hidden_layers
num_layershidden_sizenum_code_groupsnum_codebooksmax_seq	do_sampletop_ktop_ptemperaturesmall_to_mtp_projectionsmall_to_mtpmodel
pred_modellm_headlm_headscodec_embeddingcodec_embedsgetattrr   has_sliding_layersr   static_cachearangeprefill_cache_posrangedecode_cache_positionszeros	input_buflongoutput_tokensgraphcapturedprefill_attndecode_attn)selfcode_predictorpred_configtalker_hidden_sizer   r   r#   r$   r%   r&   r   cpr   r   r   __init__"   s>   


zPredictorGraph.__init__c                 C   sf   | j j}t|d|j}t|d|j|j }tjd|d|| j| jd}| j	j
D ]
}|js0|| q&dS )zEForce lazy initialization of StaticCache layers before graph capture.num_key_value_headshead_dimr   r   N)r*   r   r/   num_attention_headsr   r   r6   r   r   r1   layersis_initializedlazy_initialization)r>   r   num_kv_headsrE   dummy_klayerr   r   r   _init_cache_layersN   s   
z!PredictorGraph._init_cache_layersinput_embedscache_positionc                 C   sH   t | jj|d || jd}| jr t| jj|d || jd}||dS d|iS )N)r   rN   attention_maskrO   past_key_values)full_attentionr   rR   )r   r*   r   r1   r0   r   )r>   rN   rO   maskslidingr   r   r   _make_attn_maskX   s"   
zPredictorGraph._make_attn_maskc                 C   sn   t jdd| j| j| jd}t jdd| j| j| jd}| || j| _g | _| j	D ]}| j
| || q(d S )Nr   r
   r   )r   r6   r   r   r   rU   r3   r<   r=   r5   append)r>   dummy_prefilldummy_decodeposr   r   r   _build_attention_masksk   s   
z%PredictorGraph._build_attention_masksc                 C   sR  |  | j}| j|| j| j| jdd}|j}| jd |ddddddf }t|dddddf | j	| j
| j| jd}|d | jd< td| jD ]Z}| j|d  |d}|  |}| j|| j|d  | j| j|d  dd}|j}| j| |ddddddf }t|dddddf | j	| j
| j| jd}|d | j|< qK| jS )z2The full 15-step predictor loop on static buffers.T)inputs_embedsrP   rQ   rO   	use_cacher   N)r&   r$   r%   r#   r   )r(   r7   r*   r<   r1   r3   last_hidden_stater,   r   r&   r$   r%   r#   r9   r4   r!   r.   	unsqueezer=   r5   )r>   houtlogitstokcb_idxembr   r   r   
_full_loops   sN   $
$zPredictorGraph._full_loop   c              
   C   sZ  t d| d |   |   t|D ]}| j  |   qtj	  t d tj
| j_ tj }|tj  tj|< tj | _| j  |   tj	  | j  tj| j |   W d   n1 stw   Y  W d   n1 sw   Y  W d   n1 sw   Y  tj | tj	  d| _t d dS )z"Warmup and capture the CUDA graph.zWarming up predictor (z	 runs)...z%Capturing CUDA graph for predictor...NTzCUDA graph captured!)printrM   rZ   r4   r1   resetrf   r   r   synchronizer   r   Streamwait_streamcurrent_streamstream	CUDAGraphr:   r;   )r>   
num_warmup_sr   r   r   capture   s8   








zPredictorGraph.capture
pred_inputreturnc                 C   s*   | j | | j  | j  | j S )z
        Run the captured graph.
        pred_input: [1, 2, talker_hidden_size] (past_hidden cat first_codebook_embed)
        Returns: [15] long tensor of codebook tokens
        )r7   copy_r1   ri   r:   replayr9   clone)r>   rt   r   r   r   run   s   


zPredictorGraph.runN)rg   )__name__
__module____qualname____doc__r   bfloat16rC   rM   TensorrU   rZ   rf   inference_moders   ry   r   r   r   r   r      s    

,
6"r   )
r}   r   transformersr   transformers.masking_utilsr   r   samplingr   r   r   r   r   r   <module>   s   