o
    ci"                     @   s:   d Z ddlZddlmZ ddlmZmZ G dd dZdS )a'  
CUDA graph capture for the talker's single-token decode step,
using transformers StaticCache.

The talker has 28 transformer layers. Instead of reimplementing the
forward pass manually, we use the model's own forward with StaticCache.
The StaticCache provides fixed-size KV tensors compatible with CUDA graphs.

Strategy:
- Use transformers StaticCache for KV cache management
- Use the model's forward method (handles mask, RoPE, attention internally)
- Capture the single-token decode as a CUDA graph
- Update cache_position buffer between replays
    N)StaticCache)create_causal_mask!create_sliding_window_causal_maskc                   @   s   e Zd ZdZdejdfddZdd Zd!d	ejdB fd
dZ	de
fddZdd Ze d"ddZde
fddZdd Zd	ejdejdB fddZe dejde
dejfdd ZdS )#TalkerGraphz
    Captures the talker's single-token decode step as a CUDA graph,
    using the model's own forward with transformers StaticCache.
    cudai   c                 C   s   || _ t |j}|d ur|ntj }|| _|| _|| _|j| _|j	| _
|| _t||d| _tjdd| j||d| _tjdd| j||d| _tjdtj|d| _tjddtj|d| _tjdddtj|d| _d | _d| _d | _d | _d | _d S )N)configmax_cache_len   dtypedevice   F)r   torchindexr   current_devicedevice_indexr   max_seq_lenhidden_sizenum_hidden_layers
num_layersmodelr   static_cachezeros	input_buf
output_buflongcache_positionfloat32rope_deltasposition_idsgraphcaptured	attn_maskattn_mask_table	_mask_key)selftalker_modeltalker_configr   r   r   r    r(   S/home/ubuntu/vllm_env/lib/python3.10/site-packages/faster_qwen3_tts/talker_graph.py__init__   s(   
zTalkerGraph.__init__c                 C   sf   | j j}t|d|j}t|d|j|j }tjd|d|| j| jd}| j	j
D ]
}|js0|| q&dS )zEForce lazy initialization of StaticCache layers before graph capture.num_key_value_headshead_dimr	   r
   N)r   r   getattrnum_attention_headsr   r   r   r   r   r   layersis_initializedlazy_initialization)r%   r   num_kv_headsr,   dummy_klayerr(   r(   r)   _init_cache_layers=   s   
zTalkerGraph._init_cache_layersNattention_maskc                 C   s   t jdd| j| j| jd}| j}d g| | _| jjj	d u rt
nt}t|D ]}t j|g| jd}|| jj|||| jd}|| j|< q%| jd u rQ| jd  | _d S | j| jd  d S )Nr	   r
   r   )r   input_embedsr6   r   past_key_valuesr   )r   r   r   r   r   r   r#   r   r   sliding_windowr   r   rangetensorr   r"   clonecopy_)r%   r6   dummymax_lenmask_fniposfullr(   r(   r)   _build_attention_masksG   s"   
z"TalkerGraph._build_attention_maskspositionc                 C   s   | j | j|  d S N)r"   r>   r#   )r%   rF   r(   r(   r)   _set_attention_mask^   s   zTalkerGraph._set_attention_maskc                 C   s2   | j | j| j| j| j| jdd}| j|j dS )z0Single-token decode through the model's forward.T)inputs_embedsr6   r9   r   r   	use_cacheN)	r   r   r"   r   r   r   r   r>   last_hidden_state)r%   outr(   r(   r)   _decode_stepa   s   zTalkerGraph._decode_stepd   r   c              
   C   sP  t d| d |   |   || jd< | | t|D ]}|   qtj	  t d tj
| jU tj | _tj }|tj  tj|, |   tj	  tj| j |   W d   n1 sow   Y  W d   n1 s~w   Y  W d   n1 sw   Y  tj | tj	  d| _t d dS )z
        Capture CUDA graph for single-token decode.
        prefill_len: simulated prefill length for warmup (graph is position-independent).
        zWarming up talker graph (z	 runs)...r   z)Capturing CUDA graph for talker decode...NTzTalker CUDA graph captured!)printr5   rE   r   rH   r;   rM   r   r   synchronizer   r   	CUDAGraphr    Streamwait_streamcurrent_streamstreamr!   )r%   prefill_len
num_warmup_sr(   r(   r)   capturem   s6   







zTalkerGraph.capturerV   c                 C   s   | j   dS )zReset cache for new sequence.N)r   reset)r%   rV   r(   r(   r)   r[      s   zTalkerGraph.resetc                 C   s   | j   d}t| jD ]1}|| \}}|jd }|| jkr*td| d| j dtj|| j	d}| j 
|||d|i q|S )z
        Copy HF DynamicCache from prefill into our StaticCache.
        past_key_values: DynamicCache with num_layers layers of [1, kv_heads, seq_len, head_dim]
        r      zInput is too long: prefill has z tokens but max_seq_len=z.. Use shorter text or shorter reference audio.r7   r   )r   r[   r;   r   shaper   RuntimeErrorr   aranger   update)r%   r9   seq_lenlikv	cache_posr(   r(   r)   
prefill_kv   s   


zTalkerGraph.prefill_kvr   c                 C   s   d}d}|dur<|dkj dd}t| }tj|jd | j|j|jd}t	| D ]\}}|dkr;d||d|f< q+| j
du sF|| jkrN| | || _|du rY| j  dS | dkrd|d}| j|j| jj| jjd dS )zCSet padding-aware attention mask and rope deltas for decode parity.Nr   )dimr
   r	   )r   )sumtupletolistr   onesr]   r   r   r   	enumerater#   r$   rE   r   zero_rh   	unsqueezer>   to)r%   r6   r   mask_keyfull_attention_mask
pad_countsbpadsr(   r(   r)   set_generation_state   s.   

"z TalkerGraph.set_generation_stater8   returnc                 C   sf   | j | || jd< | | | j| jd | jj }| j|d	ddd | j
  | jS )z
        Run one decode step.
        input_embeds: [1, 1, hidden_size]
        position: current sequence position
        Returns: [1, 1, hidden_size] hidden states
        r   r   rg   )r   r>   r   rH   r   rp   r   r   ro   expandr    replayr   )r%   r8   rF   deltar(   r(   r)   run   s   


zTalkerGraph.runrG   )rN   r   )__name__
__module____qualname____doc__r   bfloat16r*   r5   TensorrE   intrH   rM   inference_moderZ   r[   rf   rv   r{   r(   r(   r(   r)   r      s     
"
' r   )r   r   transformersr   transformers.masking_utilsr   r   r   r(   r(   r(   r)   <module>   s
   