o
    ciF/                  #   @   s@  d Z ddlZddlmZmZ ddlZddlmZ ddlm	Z	m
Z
 ddlmZ e 				
					d#dejdejdejdejdededededededededededeeejef ddf fdd Ze 				
					d#dejdejdejdejdededededededededeeejef ddf fd!d"ZdS )$u   
Streaming generation with CUDA graphs for both predictor and talker.

Yields codec ID chunks during generation instead of collecting all at once.
CUDA graph usage is identical to non-streaming — same per-step performance.
    N)	GeneratorTuple   )PredictorGraph)apply_repetition_penaltysample_logits)TalkerGraph      ?2         ?T?   talker_input_embedsattention_masktrailing_text_hiddenstts_pad_embedpredictor_graphtalker_graphmax_new_tokensmin_new_tokenstemperaturetop_ktop_p	do_samplerepetition_penalty
chunk_sizereturnc           6      c   s   |j }|j}|j}tj|tj|d}td|d }t||D ]
}||kr)d||< q| j}| 	 }| j
}|	 }|j}t }| j||ddd||dddd
}|j}|j}|j}|jdddddf } |	dk}!t| |
|||||!rs|gndd}"||}#t| d	d}$|||$ tj  t | }%g }&g }'d}(d})t }*t|D ]}+|" |kr n||"d
},tj||,fd
d}-||-}.t|"d
|.g}/|&|/  |'|"  |,g}0t|d
 D ]}|0|| |.| dd qtj|0d
djd
dd}1||j d
 k r|1|dd|f d
 }1n|1| }1|#|+ }2|2|j!d
 kr- n|j|1|2d}3||3dddddf d} |dkrW|'rWt"|'}4t#| |4|} t$|'|	k }!t| %d|
|||||!rm|gndd}"|3ddddddf & }|d
7 }t$|&|krtj  t |* }5|(t$|&7 }(t"|&|)t$|&|)dkr|%d nd|5d |(ddfV  g }&|)d
7 })t }*q|&rtj  t |* }5|(t$|&7 }(t"|&|)t$|&|)dkr|%d nd|5d |(ddfV  dS dS )a  
    Streaming autoregressive generation with CUDA-graphed predictor and talker.

    Yields (codec_chunk, timing_info) tuples every chunk_size steps.
    codec_chunk: [chunk_steps, 16] tensor of codec IDs.
    The final chunk may be shorter than chunk_size.
    dtypedevicer      TN
inputs_embedsr   	use_cacheoutput_hidden_statesreturn_dicttrailing_text_hiddenr   generation_steppast_hiddenpast_key_valuesr   r   r   r   suppress_masksuppress_tokensrope_deltasr   dim)keepdim)positionr     Fchunk_indexchunk_steps
prefill_ms	decode_mstotal_steps_so_faris_final)'codec_eos_token_id
vocab_sizer!   torchzerosboolmaxrangecode_predictorget_input_embeddings
codec_headnum_code_groupstimeforwardr+   r*   r)   logitsr   
prefill_kvgetattrset_generation_statecudasynchronizeitem	unsqueezecatrunviewappenddetachsumshapemax_seq_lenstackr   lensqueezeclone)6talkerr   r   r   r   configr   r   r   r   r   r   r   r   r   r   eos_idr>   r!   r.   suppress_starti	predictortalker_codec_embedtalker_codec_headpredictor_codec_embedsrG   t_startouttalker_past_kvr*   gen_steprJ   suppress_eostokenprefill_lenr0   	t_prefillchunk_bufferall_first_tokenstotal_stepschunk_countchunk_startstep_idxlast_id_hidden
pred_inputcodebook_token_idsall_cbcodec_hiddensr$   current_poshidden_stateshistorychunk_decode_time r~   P/home/ubuntu/vllm_env/lib/python3.10/site-packages/faster_qwen3_tts/streaming.pyfast_generate_streaming   s   



$ 
	

	
r   c           '      c   s<   |j }|j}|j}tj|tj|d}td|d }t||D ]
}||kr)d||< qt }| j	||ddd||dddd
}|j
}|j}|j}|jdddddf }|dk}t|||	|
|||rb|gndd}|durn| }tj  t | }g }g }d}d} t }!t|D ]}"| |kr nd}#|durtj|||jd d	fgd	d
}tj|jd	 d	 g|jd}#| j	|d	d	|ddd|||||||	|
||#d}|jd	 }$|$du r n||$d  ||  |jdddddf }|dkr|rt|}%t||%|}t||k }t|||	|
|||r|gndd}|j
}|j}|j}t||kritj  t |! }&|t|7 }t|| t|| dkrT|d nd|&d |ddfV  g }| d	7 } t }!q|rtj  t |! }&|t|7 }t|| t|| dkr|d nd|&d |ddfV  dS dS )z
    Streaming generation without CUDA graphs (dynamic cache).

    Yields (codec_chunk, timing_info) tuples every chunk_size steps.
    r   r   r"   TNr#   r,   r-   r   r1   )r!   )	input_idsr   r%   r&   r'   r(   r   r)   r*   r+   subtalker_dosamplesubtalker_top_ksubtalker_top_psubtalker_temperaturecache_positionr   r5   Fr6   )r=   r>   r!   r?   r@   rA   rB   rC   rH   rI   r+   r*   r)   rJ   r   r]   rN   rO   rP   rR   new_onesrX   tensorrT   r{   rU   r\   rV   rZ   r   r[   )'r^   r   r   r   r   r_   r   r   r   r   r   r   r   r   r`   r>   r!   r.   ra   rb   rg   rh   ri   r*   rj   rJ   rk   rl   rn   ro   rp   rq   rr   rs   _r   	codec_idsr|   r}   r~   r~   r   parity_generate_streaming   s   







	
r   )r	   r
   r   r   r   Tr   r   )__doc__rH   typingr   r   r?   r   r   samplingr   r   r   r   inference_modeTensorintfloatrA   dictr   r   r~   r~   r~   r   <module>   s   
	
 -	
