o
    ci                  *   @   s   d Z ddlZddlmZmZ ddlZddlmZ ddlm	Z	m
Z
 ddlmZ e 				
									d%dejdejdejdejdedededededededededee dee dee d ee d!ed"eeej ef f&d#d$ZdS )&zP
Non-streaming generation loop using CUDA graphs for both predictor and talker.
    N)OptionalTuple   )PredictorGraph)apply_repetition_penaltysample_logits)TalkerGraph      ?2         ?T?Ftalker_input_embedsattention_masktrailing_text_hiddenstts_pad_embedpredictor_graphtalker_graphmax_new_tokensmin_new_tokenstemperaturetop_ktop_p	do_samplerepetition_penaltysubtalker_dosamplesubtalker_top_ksubtalker_top_psubtalker_temperatureparity_modereturnc           B         sN  |j  |j}|j}|j}tj|tj|d}td|d }t||D ]
}| kr+d||< q!|r fddt||D }t		 }| j
d*i d|d|d	|d
|d|d|	d|d|d|d|
d|d d|d|durq|n|d|durz|n|d|dur|n|d|dur|n|
dddd}tjdd |jD dddddddf }| k}tj| dd}|jdd} t| |jd }!fddt|!D }"tj  t		 | }#|"rt|"d jd nd}$d|#|$|$dkr|#|$ d  nd|#dkr|$|# ndd!}%|"r|"d |%fS d|%fS | j}&|  }'| j}(|& })t		 }| j||ddd||dddd"
}*|*j}+|*j},|*j}-|*jddd#ddf }.|	dk}/t|.|
|||||/r] gndd$}0||+}1t| d%d}2| ||2 tj  t		 | }3t		 }4g }5t|D ]}6|0!  kr n|'|0"d}7tj#|,|7fdd}8|$|8}9t#|0%d|9g}:|5&|:'  |7g};t|d D ]}|;&|)| |9| "d"d qtj#|;ddj(ddd&}<|-|jd k r|<|dd|-f "d }<n|<| }<|1|6 }=|=|j)d kr
 nb|j$|<|=d'}>|(|>ddd#ddf "d}.|d(kr=t*|5dkr=td)d |5D }?t+|.|?|}.t*|5|	k }/t|.,d|
|||||/rS gndd$}0|>ddd#dddf - },|-d7 }-qtj  t		 |4 }@t*|5}A|3d  |@|A|Adkr|@|A d  nd|@dkr|A|@ ndd!}%|5rt|5|%fS d|%fS )+zP
    Fast autoregressive generation with CUDA-graphed predictor and talker.
    )dtypedevicer   i   Tc                    s   g | ]}| kr|qS  r$   ).0i)eos_idr$   O/home/ubuntu/vllm_env/lib/python3.10/site-packages/faster_qwen3_tts/generate.py
<listcomp>5   s    z!fast_generate.<locals>.<listcomp>inputs_embedsr   trailing_text_hiddenr   r   r   r   r   r   r   r   eos_token_idsuppress_tokensr   Nr   r   r   output_hidden_statesreturn_dict_in_generatec                 S   s    g | ]}|d  dur|d  qS )Nr$   )r%   hidr$   r$   r(   r)   M   s     r   )dimc                    s&   g | ]\}} |d |d d f qS )Nr$   )r%   r&   length)talker_codesr$   r(   r)   U   s   & g        i  )
prefill_msdecode_sstepsms_per_stepsteps_per_s)
r*   r   	use_cacher.   return_dictr+   r   generation_steppast_hiddenpast_key_valuesr0   )r   r   r   r   suppress_maskr-   rope_deltas)keepdim)positionr   c                 S   s   g | ]}|d  qS )r   r$   )r%   cr$   r$   r(   r)      s    r$   ).codec_eos_token_idnum_code_groups
vocab_sizer#   torchzerosboolmaxrangetimegeneratestackhidden_statesargmaxintanywhereshape	enumeratecudasynchronizecode_predictorget_input_embeddings
codec_headforwardr>   r=   r<   logitsr   
prefill_kvgetattrset_generation_stateitem	unsqueezecatrunviewappenddetachsummax_seq_lenlenr   squeezeclone)Btalkerr   r   r   r   configr   r   r   r   r   r   r   r   r   r   r   r   r   r    rE   rF   r#   r?   suppress_startr&   r-   t_starttalker_resultfirst_codebookis_stop_tokenstop_indiceshas_stop_tokeneffective_lengthstalker_codes_list
total_timer7   timing	predictortalker_codec_embedtalker_codec_headpredictor_codec_embedsouttalker_past_kvr=   gen_stepr\   suppress_eostokenprefill_lenr@   	t_prefillt_decode_startall_codec_idsstep_idxlast_id_hidden
pred_inputcodebook_token_idsall_cbcodec_hiddensr*   current_posrO   historyt_decoden_stepsr$   )r'   r4   r(   fast_generate   sB  
	




& 	
r   )r	   r
   r   r   r   Tr   NNNNF)__doc__rL   typingr   r   rG   r   r   samplingr   r   r   r   inference_modeTensorrQ   floatrI   dictr   r$   r$   r$   r(   <module>   sv   
	
