o
    پig                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlm  mZ d dlZd dlmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZmZmZ d dl m!Z! d d	l"m#Z#m$Z$ e rqd d
l%m&Z& e&  dd d dddgZ'ddgddddgdgZ(ej)*e+Z,e-ej).e,ddZ/e/0 Z1W d   n1 sw   Y  e'2e1 dZ3dd Z4dd Z5dd Z6	d4de	e7 fdd Z8eG d!d" d"Z9G d#d$ d$Z:G d%d& d&Z;d'd( Z<	)	*d5d+e9d,e9d-e=d.e=d/e=d0e>d1e?fd2d3Z@dS )6    N)	dataclass)AnyListOptionalTupleUnion)
AutoConfig	AutoModelAutoModelForCausalLMAutoModelForVision2SeqAutoProcessorGenerationConfig)Engine)ci_validate_and_clean_hf_cache)
get_deviceis_npu
load_image)get_tokenizer) DEFAULT_PORT_FOR_SRT_TEST_RUNNERcalculate_rouge_l)init_npu_backendz Apple is red. Banana is Yellow. i   zApple isz$The capital of the United Kingdom iszToday is a sunny day and I likez,AI is a field of computer science focused onzHow many people live in Berlin?z%Berlin is well known for its museums.)query	documentszcBerlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.zlong_prompt.txtr   c                 C   s0   | t ju rdS | t ju rdS | t ju rdS t )Nfloat16float32bfloat16)torchr   r   r   NotImplementedError)torch_dtype r!   G/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/runners.pyget_dtype_strL   s   


r#   c                 C   s,   t j| dtjd}~ tj||dd\}}|S )Ndimdtype)kr&   )Flog_softmaxr   r   topk)logitsr(   logprobstop_indicesr!   r!   r"   get_top_logprobsW   s   r/   c                 C   s$   t j| dtjd}~ |d|f }|S )Nr$   r%   .)r)   r*   r   r   )r,   	token_idsr-   r!   r!   r"   get_token_ids_logprobs^   s   r1   matryoshka_dimc           	      C   s   ddl m} ddlm} || r|| d|i|d}n ddl m} || j|d}|j| dd	}|||g|d
}|t	 S )Nr   )SentenceTransformer)is_sentence_transformer_modelr    )model_kwargstruncate_dim)models)r'   	lasttoken)pooling_mode)modulesr6   )
sentence_transformersr3   sentence_transformers.utilr4   r7   TransformertoPoolingget_word_embedding_dimensionr   )	
model_pathr    r2   r3   r4   modelr7   word_embedding_modelpooling_modelr!   r!   r"   )_get_sentence_transformer_embedding_modele   s$   rE   c                   @   s   e Zd ZU dZee ed< dZee ed< dZ	ee
j ed< dZee
j ed< dZeee  ed< dZee
j ed< dZee ed< dZeeeeedf   ed	< dZeeeeedf   ed
< dZee
j ed< dZee
j ed< dS )ModelOutputNoutput_strs
output_idstop_input_logprobstop_output_logprobstop_output_logprob_idxembed_logitsscoresinput_token_logprobs_lstoutput_token_logprobs_lsttoken_ids_input_logprobstoken_ids_output_logprobs)__name__
__module____qualname__rG   r   str__annotations__rH   intrI   r   TensorrJ   rK   rL   rM   floatrN   r   rO   rP   rQ   r!   r!   r!   r"   rF      s   
 rF   c                   @   s  e Zd Z					d.dedejdededed	ed
ee fddZ	dd Z
	d/deee  fddZ								d0deej deej deej deeej  deej deej deej deej dejfddZ	d/d
ee fddZeddddfd eeee  ee eej f deee  d!ed"eee  d#ee f
d$d%Zd&d' Zd(d) Zd*d+ Ze				d1d eee eej f d!edejd"eee  ded#ee d	ee defd,d-ZdS )2HFRunner
generationFNrA   r    
model_typeoutput_str_onlytrust_remote_codepatch_model_do_sample_falser2   c                 C   sZ   || _ || _|| _|| _t | _t | _tj| j	| j| j|||fd| _
| j
  d S )N)targetargs)r\   r]   r^   r_   mpQueuein_queue	out_queueProcessstart_model_process
model_procstart)selfrA   r    r\   r]   r^   r_   r2   r!   r!   r"   __init__   s    



zHFRunner.__init__c                 C   s   dg}||v r	dS dS )NzLxzGordon/URM-LLaMa-3.1-8BTFr!   )rj   rA   models_needs_trust_remoter!   r!   r"   needs_trust_remote_code   s
   z HFRunner.needs_trust_remote_code
image_datac                    s   d }|d urdd |D } j ||ddddd} fdd| D }t   jd	i |}W d    | S 1 s=w   Y  | S )
Nc                 S   s   g | ]}t |d  qS r   )r   ).0imager!   r!   r"   
<listcomp>       z9HFRunner._get_gme_qwen2_vl_embeddings.<locals>.<listcomp>Ti  pt)textimagespadding
truncation
max_lengthreturn_tensorsc                    s    i | ]\}}||  jjqS r!   )r>   rB   device)rp   r(   vrj   r!   r"   
<dictcomp>   s     z9HFRunner._get_gme_qwen2_vl_embeddings.<locals>.<dictcomp>r!   )	processoritemsr   no_grad_forward_gme_qwen2_vltolist)rj   promptsrn   rv   inputs
embeddingsr!   r}   r"   _get_gme_qwen2_vl_embeddings   s$   

z%HFRunner._get_gme_qwen2_vl_embeddings	input_idsattention_maskposition_idspast_key_valuesinputs_embedspixel_valuesimage_grid_thwpooling_maskreturnc	              
   K   s   |d u r9| j j |}|d ur/|| j j }| j j||d|j}
|| j jjk}|
||< |d ur9||j}| j ||||dd||d}|j	d d d df }t
jjj|ddd}| S )N)grid_thwT)r   r   r   r   output_hidden_statesreturn_dictr   r   r$         )pr&   )rB   embed_tokenstypevisual	get_dtyper>   r{   configimage_token_idhidden_statesr   nn
functional	normalize
contiguous)rj   r   r   r   r   r   r   r   r   kwargsimage_embeds
image_maskoutputsr   r!   r!   r"   r      s4   zHFRunner._forward_gme_qwen2_vlc                 C   s  t   t| | jdkr6tj|| jd}| jrt}nt|dd }tt|}|j||| jdd	t
 | _ni| jdkrvd| v rVtj||d	dd	t
 | _t|| _nId
| v rmt|	t
 | _t|| _n2t|||d| _n)| jdks| jdkrddlm}	 |	j||| |d	t
 | _ntd| j t|tj| jd| _	 | \}
}}}}|d urt|
t|ksJ |
d ur| jdkr|| j| j|
|| j||| j|| jd	 n| jdkrU| jrJ d| v r|  |
|}nSd
| v rD|d ur#t!|}| j|d dd}| jj"|j#d 	t
 d$ }n)| j|
ddd}| jj%|j#d 	t
 |j#d 	t
 d$ }n| j&|
$ }|t'|d nz| jdkr| j|
ddd	t
 }| jdi |j(}|) $ }t*|t+s|g}|t'|d nG| jdkrg }|
D ]+}| jj,|d	d	d}| j|dd	t
 }|-t.| jdi |j(d d /  q|t'|d ntd| j q)Nr[   r^   architecturesr   T)r    r^   low_cpu_mem_usage	embeddingzgme-qwen2-vlFclip)r2   rewardcross_encoder)"AutoModelForSequenceClassification)r    r^   zUnrecognized model type )	
base_modelr   max_new_tokens	tokenizer
lora_pathsr    r]   token_ids_logprobr_   rt   )rv   rz   r   )r   )rw   rz   r   r   )r   r   rL   rM   )tokenizer   rz   r!   )0monkey_patch_gemma2_sdpar   r\   r   from_pretrainedr^   r
   getattrtransformersr>   r   r   lowerr   rB   r   r   r	   rE   r   rm   	Exceptionr   r   r'   r   getlenputforward_generation_rawr]   r_   r   r   get_image_featuresdatar   get_text_featuresencoderF   r,   squeeze
isinstancelistapply_chat_templateappendrY   item)rj   rd   re   rA   r    r2   r   	model_cls
model_archr   r   rn   r   r   r   r,   rq   r   rM   convconv_formattedconv_tokenizedr!   r!   r"   rg      s   	









 zHFRunner.start_model_process   r   r   r   r   c                 C   s    | j |||||f | j S N)rd   r   re   r   )rj   r   rn   r   r   r   r!   r!   r"   forward  s   

zHFRunner.forwardc                 C      | j   d  | _| _d S r   rh   	terminaterd   re   r}   r!   r!   r"   r        
zHFRunner.terminatec                 C      | S r   r!   r}   r!   r!   r"   	__enter__     zHFRunner.__enter__c                 C   r   r   r   rj   exc_type	exc_value	tracebackr!   r!   r"   __exit__  r   zHFRunner.__exit__c	                    s  g }	g }
g } d urg }g }nd  }}t |D ]\}}t|tr,|j|ddt }n	tj|gt d}|d urQ|| d urQddlm	} |j
| || |dd}n| }|rYd|j_|j|tdd d |d| dd	d
}|j|d d t|d d  dd}| std|	| |s|dd |jD   d ur| fdd|jD  ~||jd }|
t|t   d ur|t|   ~|d ur|| d ur|  qt|	|
|||dS )Nrt   r   )r{   r   )	PeftModelF)r    is_trainableT)	do_sampletemperaturetop_pr   return_dict_in_generateoutput_scoresdisable_compile)r   generation_config)skip_special_tokensQReceived an empty text response. Please verify your input or model configuration.c                 S   s   g | ]}t |d  t qS ro   )r/   NUM_TOP_LOGPROBSr   rp   r,   r!   r!   r"   rr     s    z3HFRunner.forward_generation_raw.<locals>.<listcomp>c                    s   g | ]}t |d    qS ro   )r1   r   r   r   r!   r"   rr     s    )rG   rI   rJ   rP   rQ   )	enumerater   rU   r   r>   r   r   tensorpeftr   r   r   r   generater   decoder   strip
ValueErrorr   rM   r   r,   r/   r   r   r1   unloadrF   )r   r   r   r   r    r   r]   r   r_   rG   rI   rJ   rP   rQ   ir   r   r   rB   r   ru   input_logitsr!   r   r"   r     s   


zHFRunner.forward_generation_raw)r[   FFFNr   )NNNNNNNN)NFNF)rR   rS   rT   rU   r   r'   boolr   rW   rk   rm   r   r   
LongTensorrX   FloatTensorr   rg   DEFAULT_PROMPTSr   r   r   r   r   staticmethodrF   r   r!   r!   r!   r"   rZ      s    


	
-
 



	
rZ   c                U   @   s  e Zd Zddedddddddddddddddddddddddddddddddddddddf(d	ed
ejdededededee	e
e e
eeef  f  dedee dee dee dedededee dee dee dee dedee dededee d ee d!ee d"ee d#ee d$ee d%ee d&ee d'ed(ed)ee d*ed+ee d,ee
e  d-ee d.ee d/ee d0eeeef  d1ed2efTd3d4ZdRd5ed6ed7efd8d9Zd5efd:d;Zedd<dd=dddfd>e	e
e
e  e
e e
ej f d?ee
e  d@edee
e  dAedBee dCee
e  dDee fdEdFZedd<dfd>e	e
e e
ej f d?ee
e  fdGdHZdIdJ ZdKdL Ze	<		=		dSdMed>e	e
e e
ej f d@edee
e  dAedBee dCee
e  fdNdOZed>e	e
e e
ej f fdPdQZdS )T	SRTRunnerr   autoN   csgmvFg?lrurA   r    r\   tp_size
model_implportr   max_loras_per_batchattention_backendprefill_attention_backenddecode_attention_backendlora_backenddisable_cuda_graphdisable_radix_cachechunked_prefill_sizecontext_lengthmax_total_tokens	page_sizedp_sizetokenizer_pathmem_fraction_staticr^   speculative_draft_model_path speculative_draft_model_revisionspeculative_algorithmspeculative_num_stepsspeculative_eagle_topkspeculative_num_draft_tokens'speculative_ngram_min_match_window_size'speculative_ngram_max_match_window_sizedisable_overlap_scheduledisable_custom_all_reducetorchao_configcuda_graph_max_bsmax_lora_ranklora_target_modulesenable_loraenable_lora_overlap_loadingmax_loaded_lorasjson_model_override_argslora_eviction_policyenable_deterministic_inferencec,           .      C   s  || _ |dk| _|dk},i }-|r)||-d< ||-d< ||-d< ||-d< ||-d< ||-d< n|d	kr9||-d< ||-d
< ||-d< td2i d|d|dt|d|d|d|!d|d|d| j d|d|d|d|	d|
d|d|d|d|d|d|d |d!|,d"|d#|d$|d%|"d&| d'|#d(|$d)|%d*|&d+|'d,|(d-|)rt|)nd.d/|*d0|+|-| _|d u rt||d1| _d S d | _d S )3Nr[   r   r  r  r  r  r  r  NGRAMr  r  rA   r  r'   r  r  r  r  r^   is_embeddingr   r  r  r  r  r  r	  r
  r  r  r  r  enable_dp_attentionr  r  r  r  r  sleep_on_idler  r  r   r!  r"  r#  z{}r$  r%  r   r!   )	r\   is_generationr   r#   jsondumpsenginer   r   ).rj   rA   r    r\   r  r  r  r   r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r^   r  r  r  r  r  r  r  r  r  r  r  r  r)  r  r  r   r!  r"  r#  r$  r%  r(  spec_kwargsr!   r!   r"   rk     s   .

	
 !$'(,
zSRTRunner.__init__	lora_name	lora_pathpinnedc                 C   s   | j |||S r   )r-  load_lora_adapter)rj   r/  r0  r1  r!   r!   r"   r2    s   zSRTRunner.load_lora_adapterc                 C   s   | j |S r   )r-  unload_lora_adapter)rj   r/  r!   r!   r"   r3    s   zSRTRunner.unload_lora_adapterr   r   r   rn   r   logprob_start_lentop_kr   
dimensionsc	              	   C   s   | j r| j| j||||||dS | jdkr5| jj|||d}	t|	tr+dd |	D }
n|	d g}
t|
dS | jdkrT| j|}	t|	tsH|	g}	dd |	D }t|d	S | j|}	d
d |	D }t|d	S )N)r-  r   r   r   r4  r5  r   r   )promptrn   r6  c                 S      g | ]}|d  qS r   r!   rp   xr!   r!   r"   rr         z%SRTRunner.forward.<locals>.<listcomp>r   r   c                 S   r8  r9  r!   r:  r!   r!   r"   rr     r<  r   c                 S      g | ]}|d  d qS r   r   r!   r:  r!   r!   r"   rr     rs   )	r*  r   r-  r\   r   r   r   rF   rerank)rj   r   rn   r   r   r4  r5  r   r6  responser,   rM   r!   r!   r"   r     s6   








zSRTRunner.forwardc                 C   sb   | j r| j| j|||dS | j||}| jdkr%dd |D }t|dS dd |D }t|dS )zp
        testing serving by sending all prompts once
        only return output strings and no logprobs
        )r-  r   r   r   r   c                 S   r8  r9  r!   r:  r!   r!   r"   rr     r<  z+SRTRunner.batch_forward.<locals>.<listcomp>r   c                 S   r=  r>  r!   r:  r!   r!   r"   rr     rs   r   )r*  batch_forward_generation_rawr-  r   r\   rF   )rj   r   rn   r   r   r@  r,   rM   r!   r!   r"   batch_forward  s   


zSRTRunner.batch_forwardc                 C   r   r   r!   r}   r!   r!   r"   r     r   zSRTRunner.__enter__c                 C   s   | j   | ` d S r   )r-  shutdownr   r!   r!   r"   r     s   
zSRTRunner.__exit__r-  c                 C   sH  g }g }g }	g }
g }g }g }|d urg }g }nd  }}|dd}|r&||d< t |D ]\}}| j||r7|| nd |d|t|d}|d }| sLtd|| |d d	 }|d d
 }|d d }|d urr|d d dd  }nd }|d d }t||| ksJ t||| ksJ |dd  }|dd  }t|t|ksJ |
||d g  || |	dd |D dd |d d d d t D g  |dd |d d D  |dd |d d D  |d ur|dd |D dd |d d d D g  |dd |d d D  q*t|||	||
||||d	S )Nr   r   r   r5  T)r0  sampling_paramsreturn_logprobr4  top_logprobs_numr   ru   r   	meta_infoinput_token_logprobsoutput_token_logprobsinput_top_logprobsinput_token_ids_logprobsr   prompt_tokensc                 S   "   g | ]}d d |dt  D qS )c                 S   r8  ro   r!   rp   tupr!   r!   r"   rr   "  r<  ?SRTRunner.forward_generation_raw.<locals>.<listcomp>.<listcomp>Nr   r:  r!   r!   r"   rr   "  s   " z4SRTRunner.forward_generation_raw.<locals>.<listcomp>c                 S   r8  ro   r!   rO  r!   r!   r"   rr   $      output_top_logprobsc                 S   rN  )c                 S   r8  ro   r!   rO  r!   r!   r"   rr   .  r<  rQ  NrR  r:  r!   r!   r"   rr   -      c                 S   rN  )c                 S   r8  )r   r!   rO  r!   r!   r"   rr   4  r<  rQ  NrR  r:  r!   r!   r"   rr   3  rU  c                 S      g | ]	}d d |D qS )c                 S   r8  ro   r!   rO  r!   r!   r"   rr   :  r<  rQ  r!   r:  r!   r!   r"   rr   :  s    c                 S   r8  ro   r!   rO  r!   r!   r"   rr   <  rS  output_token_ids_logprobsc                 S   rV  )c                 S   r8  ro   r!   rO  r!   r!   r"   rr   F  r<  rQ  r!   r:  r!   r!   r"   rr   E  s    )	rG   rH   rI   rJ   rN   rO   rK   rP   rQ   )r   r   r   r   r   r   r   rF   )r-  r   r   r   r4  r5  r   rG   rH   rI   rN   rJ   rO   rK   rP   rQ   rE  r   r7  r@  ru   rI  rJ  r-   rL  num_prompt_tokensr!   r!   r"   r     s   
	





z SRTRunner.forward_generation_rawc                 C   s>   g }|dd}|j | |r|nd |d}dd |D }t|dS )Nr   rD  )r0  rE  c                 S   r8  )ru   r!   )rp   r   r!   r!   r"   rr   f  r<  z:SRTRunner.batch_forward_generation_raw.<locals>.<listcomp>)rG   )r   rF   )r   r   r   r-  rG   rE  r@  r!   r!   r"   rA  W  s   

z&SRTRunner.batch_forward_generation_rawF)r   Nr   NN)rR   rS   rT   r   rU   r   r'   rW   r   r   r   dictr   rY   r   rk   r2  r3  r   rX   r   rB  r   r   r   r   r   rA  r!   r!   r!   r"   r     s   	
 !"#%
&'()*+,
x

	


0



 r   c                  C   s,   ddl m}  d	dtfdd}t| d| dS )
z
    Use sdpa by default to fix the OOM issue.
    Revert this commit:
    https://github.com/huggingface/transformers/commit/975b988bfe6e7ebb47390cd9a1556c6888804883#diff-5f76eac6f18f4b491521314c318a9692318feb4d19228e9576cce7bde4240834R660
    r   )Gemma2PreTrainedModelFhard_check_onlyc                 S   s
   d| _ | S )Nsdpa)_attn_implementation)r   r\  r!   r!   r"   _check_and_enable_sdpau  s   z8monkey_patch_gemma2_sdpa.<locals>._check_and_enable_sdpar_  NrY  )*transformers.models.gemma2.modeling_gemma2r[  r   setattr)r[  r_  r!   r!   r"   r   m  s   r    T
hf_outputssrt_outputsprefill_tolerancedecode_tolerancerouge_l_tolerance
debug_textcheck_logprobsc              
      sv  t d| j t d|j t| j|j}t d| t fdd|D s0J d  |rtt| jD ]}t| j| }	t|j| }
|	j	d }t dt
t|	|
  |d	kr{tt|	|
 |k s{J d
| d| d|	d|
t| j| }	t|j| }
t dt
t|	|
  |d	krtt|	|
 |k sJ d| d| d|	d|
q9d S d S )Nzhf_outputs.output_strs=zsrt_outputs.output_strs=zrouge_l_scores=c                 3   s    | ]}| kV  qd S r   r!   )rp   scorerg  r!   r"   	<genexpr>  s    
z,check_close_model_outputs.<locals>.<genexpr>z:Not all ROUGE-L scores are greater than rouge_l_tolerance=r   zprefill logprobs max_diffd   z(prefill logprobs are not all close with z prefill_tolerance=z.hf_logprobs=z, srt_logprobs=zdecode logprobs max_diffz'decode logprobs are not all close with z decode_tolerance=)printrG   r   allranger   r   rX   rI   shapemaxabsrJ   )rc  rd  re  rf  rg  rh  ri  rouge_l_scoresr   hf_logprobssrt_logprobs	input_lenr!   rk  r"   check_close_model_outputs|  sZ   


rx  r   )rb  T)Ar+  multiprocessingrb   osdataclassesr   typingr   r   r   r   r   r   torch.nn.functionalr   r   r)   r   r   r	   r
   r   r   r   sglang.srt.entrypoints.enginer   ,sglang.srt.model_loader.ci_weight_validationr   sglang.srt.utilsr   r   r   &sglang.srt.utils.hf_transformers_utilsr   sglang.test.test_utilsr   r   %sglang.srt.hardware_backend.npu.utilsr   r   TEST_RERANK_QUERY_DOCSpathdirname__file__dirpathopenjoinfreadlong_promptr   r   r#   r/   r1   rW   rE   rF   rZ   r   r   rY   rU   r   rx  r!   r!   r!   r"   <module>   s    	




  y  i