o
    5ti                     @  s  d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZmZmZmZ d dlZd dlZd d	lmZ d d
lmZ d dlmZ d dlmZm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3m4Z4 eededkrd dl5m6Z6 zd dl7m8Z8 W n e9y   d dl:m8Z8 Y nw zd dl;m<Z< W n e9y   d dl=m<Z< Y nw erd dl>m?Z? d dl@mAZA eBeCZD	d0d1d,d-ZEe'dG d.d/ d/e%ZFdS )2    )annotationsN)version)	find_spec)ProcessQueue)Empty)sleep)TYPE_CHECKINGAnyLiteralcastoverload)
distribute)parse)tqdm)LLMSamplingParamsTokensPrompt)LoRARequest)
TemplateLM)register_model)	Collator_add_special_kwargsconfigure_pad_tokenhandle_stop_sequenceshas_bos_prefixmaybe_truncatenormalize_gen_kwargspostprocess_generated_textundistribute)get_rolling_token_windowsmake_disjoint_windowvllm0.8.3)resolve_hf_chat_template)get_tokenizer)get_open_port)PreTrainedTokenizerBase)Instance	127.0.0.1
model_argsdictsampling_paramslist[SamplingParams]requestslist[list[int]]lora_requestr   result_queuer   dp_sizeintlocal_dp_rankdp_master_portdp_master_ipstrreturnNonec	                 C  s  |s| |g f dS t| tjd< tjd< t|tjd< t|tjd< t|tjd< d}	zz!tdi | }	|	jdd |D ||d	}
td
 | ||
f W n2 ty } z&d| dt|j	 dt| }t
j|dd | |d|if W Y d}~nd}~ww W |	durz~	t  W dS  ty } zt
jd| dt|j	 dt| dd W Y d}~dS d}~ww dS |	durz~	t  W w  ty } zt
jd| dt|j	 dt| dd W Y d}~w d}~ww w )z
    Worker process for vLLM multiprocessing.
    Initializes a vLLM engine, processes requests, and puts results or errors
    onto the result_queue.
    NVLLM_DP_RANKVLLM_DP_RANK_LOCALVLLM_DP_SIZEVLLM_DP_MASTER_IPVLLM_DP_MASTER_PORTc                 S     g | ]}t |d qS )prompt_token_idsr   .0request rF   Q/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/models/vllm_causallms.py
<listcomp>^       z#_vllm_mp_worker.<locals>.<listcomp>r,   r0      zWorker z failed during generation: z: Texc_infoerrorz* encountered an error during LLM cleanup: rF   )putr7   osenvironr   generater   	Exceptiontype__name__eval_loggerrN   gccollectwarning)r*   r,   r.   r0   r1   r2   r4   r5   r6   llmreseerror_message	e_cleanuprF   rF   rG   _vllm_mp_worker@   sd    
r_   c                      s<  e Zd ZU dZded< 																		
								dndo fd3d4Zed5d6 Zed7d8 Zedpd:d;Z	ed<d= Z
	dqdrdAdBZedsdCdDZe	dtdudGdHZe	dtdvdKdHZ	dtdwdNdHZ		dxdydSdTZ	dzd{dXdYZ	dzd|dZd[Z	dzd}d^d_Zed~dddeZe		dddldmZ  ZS )VLLMi   r'   	tokenizerautoNFrK           ?T   left
pretrainedr7   dtype1Literal['float16', 'bfloat16', 'float32', 'auto']revision
str | Nonetrust_remote_codebool | Nonetokenizer_modeLiteral['auto', 'slow']tokenizer_revisionadd_bos_tokenprefix_token_id
int | Nonetensor_parallel_sizer3   quantizationmax_gen_toks
swap_space
batch_size	str | int
max_lengthmax_model_lenseedgpu_memory_utilizationfloatdata_parallel_sizelora_local_pathenable_thinkingboolchat_template_argsdict | Nonethink_end_tokenmax_lora_ranktruncation_side"Literal['left', 'right', 'middle']c           !        s4  t    tdstd|d u s|d u sJ d|dd  || _tjdddk| _	|d ur2|n|| _
t|
| _|| _t|| _i d|d	t|d
|d|d|d|d|d|dt|
d| j
rkt| j
nd d|d|dt|d|dt|dt|dt|| _| j| t|trd|v rdnt|| _| jdkrtd/i | j| _ntd | j	sdn| jdd | jd< d| _td || _ddlm} |j|||d | _t |p|f||pd!|d"| jd urd#| jini | _!t"| j!| jd$| _!|pi | _#| j#d%|| _$t%t&dt%d&kra| j!d d d'}t%t&dt%d(krT| jdkr=| jj'j(|d)< ndd*l)m*} |d/i | j}|+ } | |d)< n||d< t,d/i || _-nd | _-|	| _.|	d urutd+| j/  || _0|d urt%t&dt%d,ksJ d-t1d.d|| _2d S d | _2d S )0Nr"   zattempted to use 'vllm' LM type, but package `vllm` is not installed. Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`z@Either max_length or max_model_len may be provided, but not bothdeviceVLLM_USE_V110modelr   rl   rj   ra   rp   rr   rn   rv   r}   max_num_seqsry   rw   r~   enable_lorar   rb   rK   zYou might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached.raydistributed_executor_backendz8Manual batching is not compatible with data parallelism.r   )
AutoConfig)rn   rl   F)rp   rn   rl   rs   )model_configr   r#   )ra   chat_templatetoolsz0.9.0r   )
EngineArgsz2Loglikelihood prefix token id used in evaluation: z0.3.0z1lora adapters only compatible with vllm > v0.3.0.	finetunedrF   )3super__init__r   ModuleNotFoundErrorpopr   rP   rQ   getV1_max_lengthr3   rv   r   r   r   r   r*   update
isinstancer7   rz   r   r   rV   rY   infors   transformersr   from_pretrained_configr%   ra   r   r   r   parse_versionr   
llm_enginer   vllm.engine.arg_utilsr   create_model_configr$   hf_chat_templatecustom_prefix_token_idrt   _max_gen_toksr   r0   )!selfri   rj   rl   rn   ra   rp   rr   rs   rt   rv   rw   rx   ry   rz   max_batch_sizer|   r}   r~   r   r   r   r   r   r   r   r   kwargsr   kwargs_resolve_hf_chat_templater   engine_argsr   	__class__rF   rG   r   ~   s   
 

	










zVLLM.__init__c                 C  s   | j jS N)ra   eos_token_idr   rF   rF   rG   eot_token_id  s   zVLLM.eot_token_idc                 C  s,   | j d ur| j S | jjd ur| jjS | jjS r   )r   ra   bos_token_idr   r   rF   rF   rG   rt      s
   
zVLLM.prefix_token_idr8   c                 C  sx   | j r| j S | jdkr| jjjjS d}|D ]}t| j|r%t| j|  S qt| j	dr9| j	j
dkr5| jS | j	j
S | jS )NrK   )n_positionsmax_position_embeddingsn_ctxmodel_max_lengthl         3Me')r   r   r   r   r   r}   hasattrr   getattrra   r   _DEFAULT_MAX_LENGTH)r   seqlen_config_attrsattrrF   rF   rG   r|   )  s   
zVLLM.max_lengthc                 C  s   | j S r   )r   r   rF   rF   rG   rx   :  s   zVLLM.max_gen_tokschat_historylist[dict[str, str]]add_generation_promptc                 C  s   z| j j|fd|| | j| jd| j}W n) tjjy?   t	d | j jdd |D fd|| | j| jd| j}Y nw t
d|S )zc
        Method to apply a chat template to a list of chat history between user and model.
        F)tokenizer   continue_final_messager   r   zHFailed to apply chat template. removing the system role in chat history.c                 S  s   g | ]
}|d  dkr|qS )rolesystemrF   )rD   msgrF   rF   rG   rH   S  s    z,VLLM.apply_chat_template.<locals>.<listcomp>r7   )ra   apply_chat_templater   r   r   jinja2
exceptionsTemplateErrorrV   rY   r   )r   r   r   chat_templatedrF   rF   rG   r   >  s:   
	

zVLLM.apply_chat_templatec                 C  s   | j jddS )N/__)ra   name_or_pathreplacer   rF   rF   rG   tokenizer_name^  s   zVLLM.tokenizer_namestring	list[int]c                 K     d S r   rF   r   r   add_special_tokensr   rF   rF   rG   
tok_encodeb     zVLLM.tok_encode	list[str]r/   c                 K  r   r   rF   r   rF   rF   rG   r   f  r   str | list[str]list[int] | list[list[int]]c                   sZ  | j sJ |s	g S t|tr|gn|| j | j i |t|| j} fddD }dd t|D }dd t|D }fdd|D }fdd|D }	g }
|rmi |ddi}|rk| j |fd	di|jng }
|	r{| j |	fd	di|jng }d gt	 }t|D ]
\}}|
| ||< qt|D ]
\}}|| ||< qt|tr|d
 S |S )Nc                   s   g | ]}t | qS rF   )r   )rD   s)
_bos_tokenrF   rG   rH     rI   z#VLLM.tok_encode.<locals>.<listcomp>c                 S  s   g | ]\}}|r|qS rF   rF   rD   ifrF   rF   rG   rH         c                 S  s   g | ]\}}|s|qS rF   rF   r   rF   rF   rG   rH     r   c                      g | ]} | qS rF   rF   rD   r   _stringrF   rG   rH         c                   r   rF   rF   r   r   rF   rG   rH     r   r   Freturn_attention_maskr   )
ra   r   r7   decodert   r   rs   	enumerate	input_idslen)r   r   r   r   special_tokens_kwargshas_prefix_flagsidx_hasidx_notstrs_hasstrs_notenc_has
kwargs_offenc_notoutjr   rF   )r   r   rG   r   k  s\   

	r.   rR   r,   ,list[SamplingParams] | SamplingParams | Nonec                   s  |r|d u rt ddddd}t|tstd|gt| }jdkrgjsgtjd+dddd t	j|D }dd t	j|D }fddt
||ddD }fdd|D }t|}t  t|S jdkrj}tjdd}tjdpt }	dd t	j|D }dd t	j|D }g t }
}ztt
||ddD ]"\}\}}ttj ||j||||	|f	d}|  |
| qi  t t|
k rz|jdd\}}t|trd|v rt|d | |< W n! ty    fd dt|
D }|rtd!| d"d Y qw t t|
k sԇ fd#dtt|
D }t|W z
|  |  W n t yG   t!j"d$dd% Y nw |
D ]!}|j#d&d |$ rj|%  |j#d'd |$ rj|&  qJS z
|  |  W n t y   t!j"d$dd% Y nw |
D ]!}|j#d&d |$ r|%  |j#d'd |$ r|&  qw j'j(d(d |D |j)d)kjd*}|S ),Nr   rK   F)temperatureprompt_logprobs
max_tokens
detokenizer-   r*   r+   r,   r.   r/   r0   r   c                 S  s(   t di | }|jdd |D ||dS )Nc                 S  r?   r@   rB   rC   rF   rF   rG   rH     rI   zIVLLM._model_generate.<locals>.run_inference_one_model.<locals>.<listcomp>rJ   rF   )r   rR   )r*   r,   r.   r0   rZ   rF   rF   rG   run_inference_one_model  s   z5VLLM._model_generate.<locals>.run_inference_one_modelc                 S     g | ]}t |qS rF   listrD   xrF   rF   rG   rH     r   z(VLLM._model_generate.<locals>.<listcomp>c                 S  r   rF   r  rD   sprF   rF   rG   rH     s    c                 3  s$    | ]\}} j || jfV  qd S r   )r*   r0   )rD   reqr  r   rF   rG   	<genexpr>  s
    
z'VLLM._model_generate.<locals>.<genexpr>Tstrictc                   s   g | ]} j | qS rF   )remoter  )r   rF   rG   rH     rI   r=   r)   r>   c                 s      | ]}t |V  qd S r   r  r  rF   rF   rG   r        c                 s  r  r   r  r  rF   rF   rG   r    s    
)targetargs   )timeoutrN   c                   s$   g | ]\}}|  s| vr|qS rF   )is_alive)rD   idxprank_resrF   rG   rH     s    zWorker processes z died unexpectedlyc                   r   rF   rF   r   r  rF   rG   rH     r   z%Failed to close vllm DP results queuerL   
      c                 S  r?   r@   rB   rC   rF   rF   rG   rH   #  rI   rb   )r,   use_tqdmr0   )r*   r+   r,   r-   r.   r/   r0   r   )*r   r   r  r   r   r   r   r   r  r   zipr   shutdownr   rP   rQ   r&   r   r   r   r_   r*   copyr0   startappendr+   RuntimeErrorr   rangeclosejoin_threadrS   rV   debugjoinr  	terminatekillr   rR   rz   )r   r.   rR   r,   inputsobject_refsresultsr2   r6   r5   procsresqrankr  r  procresult
dead_procsoutputsrF   )r  r   r   rG   _model_generate  s   












	zVLLM._model_generatelist[Instance]disable_tqdmlist[float]c              
     s  d }| j dkrt|}g }g }ttdd |D |p| jdkdD ]3\ \}tttt| 	|| j
| jd dd}d	d |D }| fd
d|D  |t| q!g }	|p]t| j }
tdt||
D ]$}||||
  }t|ddi\}}| j|dd}|	t||dd qfg }d}|D ]0}|	|||  }tdd |D }|| ||7 }|t|d  jd }| jd|f| q|S )Nrb   c                 S  s   g | ]}|j qS rF   r  rD   r  rF   rF   rG   rH   7  s    z.VLLM.loglikelihood_rolling.<locals>.<listcomp>r   )disable   rK   )
token_listprefix_tokenmax_seq_lencontext_lenc                 S  s   g | ]}d | qS )r   rF   r  rF   rF   rG   rH   I  r   c                 3  s    | ]} |fV  qd S r   rF   )rD   windowreq_idxrF   rG   r  L  r  z-VLLM.loglikelihood_rolling.<locals>.<genexpr>r
  TF)r.   r3  r	  c                 s  s    | ]	\}}|d  V  qdS )r   NrF   )rD   _nllrF   rF   rG   r  d  s    loglikelihood_rolling)rz   r   r   r   r,  r  mapr!   r    r   rt   r|   extendr  r3   r   r  _loglikelihood_tokenssumr  
cache_hookadd_partial)r   r.   r3  adaptive_batch_sizeall_windowsrequest_window_countsr   rolling_token_windowswindowsall_nllsrz   r   batchbatch_indicesbatch_windows
batch_nllsloglikelihoodscurrent_idxwindow_countrequest_nllsrequest_totalrF   r>  rG   rB  *  s^   

zVLLM.loglikelihood_rollingc              	   C  s  | j sJ g }tdd |D ddi\}}| |}dd t|||ddD }dd	 }t||d d
}	|	j| jdkr>t| jndd d}
tt||pM| j	dkdd}| j 
| j}|
D ]}t|ddi\}}t|ddi\}}g }g }g }t||ddD ]G\}}t|tsJ dt| | j||| jd\}}}t||| j| jdd\}}|| |td||d| ||||dB  q{| j|d|d}t|||ddD ])\}}}|jd j}t||d| j}|| | jd||f| |d qqZ|  |	 |S )Nc                 s  s    | ]}|j V  qd S r   r5  r6  rF   rF   rG   r  v  s    z&VLLM.generate_until.<locals>.<genexpr>r
  Tc                 S  s   g | ]\}}}||f|fqS rF   rF   )rD   abcrF   rF   rG   rH   x  s    
z'VLLM.generate_until.<locals>.<listcomp>r	  c                 S  s   t | d d  | d d fS )Nr   rK   )r   )	_requestsrF   rF   rG   _collate_gen}  s   z)VLLM.generate_until.<locals>._collate_gen)group_byrb   r   nbatch_fnzRunning generate_until requeststotalr7  descz3Expected `gen_kwargs` to be of type `dict` but got )eosdefault_max_gen_toks)rx   r}   sideverbose)r   stop)untilrx   )r.   rR   r,   ri  generate_untilrK   rF   )!ra   r  r   r   get_batchedrz   r3   r   r   r,  r   r   r   r+   rT   modify_gen_kwargsrx   r   r|   r   r  r   r1  r0  textr   r   r   rG  rH  r   r!  get_original)r   r.   r3  r[   contextall_gen_kwargscontext_encodingreqsr\  re_ordschunkspbarrd  chunkcontext_and_encodingcontext_encoding_truncatedr,   _cache_gen_kwargstoks
gen_kwargsr   ri  rx   contoutput_context_gen_kwargsgenerated_textrF   rF   rG   rj  o  s   

	




zVLLM.generate_until2list[tuple[tuple[str, str], list[int], list[int]]]list[tuple[float, bool]]c              	   C  sn  | j d }g }dd }t||d}|j| jdkrt| jndd d}tt||dd	}|D ]}	g }
g }|	D ]?\}}}t||  }|krPtd
| d| d || | d  }t|t	dt|t| |  }|

| |
| q5| j|
dd}t|||	|
ddD ]'\}}\}}}}| j|||d}|
| |d ur| jd|| |d qq-|  ||S )NrK   c                 S  s"   | d | d  }t | t|fS )NrK   r8  )r   tuple)r  rz  rF   rF   rG   _collate  s   z,VLLM._loglikelihood_tokens.<locals>._collate)sort_fnrb   r   r^  zRunning loglikelihood requestsra  zContext length z exceeds max length (z). Truncating context.F)r.   rR   Tr	  )tokensr0  ctxlenloglikelihood)r|   r   rk  rz   r3   r   r   rV   rY   maxr  r1  r  _parse_logprobsrG  rH  r   r!  rn  )r   r.   r3  max_cxt_lenr[   r  re_ordrt  ru  rv  r'  ctxlensr@  context_enccontinuation_encfull_lengthinpr  r0  r}  	cache_keyanswerrF   rF   rG   rE    sT   




zVLLM._loglikelihood_tokensr  r  r  tuple[float, bool]c           	        s   |j }dd   fdd|D }tdd t| |d ||d dd	D }d}t| |d ||d dd	D ]\}}|rOt||jd
}||krOd} ||fS q7||fS )a  Process logprobs and tokens.

        :param tokens: list
            Input tokens (potentially left-truncated)
        :param outputs: RequestOutput
            Contains prompt_logprobs
        :param ctxlen: int
            Length of context (so we can slice them away and only keep the predictions)
        :return:
            continuation_logprobs: float
                Log probabilities of continuation tokens
            is_greedy: bool
                Whether argmax matches given continuation exactly
        c                 S  s   t | d| S )Nlogprob)r   )r  rF   rF   rG   coerce_logprob_to_num"  s   z3VLLM._parse_logprobs.<locals>.coerce_logprob_to_numc                   s.   g | ]}|d ur fdd|  D nd qS )Nc                   s   i | ]	\}}| |qS rF   rF   )rD   tokenr  r  rF   rG   
<dictcomp>-  s    z3VLLM._parse_logprobs.<locals>.<listcomp>.<dictcomp>)items)rD   logprob_dictr  rF   rG   rH   ,  s    
z(VLLM._parse_logprobs.<locals>.<listcomp>c                 s  s    | ]
\}}| |V  qd S r   )r   )rD   r  r  rF   rF   rG   r  8  s
    
z'VLLM._parse_logprobs.<locals>.<genexpr>NTr	  )keyF)r   rF  r  r  r   )	r  r0  r  continuation_logprobs_dictscontinuation_logprobs	is_greedyr  r  	top_tokenrF   r  rG   r    s,   

 zVLLM._parse_logprobsr{  dict[str, Any]rd  str | list[str] | Nonere  %tuple[dict[str, Any], list[str], int]c                 C  sf   t | |d}t|ddt|tr|d n|d}t|d|}|dd ddd	|B }|||fS )
a2  Process generation kwargs into vLLM-compatible format.

        Args:
            gen_kwargs: Raw generation kwargs from the request.
            eos: EOS token string for stop sequence handling.
            default_max_gen_toks: Default max tokens if not specified in gen_kwargs.

        Returns:
            A tuple of (kwargs, stop_sequences, max_gen_toks) where:
            - kwargs: Processed kwargs ready for SamplingParams
            - stop_sequences: List of stop sequences including EOS
            - max_gen_toks: Maximum tokens to generate
        )re  ri  Nr   )rd  rx   	do_sampleF)skip_special_tokensspaces_between_special_tokens)r   r   r   r   r  r3   )r{  rd  re  r  ri  rx   rF   rF   rG   rl  M  s   
zVLLM.modify_gen_kwargs)rb   NFNrb   NNNrK   Nrc   rd   rb   NNNre   rf   rK   NTNNrg   rh   )2ri   r7   rj   rk   rl   rm   rn   ro   ra   rm   rp   rq   rr   rm   rs   ro   rt   ru   rv   r3   rw   rm   rx   r3   ry   r3   rz   r{   r|   ru   r}   ru   r~   r3   r   r   r   r3   r   rm   r   r   r   r   r   rm   r   r3   r   r   )r8   r3   )T)r   r   r   r   r8   r7   )r8   r7   r   )r   r7   r8   r   )r   r   r8   r/   )r   r   r8   r   )FN)r.   r/   rR   r   r,   r   )F)r.   r2  r3  r   r8   r4  )r.   r2  r3  r   r8   r   )r.   r  r3  r   r8   r  )r  r  r  r3   r8   r  )Nrc   )r{  r  rd  r  re  r3   r8   r  )rU   
__module____qualname__r   __annotations__r   propertyr   rt   r|   rx   r   r   r   r   r1  rB  rj  rE  staticmethodr  rl  __classcell__rF   rF   r   rG   r`   y   s   
  


 @ Fe=>r`   )r)   )r*   r+   r,   r-   r.   r/   r0   r   r1   r   r2   r3   r4   r3   r5   r3   r6   r7   r8   r9   )G
__future__r   rW   loggingrP   importlib.metadatar   importlib.utilr   multiprocessingr   r   queuer   timer   typingr	   r
   r   r   r   r   r   more_itertoolsr   packaging.versionr   r   r   r"   r   r   r   vllm.lora.requestr   lm_eval.api.modelr   lm_eval.api.registryr   lm_eval.models.utilsr   r   r   r   r   r   r   r   r   lm_eval.utilsr    r!   vllm.entrypoints.chat_utilsr$   vllm.tokenizersr%   r   !vllm.transformers_utils.tokenizervllm.utils.network_utilsr&   
vllm.utilsr   r'   lm_eval.api.instancer(   	getLoggerrU   rV   r_   r`   rF   rF   rF   rG   <module>   sR    ,
9