o
    5tiS                     @   s   d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZmZ eeZzd dlZW n	 ey\   Y nw er`	 ed
G dd deZdS )    N)	find_spec)TYPE_CHECKINGDictListOptionalTupleUnion)tqdm)Instance)
TemplateLM)register_model)Collatorhandle_stop_sequencespostprocess_generated_text)get_rolling_token_windowsmake_disjoint_windowsglangc                (       s  e Zd ZdZ																				dOd
edeeef dededee dee dedededededee dededee	 dededee dee f& fddZ
	dPdee d ed!ee	 fd"d#Z	dPdee d ed!ee fd$d%Z							dQdeee  d&ed'eee edf d(ed)ed*efd+d,Zed-d. Zed/d0 Zed1d2 Zed3d4 Z			dRd5eeee f d6ed7ed8ed!eee eee  f f
d9d:Zd;ee d!efd<d=Zed!efd>d?ZdPd@eeef d!efdAdBZ	dSdCeeeef  dDed!efdEdFZ	dPdeeeeef ee ee f  d ed!eee	ef  fdGdHZed;edIed!ee	ef fdJdKZedLed!efdMdNZ   Z!S )TSGLangLMi      N   FautoTcuda
pretrained
batch_sizemax_model_lenmax_gen_toksadd_bos_tokentokenizer_pathtokenizer_modeload_formattrust_remote_codedtypekv_cache_dtypecontext_lengthdevicechunked_prefill_sizemem_fraction_staticdp_sizetp_sizeprefix_token_idthink_end_tokenc                    s,  t    tdstdd|v s|d u sJ d|d u s%|d u s%J d|| _|d ur.|n|| _t|| _t|| _||||	|
||||| j| j|d| _	| j	
| t|tr^d|v r^dnt|| _| jdkrmtd	 tjdi | j	| _| jjj| _|| _|| _d
| v rd| _td || _d S )Nr   zattempted to use 'sglang' LM type, but package `sglang` is not installed. Please install sglang via official document here:https://docs.sglang.ai/start/install.html#install-sglangr   zSGLang only supports CUDAzDEither context_length or max_model_len may be provided, but not both)
model_pathr   r   r    r!   r"   r#   r%   r'   r)   r(   r&   r   r   zData parallelism will be deprecated in the future version of SGLang. See here: https://docs.sglang.ai/backend/server_arguments.html#data-parallelism .gemmaTzeFound 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it. )super__init__r   ModuleNotFoundErrorr+   _max_lengthinttensor_parallel_sizedata_parallel_size
model_argsupdate
isinstancestrr   eval_loggerwarningsglEnginemodeltokenizer_manager	tokenizer_max_gen_toksr   lowerinfocustom_prefix_token_id)selfr   r   max_batch_sizer   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   kwargs	__class__r.   S/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/models/sglang_causallms.pyr0   %   s`   




zSGLangLM.__init__requestsdisable_tqdmreturnc              
      s~  d }| j dkrt|}g }g }ttdd |D |p| jdkdD ]3\ \}tttt| 	|| j
| jd dd}dd |D }| fd	d
|D  |t| q!g }	|p]t| j }
tdt||
D ]}||||
  }t| \}}| j|dd}|	t|| qfg }d}|D ]0}|	|||  }tdd
 |D }|| ||7 }|t|d  jd }| jd|f| q|S )Nr   c                 S   s   g | ]}|j qS r.   args.0reqr.   r.   rJ   
<listcomp>   s    z2SGLangLM.loglikelihood_rolling.<locals>.<listcomp>r   )disabler   )
token_listprefix_tokenmax_seq_lencontext_lenc                 S   s   g | ]}d | qS )Nr.   )rQ   xr.   r.   rJ   rS      s    c                 3   s    | ]} |fV  qd S rY   r.   )rQ   windowreq_idxr.   rJ   	<genexpr>   s    z1SGLangLM.loglikelihood_rolling.<locals>.<genexpr>F)rK   rL   c                 s   s    | ]	\}}|d  V  qdS )r   Nr.   )rQ   _nllr.   r.   rJ   r^      s    loglikelihood_rolling)r   len	enumerater	   ranklistmapr   r   
tok_encoder*   
max_lengthextendappendr3   rangezip_loglikelihood_tokenssumrO   
cache_hookadd_partial)rE   rK   rL   adaptive_batch_sizeall_windowsrequest_window_countsstringrolling_token_windowswindowsall_nllsr   ibatchbatch_indicesbatch_windows
batch_nllsloglikelihoodscurrent_idxwindow_countrequest_nllsrequest_totalr.   r\   rJ   ra   |   s^   

zSGLangLM.loglikelihood_rollingc                 C   s  g }t dd |D  \}}| j|| jd}dd t |||D }dd }t||d d}|j| jd	kr7t| jnd
d d}	tt||pF| j	d
kdd}
| j
| j}|	D ]}t | \}}t | \}}g }g }t ||D ][\}}t|trt|}t|dd |d}n	tdt| d| v r|d}n| j}| j| }t||kr||| d   n|| | |}||||dB  qj| j|d|d}t ||D ]%\}}|dd}t||| j}|| | jd||f| |
 d qqS|
!  |"|S )Nc                 s   s    | ]}|j V  qd S rY   rN   rP   r.   r.   rJ   r^      s    z*SGLangLM.generate_until.<locals>.<genexpr>)add_special_tokensc                 S   s   g | ]\}}}||f|fqS r.   r.   )rQ   abcr.   r.   rJ   rS      s    z+SGLangLM.generate_until.<locals>.<listcomp>c                 S   s   t | d d  | d d fS )Nr   r   )rb   )	_requestsr.   r.   rJ   _collate_gen   s   z-SGLangLM.generate_until.<locals>._collate_gen)group_byr   r   nbatch_fnzRunning generate_until requeststotalrT   descuntil)eosz/Expected `kwargs` to be of type `dict` but got r   )
max_tokensstopT)rK   generatesampling_paramstext generate_untilr   )#rl   rg   r   r   get_batchedr   r3   r	   rb   rd   r@   decodeeot_token_idr8   dictcopydeepcopyr   pop
ValueErrortypekeysr   rh   rj   modify_gen_kwargs_model_generategetr   r+   ro   rp   r7   closeget_original)rE   rK   rL   rescontextall_gen_kwargscontext_encodingr   re_ordschunkspbarr   chunkcontext_and_encodingcontext_encoding_truncatedr   rZ   
gen_kwargsrG   r   r   max_ctx_lencontoutputgenerated_textr.   r.   rJ   r      sx   








zSGLangLM.generate_untilr   r   return_logprobtop_logprobs_numlogprob_start_lenc                 C   sR   |s|r|ni }| ddd t|ts|gt| }| jj|||||d}|S )Nr   r   )temperaturemax_new_tokens)	input_idsr   r   r   r   )r7   r8   r   rb   r>   r   )rE   rK   r   r   r   r   r   outputsr.   r.   rJ   r      s"   

zSGLangLM._model_generatec                 C   s   | j jS rY   )r@   eos_token_idrE   r.   r.   rJ   r   >  s   zSGLangLM.eot_token_idc                 C   s,   | j d ur| j S | jjd ur| jjS | jjS rY   )rD   r@   bos_token_idr   r   r.   r.   rJ   r*   C  s
   
zSGLangLM.prefix_token_idc                 C   s6   | j r| j S t| jdrt| jjdr| jjjS | jS )Nr?   rX   )r2   hasattrr>   r?   rX   _DEFAULT_MAX_LENGTHr   r.   r.   rJ   rh   L  s   
zSGLangLM.max_lengthc                 C   s   | j S rY   )rA   r   r.   r.   rJ   r   V  s   zSGLangLM.max_gen_toksrt   left_truncate_lenr   
truncationc                    sT   |s| j }| j|||ddj} r(t|ts! fdd|D }|S |  d  }|S )NF)r   r   return_attention_maskc                    s   g | ]	}|  d  qS rY   r.   )rQ   encr   r.   rJ   rS   n  s    z'SGLangLM.tok_encode.<locals>.<listcomp>)r   r@   r   r8   r9   )rE   rt   r   r   r   encodingr.   r   rJ   rg   [  s    
zSGLangLM.tok_encodetokensc                 C   s   d S rY   r.   )rE   r   r.   r.   rJ   
tok_decodet  s   zSGLangLM.tok_decodec                 C      dS )z
        Return the name of the model's tokenizer and/or the accompanying chat template.
        The returned string is used to cache requests.

        Returns:
            str: The name of the model's tokenizer and/or chat template.
        Nr.   r   r.   r.   rJ   tokenizer_namex  s   	zSGLangLM.tokenizer_namechat_templatec                 C   r   )a  
        Get the appropriate chat template for the model based on the `chat_template` argument.

        This method returns the chat template string to build the prompt from a chat history.
        The chat template is saved in the evaluation results for reproducibility.
        Boolean arguments should be used with models that have only one chat template,
        while string arguments are used with models that have multiple chat templates.
        For the reference implementation, see HFLM class in `lm_eval.models.huggingface`.

        Args:
            chat_template (Union[bool, str]): Specifies whether to apply a chat template:
                - If False: Do not apply any chat template.
                - If True: Apply the default chat template.
                - If str: Apply the specified chat template by name.

        Returns:
            str: The selected chat template in Jinja format.
        Nr.   )rE   r   r.   r.   rJ   r     s   zSGLangLM.chat_templatechat_historyadd_generation_promptc                 C   s   | j j|d|| d}|S )zc
        Method to apply a chat template to a list of chat history between user and model.
        F)tokenizer   continue_final_message)r@   apply_chat_template)rE   r   r   chat_templatedr.   r.   rJ   r     s   zSGLangLM.apply_chat_templatec                 C   s>  g }dd }t ||d}|j| jdkrt| jndd d}tt||dd}|D ]m}g }	g }
|D ]+\}}}|| | j d  }t|tdt|t| | j  }|	| |
| q0| j	|	d	d
ddd}t
||
||	D ]'\}}\}}}}| j|||d}|| |d ur| jd|| |d qmq(|  ||S )Nc                 S   s"   | d | d  }t | t|fS )Nr      )rb   tuple)rZ   toksr.   r.   rJ   _collate  s   z0SGLangLM._loglikelihood_tokens.<locals>._collate)sort_fnr   r   r   zRunning loglikelihood requestsr   FTr   )rK   r   r   r   r   )r   r   ctxlenloglikelihoodr   )r   r   r   r3   r	   rb   rh   maxrj   r   rl   _parse_logprobsro   rp   r7   r   r   )rE   rK   rL   r   r   re_ordr   r   r   inputsctxlens	cache_keycontext_enccontinuation_encinpr   r   r   r_   answerr.   r.   rJ   rm     sV   


zSGLangLM._loglikelihood_tokensr   c           
      C   s   |d d }t dd ||d D }|d d }d}t| |d ||d D ]\}}|rCt|dd	 d
d }	|	|krCd} ||fS q(||fS )a  Process logprobs and tokens.

        :param tokens: list
            Input tokens (potentially left-truncated)
        :param outputs:
            Contains input_token_logprobs and input_top_logprobs
        :param ctxlen: int
            Length of context (so we can slice them away and only keep the predictions)
        :return:
            continuation_logprobs: float
                Log probabilities of continuation tokens
            is_greedy: bool
                Whether argmax matches given continuation exactly
        	meta_infoinput_token_logprobsc                 s   s    | ]\}}}|V  qd S rY   r.   )rQ   logprobr_   r.   r.   rJ   r^     s    

z+SGLangLM._parse_logprobs.<locals>.<genexpr>Ninput_top_logprobsTc                 S   s   | d S )Nr   r.   )rZ   r.   r.   rJ   <lambda>   s    z*SGLangLM._parse_logprobs.<locals>.<lambda>)keyr   F)rn   rl   r   )
r   r   r   continuation_logprobs_listscontinuation_logprobstop_logprobs_lists	is_greedytokentop_logprobs	top_tokenr.   r.   rJ   r     s   
" zSGLangLM._parse_logprobsrG   c                 C   sb   |  dd| d< | dd }|du rd| vrtd d| d< |  dd| d< |  dd| d< | S )Nr   g        	do_sampleFzSGot `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ...skip_special_tokensspaces_between_special_tokens)r   r   r:   debug)rG   r   r.   r.   rJ   r     s   zSGLangLM.modify_gen_kwargs)r   NNr   FNr   r   Tr   r   Nr   r   Nr   r   NN)F)NFNFr   r   )NFF)T)"__name__
__module____qualname__r   r9   r   r3   r   boolfloatr0   r   r
   ra   r   r   r   propertyr   r*   rh   r   rg   r   r   r   r   r   rm   staticmethodr   r   r   __classcell__r.   r.   rH   rJ   r   !   s   
X
F
a




	



 
; #r   ) r   loggingimportlib.utilr   typingr   r   r   r   r   r   r	   lm_eval.api.instancer
   lm_eval.api.modelr   lm_eval.api.registryr   lm_eval.models.utilsr   r   r   lm_eval.utilsr   r   	getLoggerr   r:   r   r<   r1   r   r.   r.   r.   rJ   <module>   s(     
