from __future__ import annotations

import logging
import os
from datetime import timedelta
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal

import jinja2
import torch
import torch.nn.functional as F
import transformers
from accelerate import (
    Accelerator,
    InitProcessGroupKwargs,
    find_executable_batch_size,
)
from accelerate.utils import get_max_memory
from huggingface_hub import HfApi
from packaging import version
from packaging.version import parse as vparse
from tqdm import tqdm
from transformers.models.auto.modeling_auto import (
    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
    MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
)

from lm_eval import utils
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
    Collator,
    _add_special_kwargs,
    configure_pad_token,
    handle_stop_sequences,
    has_bos_prefix,
    normalize_gen_kwargs,
    postprocess_generated_text,
)
from lm_eval.models.utils_hf import (
    clear_torch_cache,
    get_dtype,
    pad_and_concat,
    stop_sequences_criteria,
)

if TYPE_CHECKING:
    from collections.abc import Iterator, Sequence

    from transformers.quantizers.auto import AutoQuantizationConfig

    from lm_eval.api.instance import Instance

eval_logger = logging.getLogger(__name__)

# Sentinel that `transformers` assigns to `tokenizer.model_max_length` when the
# tokenizer config sets no limit (value assumed to match transformers'
# VERY_LARGE_INTEGER; the exact constant was not recoverable from the dump).
TOKENIZER_INFINITY = int(1e30)


@register_model("hf-auto", "hf", "huggingface")
class HFLM(TemplateLM):
    """An abstracted Huggingface model class. Enables usage with both models of
    `transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes.

    Supports data-parallel multi-GPU with HF Accelerate.
    """

    AUTO_MODEL_CLASS = None
    _DEFAULT_MAX_LENGTH = 2048
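
    # Illustrative (not prescriptive) usage sketch: the evaluation harness
    # constructs this class and then drives it through `loglikelihood`,
    # `loglikelihood_rolling`, and `generate_until` with lists of `Instance`
    # objects, e.g.
    #
    #   lm = HFLM(pretrained="gpt2", device="cuda", batch_size="auto")
    #   outputs = lm.generate_until(requests)  # `requests` built by the harness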
pretrained"str | transformers.PreTrainedModelbackend'Literal['default', 'causal', 'seq2seq']revision
str | None	subfolderstr	tokenizerTstr | transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerFast | None
truncationbool | Nonelogits_cachebool
max_length
int | Nonedevicedtypestr | torch.dtype | Nonesoftmax_dtypemixed_precision_dtype
batch_sizeint | str | Nonemax_batch_sizetrust_remote_codeuse_fast_tokenizeradd_bos_tokenprefix_token_idparallelizemax_memory_per_gpumax_cpu_memoryoffload_folderstr | os.PathLike | Nonepeftdeltaautogptqbool | str | None	gptqmodel	gguf_filethink_end_tokenstr | int | Noneenable_thinkingchat_template_argsdict[str, Any] | NonereturnNonec           '   
     s  t    t|ts%td |rJ d|| _| jj| _| jj	| _
d} nt|	ts,J t|ts3J t|ttfs<J ttddd}!t|!gd}"|"jdkrR|"| _|"jj}#d	|#v r`tj } nd
|#v rjtj } nd|#v rttj } ntj } |s|"jdkstd	dgdd t| D  ddg dd t| D  dd t| D  }$|	r|	|$v rt|	| _td|	 d |	dv rttjtdk rtdtj n?td tdtj   tj rtd	ntd| _n|	d	krtd|	 d t| dr| jjnt|	| _t|}| j |||||d | j!| j	||d | j"||||||||d t#| j	d d  }%d urRt|%t$rRdd!l%m&}& |&'|%}%t|tr| j(d=i d"|d#|d$|
d%|d&|d'| d(|d)|d*|d+|d,|d-|d.|d/|d |%d0|| t| j)tj*j+r| j),  | j)-  t|tr|. rt|n|| _/|| _0|| _1| j2j3| _3t4| j2| j	d1| _2|d ur|pi t$|d2B ni | _5|| _6|| _7|| _8|| _9|| _:|| _;d| _<i | _=|| _>|d urt?|nd | _@|d urt?|nd | _At|Bd3r1|Cd4}|d | _DtE|dkr-tF|d nd| _<nt|| _Dt|tr| dksIt| jdkrn|sn|snt| dsnz	| j)G| j W n tHym   tId5 Y nw | dkr|"jdkr|rtd6 n| |"jkrtd7|"j d8 | jjJrtd9|  d: t|"j | _|"| _| jjK| _L| jj| _Mnd| _Ld| _Mntd; d| _Ld| _M|| _N|d urtd<| jO  d S d S )>Nz`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.zW`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`r   4   )weeks)timeout)kwargs_handlersr-   r+   npuxpucpuc                 S     g | ]}d | qS )zcuda: .0irf   rf   N/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/models/huggingface.py
<listcomp>       z!HFLM.__init__.<locals>.<listcomp>mpsmps:0c                 S  re   )znpu:rf   rg   rf   rf   rj   rk      rl   c                 S  re   )zxpu:rf   rg   rf   rf   rj   rk      rl   zUsing device '')rm   rn   z2.1z$mps requires torch >= 2.1. You have zDevice not specifiedzCuda Available? z9Using `accelerate launch` or `parallelize=True`, device 'z(' will be overridden when placing model.acceleratorr4   rH   rV   r6   )configr2   rH   )r4   r6   rH   rI   rV   rJ   quantization_configr"   r0   r4   rA   rH   rL   gpusrM   rN   rO   rQ   rR   rS   rU   rV   r6   )model_config)rY   r,   :zFailed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore.zYou are both using a HF Accelerate `device_map` (`--model_args parallelize=True`) and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available.zWARNING: The number of total system GPUs does not match the number of spawned processes. If you would like to use data parallelism, please launch the script with 'accelerate launch *script*'. Current run will proceed with z	 devices.zUsing z devices with data parallelismzPassed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integrationz2Loglikelihood prefix token id used in evaluation: rf   )Psuper__init__
isinstancer7   eval_loggerwarning_modelr@   _devicerr   _configintr	   r   r   num_processesrp   typetorchr+   device_countrb   rc   setrangeinfor   r   __version__RuntimeErroris_availablehasattr_get_config_get_backend_create_tokenizergetattrdicttransformers.quantizersr#   	from_dict_create_modelmodelnnModuleevaltie_weightsisdigitrW   r:   r<   r8   
vocab_sizer   rZ   rJ   _max_lengthr0   rR   rQ   r4   batch_schedulebatch_sizesrG   r   rC   rD   
startswithsplitbatch_size_per_gpulenfloatto
ValueErrordebugis_local_main_processlocal_process_index_rank_world_sizecustom_prefix_token_idrK   )'selfr0   r2   r4   r6   r8   r:   r<   r>   r@   rA   rC   rD   rE   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rQ   rR   rS   rU   rV   rW   rY   rZ   kwargsrt   accelerator_kwargsrp   device_typedevice_listrs   r#   	__class__rf   rj   rx   F   s  
+








	

	







"






zHFLM.__init__

    def _get_accelerate_args(
        self,
        parallelize: bool | None = None,
        device_map: str | None = "auto",
        max_memory_per_gpu: int | str | None = None,
        max_cpu_memory: int | str | None = None,
        offload_folder: str = "./offload",
        gpus: int | None = None,
    ) -> dict:
        """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
        num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
        num_machines = int(os.environ.get("WORLD_SIZE", 0)) // num_local_processes
        if num_machines == 0 and hasattr(self, "accelerator"):
            eval_logger.info(
                "We are not in a distributed setting for accelerate. "
                "Setting model_parallel to False."
            )
            parallelize = False

        if parallelize is None:
            # if unset by the user, enable model parallelism whenever there are
            # more local GPUs than local processes
            max_memory_all_gpus = get_max_memory()
            if "cpu" in max_memory_all_gpus:
                del max_memory_all_gpus["cpu"]
            parallelize = bool(num_local_processes < len(max_memory_all_gpus))
            eval_logger.info(
                f"Setting model parallel to {parallelize} since "
                f"the number of local processes is {num_local_processes} "
                f"and the number of GPUs is {len(max_memory_all_gpus)}"
            )

        args = {}
        if parallelize:
            if max_memory_per_gpu is not None:
                max_memory_per_gpu_map = {
                    device_idx: max_memory_per_gpu for device_idx in range(gpus)
                }
            else:
                max_memory_all_gpus = get_max_memory()
                if "cpu" in max_memory_all_gpus:
                    del max_memory_all_gpus["cpu"]
                if not hasattr(self, "accelerator"):
                    max_memory_per_gpu_map = dict(max_memory_all_gpus)
                else:
                    # when also launched with accelerate, give each process only
                    # its 1/num_processes share of the visible GPUs
                    max_memory_per_gpu_map = {
                        k: v
                        for k, v in max_memory_all_gpus.items()
                        if k % num_local_processes
                        == (self.accelerator.process_index % num_local_processes)
                    }
            if max_cpu_memory is not None:
                max_memory_per_gpu_map["cpu"] = max_cpu_memory
            args["max_memory"] = max_memory_per_gpu_map
            args["device_map"] = "auto" if device_map is None else device_map
            args["offload_folder"] = offload_folder
            eval_logger.info(
                f"Model parallel was set to True, setting max memory per GPU to "
                f"{max_memory_per_gpu_map} and device map to {args['device_map']}"
            )
        elif device_map is None:
            if hasattr(self, "accelerator"):
                device_map = {"": f"{self.accelerator.device}"}
            else:
                device_map = {"": str(self.device)}
            args["max_memory"] = None
            args["device_map"] = device_map
            eval_logger.info(
                f"Model parallel was set to False, max memory was not set, "
                f"and device map was set to {device_map}"
            )
        else:
            args["max_memory"] = None
            args["device_map"] = None
            eval_logger.info("Model parallel was set to False.")
        return args
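
    # Illustrative shapes of the dict returned above (values are examples, not
    # defaults): model parallelism on yields something like
    #   {"max_memory": {0: "40GiB", 1: "40GiB"}, "device_map": "auto",
    #    "offload_folder": "./offload"}
    # while model parallelism off with no device_map given yields
    #   {"max_memory": None, "device_map": {"": "cuda:0"}}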
zHFLM._get_accelerate_argsc                 C     | j S N)r~   r   rf   rf   rj   rr     s   zHFLM.configc                 C  s   t | dr| j| jS | jS )Nrp   )r   rp   unwrap_modelr|   r   rf   rf   rj   r     s   
z
HFLM.modelr   c                 C  s   | j jS r   )r8   eos_token_idr   rf   rf   rj   eot_token_id  s   zHFLM.eot_token_idc                 C  s,   | j d ur| j S | jjd ur| jjS | jjS r   )r   r8   bos_token_idr   r   rf   rf   rj   rK     s
   
zHFLM.prefix_token_idc                 C  sf   | j r| j S d}|D ]}t| jj|rt| jj|  S q
t| jdr0| jjtkr,| jS | jjS | jS )N)n_positionsmax_position_embeddingsn_ctxmodel_max_length)	r   r   r   rr   r   r8   r   TOKENIZER_INFINITY_DEFAULT_MAX_LENGTH)r   seqlen_config_attrsattrrf   rf   rj   r>     s   zHFLM.max_lengthc                 C  s   dS )N   rf   r   rf   rf   rj   max_gen_toks  s   zHFLM.max_gen_toksc                 C  r   r   )r   r   rf   rf   rj   rE        zHFLM.batch_sizec                 C  r   r   )r}   r   rf   rf   rj   r@     r   zHFLM.devicec                 C  r   r   )r   r   rf   rf   rj   rank  r   z	HFLM.rankc                 C  r   r   )r   r   rf   rf   rj   

    @property
    def world_size(self):
        return self._world_size

    @property
    def tokenizer_name(self) -> str:
        return self.tokenizer.name_or_path.replace("/", "__")

    def _get_backend(
        self,
        config: transformers.PretrainedConfig | transformers.AutoConfig,
        backend: Literal["default", "causal", "seq2seq"] = "default",
        trust_remote_code: bool | None = False,
    ) -> None:
        """Helper method during initialization.

        Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
        sets `self.AUTO_MODEL_CLASS` appropriately if not already set.

        **If not calling HFLM.__init__() or HFLM._get_backend() within a subclass of HFLM,
        user must set `self.backend` to be either "causal" or "seq2seq" manually!**
        """
        assert backend in ("default", "causal", "seq2seq")
        if backend != "default":
            # if we've settled on a non-default backend, use that manually
            self.backend = backend
            eval_logger.info(
                f"Overrode HF model backend type, and using type '{self.backend}'"
            )
        else:
            # determine the default HF backend for this model from its config
            if getattr(config, "model_type", None) in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
                self.backend = "causal"
                eval_logger.debug(f"Using model type '{self.backend}'")
            elif (
                getattr(config, "model_type", None)
                in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
            ):
                self.backend = "seq2seq"
                eval_logger.debug(f"Using model type '{self.backend}'")
            else:
                if not trust_remote_code:
                    eval_logger.warning(
                        "HF model type is neither marked as CausalLM or Seq2SeqLM. "
                        "This is expected if your model requires `trust_remote_code=True` "
                        "but may be an error otherwise. Setting backend to causal"
                    )
                # if the model type is in neither registry, default to causal
                self.backend = "causal"
                eval_logger.info(
                    f"Model type cannot be determined. Using default model type '{self.backend}'"
                )

        if self.AUTO_MODEL_CLASS is None:
            if self.backend == "causal":
                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
            elif self.backend == "seq2seq":
                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
zHFLM._get_backendrq   c                C  s   t jj|||||d| _dS )z/Return the model config for HuggingFace models.rq   N)r   
AutoConfigfrom_pretrainedr~   )r   r0   r4   rH   rV   r6   rf   rf   rj   r   .  s   
zHFLM._get_configrs   AutoQuantizationConfig | Nonec                 K  s   |pi }| | j||d|||	|d |sM|sM|dr8ttjtdks+J d|d }r8t||d< | jj|f|t|||||d|| _	nr|rU|rUt
d|rzd	d
lm} W n tys } zt|dd|d}~ww |j|f||du rdnt|j|du rdn|dd|| _	|rzd	dlm} W n ty } zt|dd|d}~ww |j|fd|i|| _	|
r|rt
d|
r*d	dlm}m} |drt|tdk rtdt| j	jdrt| j	jjdr| j	jjj}n| j	jj}|t| jkrtd| dt| j d | j	 t| j |j| j	|
|d| _	dS |r|r5t!d | jj|f|t||d |}| j	" # D ]>\}}z| j$|" | 7  _$W qL t%ys } zt%d!| |d}~w t&y } zt'd"| d#| |d}~ww ~dS dS )$a  Initializes an HF or HF-compatible PreTrainedModel from scratch
        inside HFLM, using the kwargs passed into self.__init__().

        Also handles functionality such as AutoGPTQ usage and PEFT wrapping.

        For future similar extensions to AutoGPTQ that are not core to HF's ecosystem,
        (such as PyTorch models that are nearly, but not quite, fully mirroring
        HF's public interface relied on in this HFLM class)
        please consider subclassing HFLM and overriding this and other methods as needed.
        r   )rL   r   rM   rN   rO   rt   load_in_4bitz4.30.0z,load_in_4bit requires transformers >= 4.30.0bnb_4bit_compute_dtype)r4   rA   rH   rV   rs   r6   zDCannot use both 'autogptq' and 'gptqmodel' options at the same time.r   )AutoGPTQForCausalLMz8Tried to load auto_gptq, but auto-gptq is not installed zPplease install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]NTz.safetensors)rH   model_basenameuse_safetensors)	GPTQModelz8Tried to load gptqmodel, but gptqmodel is not installed zplease install gptqmodel via `pip install gptqmodel --no-build-isolation` or `pip install lm-eval[gptqmodel] --no-build-isolation`rH   z<Cannot use both 'peft' and 'delta' options at the same time.)	PeftModelr   z0.4.0z#load_in_4bit requires peft >= 0.4.0text_configr   z#Model config indicates vocab_size='z(', but found tokenizer with vocab size 'z$'. Resizing model embedding layer...)r4   zHDelta weights might trigger unexpected behavior when used with AutoGPTQ.)r4   rA   rH   z*Delta model is missing weights for layer: z%Failed to add delta weights to layer 	. Error: )(updater   r   vparser   r   r   r   r   r|   r   	auto_gptqr   ModuleNotFoundErrorr   from_quantizedr   stemendswithrU   r   rQ   r   AssertionErrorr   rr   r   r   r   r8   rz   r   resize_token_embeddingsr{   
state_dictr   dataKeyError	Exceptionr   )r   r0   r4   rA   rH   rL   rt   rM   rN   rO   rQ   rR   rS   rU   rV   rs   r6   r   model_kwargscompute_dtyper   	exceptionr   r   PEFT_VERSIONr   _model_deltanameparamerf   rf   rj   r   @  s
  #




zHFLM._create_modelc	                 C  s   ||d}	|s|dur||	d< n||	d< |dur||	d< |r"||	d< |rFt |tr6tjj|fi |	| _dS t |tjtjfsAJ || _dS t |trN|}
n| jj	}
tjj|
fi |	| _dS )zHelper method during initialization.

        Create a tokenizer object corresponding to the correct
        tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.
        )r4   rH   NrV   use_fastrJ   r6   )
ry   r7   r   AutoTokenizerr   r8   PreTrainedTokenizerPreTrainedTokenizerFastr   r   )r   r0   r8   r4   rH   rI   rV   rJ   r6   r   
model_namerf   rf   rj   r     s@   



zHFLM._create_tokenizerr   requestsSequence | Noneposc              
     s.  |r4|| \}}}t || jd  d  d d t |jd  d  t |jd  d   nj tjdd
 fdd}z| }W n tyl } zdt|v rad}n W Y d }~nd }~ww jdkrtj|gj	d	}	j
|	    }
t|
}t  |S t  |S )Nr-   )starting_batch_sizerE   r   c                   s   j dkr(t }tj| |fjd }tj| |fjd }||d}ni }tj| fjd }tdD ]}tjj	|fi |dj
d}q:| S )Nr   r@   	attn_masklabels   r  dimrA   )r2   maxr   onesr@   longr   Flog_softmax_model_callrC   )rE   lengthbatched_conts
test_batchcall_kwargs_outmax_cont_encmax_context_encr>   r   rf   rj   forward_batch+  s0   



z.HFLM._detect_batch_size.<locals>.forward_batchzNo executable batch size foundr  )rE   r   )r   r>   r
   rG   r   r7   r   r   tensorr@   rp   gatherrd   detachnumpytolistminr   )r   r  r  r"  context_enccontinuation_encr'  rE   r  
max_rnk_bsgatheredrf   r$  rj   _detect_batch_size  s<   


zHFLM._detect_batch_sizestringadd_special_tokensleft_truncate_len	list[int]c                 K  sZ   t || j}|d u rt|| j| jrd|d< | jj|fi |}|r+|| d  }|S )NFr4  )r   rJ   r   r8   decoderK   encode)r   r3  r4  r5  r   special_tokens_kwargsencodingrf   rf   rj   
tok_encodeZ  s   	
zHFLM.tok_encodeleftstrings	list[str]padding_side!tuple[torch.Tensor, torch.Tensor]c           	      C  s  | j j}|| j _i }| jdkr-t|d t| j dd r ddi}n| jd ur+d| ji}ni }| j s4J d| j |f|ddd	|}|rx|d
 d}||krZtd| d| d |d
 d d | d f |d
< |d d d | d f |d< || j _|d
 |d fS )Nr   r   	bos_tokenr4  Fz-Tokenizer shoukd be initialized at this pointlongestpt)r:   paddingreturn_tensors	input_idsr-   z6Left truncation applied. Original sequence length was z, truncating to last z# tokens. Some content will be lost.attention_mask)	r8   r?  r2   r   r   rJ   sizerz   r{   )	r   r=  r?  r5  r:   old_padding_sider4  r:  original_lengthsrf   rf   rj   tok_batch_encodes  sB   


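
    # Example of the left-truncation above (numbers illustrative): for a batch
    # padded to input_ids of shape [batch, 4096] with left_truncate_len=2048,
    # both input_ids and attention_mask are sliced to their last 2048 columns,
    # keeping the text closest to the generation point.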
zHFLM.tok_batch_encodetokensIterator[list[str]]skip_special_tokensc                 C  s   | j j||dS )NrN  )r8   r7  )r   rL  rN  rf   rf   rj   
tok_decode  s   zHFLM.tok_decodeinpstorch.Tensorr  torch.Tensor | Noner  c              	   C  s   t  e t j| jj| j| jdudF |dus|durE|dur$|dus&J tj| jks.J | j	|||dj
W  d   W  d   S | 	|j
W  d   W  d   S 1 s\w   Y  W d   dS 1 slw   Y  dS )a/  

        :param inps: torch.Tensor
            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape
            [batch, sequence_ctx]. the size of sequence may vary from call to call
        :param attn_mask: torch.Tensor, optional
            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed
            (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM
        :param labels: torch.Tensor, optional
            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed
            (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM
        :return
            A torch tensor of shape [batch, sequence, vocab] with the
        logits returned from the model's decoder
        Nr   rA   enabled)rF  rG  r  )r   no_gradautocastr@   r   rD   r   r   r   r   logits)r   rQ  r  r  rf   rf   rj   r    s$   
RzHFLM._model_callstopc              	   K  s   | dd|d< | d}| d }dkr |d u r d |d< }|du r.|dkr.|dd  t| j||jd |jd }tj| jj| j	| j	d ud | j
jd
|||| jjdd	|W  d    S 1 sfw   Y  d S )Ntemperatureg        	do_sampleFr-   r   rT  T)rF  r>   stopping_criteriapad_token_id	use_cacherf   )r   r   r   r8   shaper   rW  r@   r   rD   r   generater]  )r   contextr>   rY  generation_kwargsr[  tempr\  rf   rf   rj   _model_generate  s0   
$zHFLM._model_generaterX  contleninplenc                 C  sX   | j dkr|r	|sJ d||| | }|S | j dkr*|r |r$J d|d | }|S )Nr   zGMust pass input len and cont. len to select scored logits for causal LMr   z=Selecting scored logits for Seq2SeqLM requires only cont. len)r2   )r   rX  re  rf  rf   rf   rj   _select_cont_toks  s   

	

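
    # Worked example of the causal slice above (values illustrative): with a
    # 6-token context and a 3-token continuation, the model input is
    # (ctx + cont)[:-1], i.e. 8 tokens, so inplen = 8 and contlen = 3. Position i
    # of the logits predicts token i + 1, so the continuation tokens are predicted
    # at positions 5, 6, 7 — exactly logits[inplen - contlen : inplen] = logits[5:8].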
zHFLM._select_cont_tokslist[Instance]disable_tqdmlist[float]c              
     s.  d }| j dkrtd |  }td|  |}g }g }ttdd |D |p*| jdkdD ]3\ \}tttj	tj
| || j| jdd	}d
d |D }	| fdd|	D  |t|	 q.d}
| jdkrtjt|| jd}| j|    }t||| j  }
|
dkr||
|d g 7 }g }|p| j }tdt||D ]'}||||  }t|ddi\}}| j|dt|d}|t||dd q| jdkr|
dkr|d |
  }g }d}|D ]0}||||  }tdd |D }|| ||7 }|t|d  j d }| j!"d|f| q|S )Nr,   ?Passed argument batch_size = auto. Detecting largest batch sizeDetermined Largest batch size: c                 S     g | ]}|j qS rf   r   )rh   reqrf   rf   rj   rk         z.HFLM.loglikelihood_rolling.<locals>.<listcomp>r   )disabler-   )
token_listprefix_tokenmax_seq_lencontext_lenc                 S  s   g | ]}d | qS )r   rf   )rh   xrf   rf   rj   rk   *  s    c                 3  s    | ]} |fV  qd S r   rf   )rh   windowreq_idxrf   rj   	<genexpr>-      z-HFLM.loglikelihood_rolling.<locals>.<genexpr>r  strictTF)r  ri  override_bsr|  c                 s  s    | ]	\}}|d  V  qdS )r   Nrf   )rh   r"  nllrf   rf   rj   rz  S  s    loglikelihood_rolling)#rE   printr2  	enumerater   r   listmapr   make_disjoint_windowget_rolling_token_windowsr;  rK   r>   extendappendr   r   r   r(  r@   rp   r)  rd   r*  r+  r,  r  r   zip_loglikelihood_tokenssumr   
cache_hookadd_partial)r   r  ri  adaptive_batch_sizerE   all_windowsrequest_window_countsr3  rolling_token_windowswindowspad_amntmytensorr1  all_nllsri   batchbatch_indicesbatch_windows
batch_nllsloglikelihoodscurrent_idxwindow_countrequest_nllsrequest_totalrf   rx  rj   r    sx   



zHFLM.loglikelihood_rollingc                 C  s   |t t|| j  }|| jv r| j| S t| jdkr1| j|d  | jkr1| j| j|< | j| S td| j d | ||| j|< td| j|   | j| S )Nr-   z"Passed argument batch_size = auto:z. Detecting largest batch sizezDetermined largest batch size: )r   r   r   r   rG   r  r2  )r   r  n_reordered_requestsschedrf   rf   rj   _batch_scheduler^  s   



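
    # Example of the schedule arithmetic above (numbers illustrative): with 1000
    # reordered requests and batch_schedule = 2, `sched = pos // 500`, so the
    # batch size is re-detected once at pos 0 and once at pos 500 — useful because
    # requests are sorted longest-first and shorter sequences fit larger batches.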
zHFLM._batch_scheduler2list[tuple[tuple[str, str], list[int], list[int]]]r}  list[tuple[float, bool]]c           *      C  sJ  g }d%dd}d%dd}t ||| jdkr| jrdnd |d	}t|}| jd
kr*| jn|d ur0|nd}	| jd
kr@|dkr@|s@| jnd }
|j|	|
d}tt||pS| jdkdd}|D ]}g }g }g }g }g }d }d }|D ]\}}}t|dksyJ t|dksJ t|| j	ksJ | jdkrt|t| }|| j	d krt
dt| dt| d| j	 d|| j	 d  d	 tj|| | j	d  d  d d tj| jd}|j\}nG| jdkrtj|| j	 d  tj| jd}|j\}|t| tj|| j	 d  tj| jd}|j\}|| |d urt||n|}|d ur(t||n|}|| || || qli }|sCJ d| jdkrQt||dd}n!| jdkrr|s^J dt||}t||}t||} | |d}tj| j|fi |d| jd}!t||!||ddD ]\\}"}#}}$}}%t|%}| jdkr||$jd |  nd }&| j|$||&d }$|$d}$|$jdd!}'|j|"|#|%|$d"D ]S\}"}%}$tj|%tj| jdd}%|'d d |%jd  d f |%k }(t|$d#|%d d}$t!|$" t#|(f})||) |"d ur| j$%d$|"|) |&d qŐqqY|'  |(|S )&Nro  ,tuple[tuple[str, str], list[int], list[int]]c                 S  s"   | d | d  }t | t|fS )z&Defines the key for the sorted method.r-      )r   tuplero  toksrf   rf   rj   _collatex  s   	z,HFLM._loglikelihood_tokens.<locals>._collatec                 S  s   | d | d dd  S )z<Defines the key to group and lookup one-token continuations.r  Nrf   )ro  rf   rf   rj   _lookup_one_token_cont  s   z:HFLM._loglikelihood_tokens.<locals>._lookup_one_token_contr   contextssort_fngroup_bygroup_fnr,   r   nbatch_fnzRunning loglikelihood requeststotalrq  descr-   zCombined length of context (z) and continuation (z") exceeds model's maximum length (z). Truncating z tokens from the left.r  )rA   r@   r   z$padding_len_inp should be set by nowright)r?  z%padding_len_cont should be set by nowr  r  Tr~  )re  rf  )r  )req_strcxt_toks	cont_toksrX  r  loglikelihood)ro  r  ))r   r2   r<   r   rE   r  get_batchedr   r   r>   rz   r{   r   r(  r  r@   r_  r  	ones_liker  r   r  r  r  rC   r  rg  	unsqueezeargmax	get_cacheallr)  squeezer   r  r=   r  r  r   closeget_original)*r   r  ri  r}  resr  r  re_ordr  rE   r  chunkspbarchunkrQ  cont_toks_listinplenscontsencoder_attnspadding_len_inppadding_len_contr"  r.  r/  total_lengthinprf  contre  r!  batched_inpsr  batched_encoder_maskmulti_logitsrequest_str
ctx_tokensrX  r  ctx_lengreedy_tokens	max_equalanswerrf   rf   rj   r  o  s*  



	















=
zHFLM._loglikelihood_tokensc                   s  g }d) fdd}t t||p jdkdd}d } jdkr/td	   }td
|  |} jdkr7 jn|d ur=|nd} jdkrI|sI jnd }tdd |D |ddd d}	|	j||d}
 j	 j
dd}|
D ]}t|ddi\}}|d }t|tsJ dt| t| j}t|dd |d}|d} jdkr j| }|dksJ d| d j dn jdkr j} j|| jd\}}| j}| j}d |v rtd! |d |jd" | } jd*||||d#|}| }t||dd$D ]h\}} jdkr||jd" d  }t jtr6 fd%dt |D }|r6||d& d" d  } 	|}t jtrF|! }t"||t jt#rS jnd d'}|$|  j%&d(||f| |'d" qqk|	(|}|)  |S )+Nro  tuple[str, dict]c                   s      | d }t| | d fS )z%Defines the key for the sorted methodr   )r;  r   r  r   rf   rj   r  a  s   z%HFLM.generate_until.<locals>._collater   zRunning generate_until requestsr  r,   rk  rl  c                 S  rm  rf   rn  )rh   regrf   rf   rj   rk     rp  z'HFLM.generate_until.<locals>.<listcomp>
gen_kwargsc                 S  s   | d S )Nr-   rf   )rv  rf   rf   rj   <lambda>  s    z%HFLM.generate_until.<locals>.<lambda>r  r  FrO  r|  Tz/Expected `kwargs` to be of type `dict` but got until)eosr   r   z9Invalid configuration: requested max tokens to generate (z5) must be less than model's maximum sequence length (z).r   )r5  r:   r>   zE`max_length` in generation kwargs. Please use `max_gen_toks` instead.r-   )ra  rG  rY  r>   r~  c                   s   g | ]\}}| j kr|qS rf   )rW   )rh   ri   tokenr   rf   rj   rk     s
    
r  )
generationrY  rW   generate_until)ro  r  rf   )*r   r   r   rE   r  r2  r  r   r  rP  r   r  ry   r   r   r   r   r   r   r2   r>   rK  r:   r   r@   rz   r{   r_  rd  r,  rW   r   r  lstripr   r7   r  r  r  r   r  r  )r   r  ri  r  r  r  r  rE   r  re_ordsr  r  r  r  all_gen_kwargsr  r   r  r   max_ctx_lenr.  
attn_masksr>   r  r  r  ra  think_token_indicessrf   r   rj   r  \  s   

	













"zHFLM.generate_untilchat_historylist[dict[str, str]]add_generation_promptc              	   C  sx   z| j j|fd|| d| j}W |S  tjjy;   td dd |D }| j j|fd|| d| j}Y |S w )zQMethod to apply a chat template to a list of chat history between user and model.F)tokenizer  continue_final_messagezHFailed to apply chat template. removing the system role in chat history.c                 S  s   g | ]
}|d  dkr|qS )rolesystemrf   )rh   msgrf   rf   rj   rk     s    z,HFLM.apply_chat_template.<locals>.<listcomp>)r8   apply_chat_templaterZ   jinja2
exceptionsTemplateErrorrz   r{   )r   r  r  chat_templatedrf   rf   rj   r    s6   zHFLM.apply_chat_templatec                 C  sx   ddd}ddd}dd
d}|| j || j | j|| j| jd}| jr.|| j| j|d< | jr:|| j| j|d< |S )zLMethod to get Hugging Face model information for experiment reproducibility.r\   r   c                 S  s6   t | dr	|  S t | drtdd |  D S dS )Nnum_parameters
parametersc                 s  s    | ]}|  V  qd S r   )numel)rh   prf   rf   rj   rz    r{  zDHFLM.get_model_info.<locals>.get_model_num_params.<locals>.<genexpr>r  )r   r  r  r  r   rf   rf   rj   get_model_num_params
  s
   

z1HFLM.get_model_info.<locals>.get_model_num_paramsr7   c                 S  s   t | dr| jS dS )NrA   r*   )r   rA   r  rf   rf   rj   get_model_dtype  s   
z,HFLM.get_model_info.<locals>.get_model_dtyper0   r4   c                 S  s\   zt  j| |d}|jW S  ty- } ztd|  d| d|  W Y d }~dS d }~ww )N)repo_idr4   zFailed to get model SHA for z at revision r   r*   )r   
model_infoshar   rz   r   )r0   r4   r  r  rf   rf   rj   get_model_sha  s   z*HFLM.get_model_info.<locals>.get_model_sha)model_num_parametersmodel_dtypemodel_revision	model_shapeft_sha	delta_shaNr\   r   r\   r7   )r0   r7   r4   r7   r\   r7   )r|   r4   r0   rQ   rR   )r   r  r  r  r  rf   rf   rj   get_model_info  s   


zHFLM.get_model_info)r(   r)   r*   NFTNr+   r,   NNr-   r.   FTNNFNNr/   NNFFNNNN)>r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   r5   rA   rB   rC   rB   rD   rB   rE   rF   rG   r?   rH   r;   rI   r;   rJ   r;   rK   r?   rL   r;   rM   rF   rN   rF   rO   rP   rQ   r5   rR   r5   rS   rT   rU   r;   rV   r5   rW   rX   rY   r;   rZ   r[   r\   r]   )Nr,   NNr/   N)rL   r;   r   r5   rM   rF   rN   rF   rO   r5   rt   r?   r\   r   r
  r  )r(   F)rr   r   r2   r3   rH   r;   r\   r]   )r0   r7   r4   r7   rH   r=   rV   r5   r6   r7   r\   r]   )r)   r,   FFNNNr/   NNFFNNr*   )"r0   r7   r4   r5   rA   rB   rH   r;   rL   r;   rt   r?   rM   rF   rN   rF   rO   r5   rQ   r5   rR   r5   rS   rT   rU   r;   rV   r5   rs   r   r6   r7   r\   r]   )r)   FTNNr*   )r0   r1   r8   r9   r4   r5   rH   r;   rI   r;   rV   r5   rJ   r;   r6   r5   r\   r]   )Nr   )r  r  r  r   )NN)r3  r7   r4  r;   r5  r?   r\   r6  )r<  NF)
r=  r>  r?  r7   r5  r?   r:   r=   r\   r@  )T)rL  rM  rN  r=   )rQ  rR  r  rS  r  rS  r\   rR  )r>   r   rY  r>  r\   rR  )rX  rR  re  r?   rf  r?   r\   rR  )F)r  rh  ri  r=   r\   rj  )FN)r  r  ri  r=   r}  r?   r\   r  )r  rh  ri  r=   r\   r>  )r  r  r  r=   r\   r7   )r\   r   )&__name__
__module____qualname____doc__r   r   rx   r   propertyrr   r   r   rK   r>   r   rE   r@   r   r   r   r   r   r   r   r2  r;  rK  rP  r  rd  rg  r  r  r  r  r  r  __classcell__rf   rf   r   rj   r'   ;   s      V





@ (=@+
*)V o r'   )C