o
    5ti{                     @   s   d dl Z d dlZd dlZd dlm  mZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZmZmZmZ d dlmZmZ d	ZeeZed
G dd deZdS )    N)tqdm)BatchEncoding)Instance)register_model)HFLM)Collatorflatten_image_listhandle_stop_sequencesreplace_placeholdersresize_image)pad_and_concatstop_sequences_criteriaz<image>zhf-multimodalc                       sD  e Zd ZdZejZdZ										d=deej	B de
dB dedB d	ed
e
dB de
dB de
dB de
dB de
dB de
dB f fddZ		d>deej	B deejB dB dedB dedB ddf
ddZ	d?ddZdd Z	d@deeeef  dedefdd ZdAd!eeB dedB f fd"d#Z	$		dBd%ee d&ee d'ed(e
d)edeeeejf B fd*d+Zd?d,d-Zd.d/ Zd0d1 Zd2ee dee f fd3d4Z	dAd2ee d5edeeeef  f fd6d7Z 		dCd2eeedeef ee
 ee
 ee
 f  d5ed8e
deeeef  fd9d:Z!	dAd2ee d5edee f fd;d<Z"  Z#S )DHFMultimodalLMz[
    An abstracted Hugging Face model class for multimodal LMs like Llava and Idefics.
    TN  F
pretrainedimage_token_idimage_string
interleave
max_images
min_pixels
max_pixelsimage_widthimage_heightimage_max_sidec                    s(  |	| _ |
| _|| _| jr| j s| jrtd|rd|ini |r#d|ini B | _t j|fi | | jdks:J dd| _|| _	|| _
|| _|s|rNt|nt| jdd p[t| jdd | _| jd usfJ d	| j| jgdd
| _|d urtd| j d| j d d S d S td| d || _d S )NzlAmbiguous config for image resize: you can not specify both image_max_side and (image_width or image_height)r   r   autoz@Batch size 'auto' is not yet supported for hf-multimodal models.Fr   image_token_indexzMust have a non-None image_token_id to evaluate a Hugging Face AutoModelForVision2Seq model. Please pass `image_token_id` in `--model_args` if model's config does not already specify one.skip_special_tokensz1A non-default image_token_id with image_token_id=z and string value 'zx' was specified manually. Note that using an improper image_token placeholder may lead to ignored image input or errors!zAA non-default image_token string with string value image_string=')r   r   r   
ValueErrorpixelssuper__init__
batch_sizechat_appliedr   r   rgbintgetattrconfigr   
tok_decodeimage_tokeneval_loggerinfo)selfr   r   r   r   r   convert_img_formatr   r   r   r   r   kwargs	__class__ J/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/models/hf_vlms.pyr!   &   sN   


zHFMultimodalLM.__init__main	tokenizerrevisiontrust_remote_codereturnc                 K   sx   |rt |trtjj|||dS t |tjsJ |S t |tr"|}n| jj}tjj|f||d| j| _	| j	j
| _
dS )z
        Helper method during initialization.

        For the multimodal variant, we initialize not just
        `self.tokenizer` but also `self.processor`.
        )r5   r6   N)
isinstancestrtransformersAutoProcessorfrom_pretrainedProcessorMixinmodelname_or_pathr   	processorr4   )r,   r   r4   r5   r6   r.   
model_namer1   r1   r2   _create_tokenizerp   s.   

z HFMultimodalLM._create_tokenizerc                 C   s>   | j ||dd}|d}|d |r|| d }||fS )zIHelper function which encodes an image + string combo using AutoProcessorN)textimagesreturn_tensors	input_idsattention_mask)r@   pop)r,   stringrD   left_truncate_lenadd_special_tokensencodingtext_encodingr1   r1   r2   tok_multimodal_encode   s   

z$HFMultimodalLM.tok_multimodal_encodec                 C   s   t |t |  }|dkr|| d | }|d|  }| || |\}}| ||\}}|d |d }}t |}	||	d }
||
|fS )a;  Helper function to perform the role of TemplateLM._encode_pair
        Except allowing for image input to also be processed alongside `context`.

        This method is a bit messy due to the need to defer conversion of image and text token input
        into PyTorch tensors until the main inference loop.
        r   N)lenrstriprN   )r,   contextcontinuationrD   n_spaces	whole_enc	image_enccontext_enc_context_enc_lencontinuation_encr1   r1   r2   _encode_multimodal_pair   s   
z&HFMultimodalLM._encode_multimodal_pairchat_historyadd_generation_promptc                 C   s<  d| _ | js<|D ]2}g }|d }t| j|t}|td}t|D ]
}|dd d q#|d|d ||d< qnX|D ]U}g }|d }t| j|t}d}	|	t}
t
|
D ]&\}}|rh|d|d |t|
d	 k r|| jk r|d
di |	d	7 }	qZ||d< |	|krtd| d|	 q>| jj||| dS )NTcontent image)typer_   rC   )r`   rC   r      r`   z/Mismatch in image placeholder count. Expected: z
, Actual: )r\   continue_final_message)r#   r   minr   countDEFAULT_IMAGE_PLACEHOLDERreplacerangeappendsplit	enumeraterO   r   r@   apply_chat_template)r,   r[   r\   r]   crC   image_countrW   expected_image_countactual_image_count
text_partsipartr1   r1   r2   rk      sP   

z"HFMultimodalLM.apply_chat_templatechat_templatec                    s<   t | jdr| j}| j| _t |}|| _|S t |S )Nrk   )hasattrr@   r4   r    rs   )r,   rs   
_tokenizerselected_templater/   r1   r2   rs     s   zHFMultimodalLM.chat_templateleftstringsrD   padding_siderJ   
truncationc                    s    j s fdd|D } jj}| j_ fdd|D } jr'dd |D }t jdddkr4t|} j|||dd	d
}| j	 j
j |rg|d d d | d f |d< |d d d | d f |d< | j_|S )Nc                    s   g | ]}t |t j jqS r1   )r
   re   r)   r   ).0rI   r,   r1   r2   
<listcomp>,  s    z>HFMultimodalLM.tok_batch_multimodal_encode.<locals>.<listcomp>c                    s   g | ]	}|d  j  qS )N)r   r{   imgr|   r1   r2   r}   9      c                 S   s   g | ]	}d d |D qS )c                 S   s   g | ]}| d qS )RGB)convertr~   r1   r1   r2   r}   ;  s    zIHFMultimodalLM.tok_batch_multimodal_encode.<locals>.<listcomp>.<listcomp>r1   )r{   sublistr1   r1   r2   r}   ;  r   
model_typer^   llavalongestpt)rD   rC   rz   paddingrE   rF   rG   )r#   r4   ry   r$   r&   r'   r   r@   todevicer>   dtype)r,   rx   rD   ry   rJ   rz   old_padding_siderL   r1   r|   r2   tok_batch_multimodal_encode  s8   
	
z*HFMultimodalLM.tok_batch_multimodal_encodec                 C   s@   t   | j|fi |jW  d   S 1 sw   Y  dS )z(
        TODO: update docstring
        N)torchno_gradr>   logits)r,   inpsimgs	attn_masklabelsr1   r1   r2   _model_multimodal_callV  s   
$z%HFMultimodalLM._model_multimodal_callc                 K   s   | dd|d< | d}| ddkr|d u rd |d< }|du r.| ddkr.|d t| j||d jd |d jd }| jjd
i |||| jjdd	|S )Ntemperatureg        	do_sampleFrF   ra   r   T)
max_lengthstopping_criteriapad_token_id	use_cacher1   )getrH   r   r4   shaper>   generater   )r,   inputsr   stopgeneration_kwargsr   r   r1   r1   r2   _model_multimodal_generate^  s,   


z)HFMultimodalLM._model_multimodal_generatec                    s<   i }|d   D ] tj fdd|D dd| < q|S )z
        Helper function: batch together image encodings across examples in a batch.
        # TODO: for variable-sized images, this may break down.
        r   c                    s&   g | ]}t j|  jjjd qS ))r   r   )r   tensorr   r>   r   )r{   rU   keyr,   r1   r2   r}     s    z0HFMultimodalLM._batch_images.<locals>.<listcomp>dim)keysr   cat)r,   
image_encsbatched_imgsr1   r   r2   _batch_imagesx  s   	zHFMultimodalLM._batch_imagesrequestsc                    s.   |rt |d jdk rt j|dS tdd)Nr      )r   zmodel type `hf-multimodal` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks zZthis is because we do not support measuring the loglikelihood a model assigns to an image.)rO   argsr    loglikelihood_rollingNotImplementedError)r,   r   r/   r1   r2   r     s   z$HFMultimodalLM.loglikelihood_rollingdisable_tqdmc                    s.   |rt |d jdk rt j||dS td)Nr   r   r   r   z'loglikelihood' requests for model type `hf-multimodal` are not yet tested. This feature will be enabled when a loglikelihood-based multiple-choice VQA dataset is added!)	rO   r   r    loglikelihoodr   r   rZ   rh    _multimodal_loglikelihood_tokens)r,   r   r   new_reqsrQ   rR   aux_argumentsvisualsrV   rY   rU   r/   r1   r2   r     s
   zHFMultimodalLM.loglikelihoodoverride_bsc           *   
   C   s6  g }dt t ttf tt tt f fdd}dt t ttf tt tt f fdd}t||| jtjkr8| jr8dnd |d}t	|}| j
dkrI| j
n|d urO|nd	}	| j
dkr_|d	kr_|s_| jnd }
|j|	|
d
}tt	||pr| jd	kdd}|D ]}g }g }g }g }d }|D ]b\}}}}t	|d	ksJ t	|d	ksJ t	|d	ksJ t	|| jksJ tj|| | jd  d  d d tj| jd}|j\}|d urt||n|}|| || || || qi }t||dd}| |}tj| j||fi |dd}t||||ddD ]\\}}}}} }}!t	|!}"| jtjkr.|| jd	 |  nd }#| j| |"|#d} | d	} | jdd}$|j |||!| dD ]B\}%}&}'tj|&tj| jdd	}&|$|&k! }(t"|'d|&d#d}'t$|'% t&|(f})||) | j'(d|%|) |)d qLqqx|*  |+|S )Nreqc                 S   s"   | d | d  }t | t|fS )z%Defines the key for the sorted methodra      )rO   tuple)r   toksr1   r1   r2   _collate  s   zAHFMultimodalLM._multimodal_loglikelihood_tokens.<locals>._collatec                 S   s    | d | d  | d dd  S )z;Defines the key to group and lookup one-token continuationsNr1   )r   r1   r1   r2   _lookup_one_token_cont  s    zOHFMultimodalLM._multimodal_loglikelihood_tokens.<locals>._lookup_one_token_contcontexts)sort_fngroup_bygroup_fnr   r   nbatch_fnz4Running loglikelihood requests with text+image inputtotaldisabledescra   r   )r   r   right)ry   r   Fstrict)contleninplen)req_strcxt_toks	cont_toksr   r   r   ),r   r9   listr%   r   AUTO_MODEL_CLASSr:   AutoModelForCausalLMlogits_cacherO   r"   _batch_schedulerget_batchedr   rankr   r   r   longr   r   maxrh   r   r   Flog_softmaxr   zip_select_cont_toks	unsqueezeargmax	get_cacheallgathersqueezefloatsumbool
cache_hookadd_partialupdatecloseget_original)*r,   r   r   r   resr   r   re_ordn_reordered_requestsr"   r   chunkspbarchunkr   r   cont_toks_listinplenspadding_len_inprW   rV   rY   rU   inpr   call_kwargsbatched_inpsr   multi_logitsrequest_str
ctx_tokens_image_encsr   r   r   ctx_lengreedy_tokens_request_str
_cont_toks_logits	max_equalanswerr1   r1   r2   r     s   ((

	









9
z/HFMultimodalLM._multimodal_loglikelihood_tokensc                    sF  |rt |d jdk rt j||dS g } fdd}tt ||p% jdkdd}tdd	 |D |d
dd d}|j jd d} j	 j
dd}|D ]}	t|	ddi\}
}} fdd	|D }t|
tsht|
}
|d }t|trt|}t|dd |d}n	tdt| d| v r|d}n j} j| } j|
|| jd}|d }d|vr|jd | |d<  j|fd|i|}~tj  dd l}|   |! }t||
ddD ]9\}}||jd d  } 	|}|D ]}t |dkr|"|d }q|#|  j$%d||f| |&d qqJ|'|}|(  |S )Nr   r   r   c                    s      | d }t| | d fS )Nr   )
tok_encoderO   )xr   r|   r1   r2   r   z  s   z/HFMultimodalLM.generate_until.<locals>._collatez5Running generate_until requests with text+image inputr   c                 S   s   g | ]}|j qS r1   )r   )r{   regr1   r1   r2   r}     s    z1HFMultimodalLM.generate_until.<locals>.<listcomp>
gen_kwargsc                 S   s   | d S )Nra   r1   )r	  r1   r1   r2   <lambda>  s    z/HFMultimodalLM.generate_until.<locals>.<lambda>)r   r   r   Fr   r   c                    s"   g | ]} fd d|d D qS )c                    s    g | ]}t | j j jqS r1   )r   r   r   r   r~   r|   r1   r2   r}     s    z<HFMultimodalLM.generate_until.<locals>.<listcomp>.<listcomp>visualr1   )r{   argr|   r1   r2   r}     s    
until)eosz/Expected `kwargs` to be of type `dict` but got max_gen_toks)rJ   rz   rF   r   ra   r   r   generate_until))rO   r   r    r  r   r   r   r   r"   r(   eot_token_idr   r8   r   dictcopydeepcopyr	   rH   r   r`   r   r  r   r   rz   r   r   r   cudaempty_cachegccollecttolistri   rh   r   r   r   r   r   )r,   r   r   r   r   r   re_ordsr   r  r   r   all_gen_kwargsr   r   r  r.   r  r  max_ctx_lenr   rV   contr  r   r   rQ   stermr/   r|   r2   r  q  s   












zHFMultimodalLM.generate_until)
NNTr   FNNNNN)r3   F)NN)T)F)rw   NF)FN)$__name__
__module____qualname____doc__r:   AutoModelForImageTextToTextr   
MULTIMODALr9   PreTrainedModelr%   r   r!   r=   rB   rN   rZ   r   r  rk   rs   r   r   Tensorr   r   r   r   r   r   r   r   r   r   r  __classcell__r1   r1   r/   r2   r      s    
N
/

 9

7
)$
 <r   )r  loggingr   torch.nn.functionalnn
functionalr   r:   r   r   lm_eval.api.instancer   lm_eval.api.registryr   lm_eval.models.huggingfacer   lm_eval.models.utilsr   r   r	   r
   r   lm_eval.models.utils_hfr   r   re   	getLoggerr"  r*   r   r1   r1   r1   r2   <module>   s     
