o
    5ti-                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZ d	gZed
G dd de
ZdS )    N)tqdm)BatchEncoding)Instance)register_model)HFLM)Collatorreplace_placeholders)stop_sequences_criteriaz<audio>zhf-audiolm-qwenc                       sR  e Zd ZdZejZdZ	d'deej	B de
dB f fddZ			
d(deej	B deejB dB dedB dedB ddf
ddZ	d)deeeef  dedefddZdd Z			
d*dee dee dede
dedeeeejf B fddZ	
d+dee d edee fd!d"Zdee dee fd#d$Z	
d+dee d edeeeef  fd%d&Z  ZS ),HFAUDIOLMQWENzU
    An abstracted Hugging Face model class for Audio LM model like Qwen2-Audio.
    T   
pretrained
max_audiosNc                    s$   t  j|fi | || _d| _d S )NF)super__init__r   chat_applied)selfr   r   kwargs	__class__ M/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/models/hf_audiolm.pyr      s   
zHFAUDIOLMQWEN.__init__mainF	tokenizerrevisiontrust_remote_codereturnc                 K   sn   |rt |trtjj|||dS t |tjsJ |S t |tr"|}n| jj}tjj|||d| _	| j	j
| _
dS )z
        Helper method during initialization.
        For the multimodal variant, we initialize not just
        `self.tokenizer` but also `self.processor`.
        )r   r   N)
isinstancestrtransformersAutoTokenizerfrom_pretrainedProcessorMixinmodelname_or_pathAutoProcessor	processorr   )r   r   r   r   r   r   
model_namer   r   r   _create_tokenizer*   s(   

zHFAUDIOLMQWEN._create_tokenizerchat_historyadd_generation_promptc                 C   s   | j j|d|d}|S )zc
        Method to apply a chat template to a list of chat history between user and model.
        F)tokenizer)   )r%   apply_chat_template)r   r(   r)   chat_templatedr   r   r   r+   U   s   z!HFAUDIOLMQWEN.apply_chat_templatec                 K   s   | dd|d< | d}| ddkr|d u rd |d< }|du r.| ddkr.|d t| j||d jd |d jd }| jjd
i |||| jjdd	|S )Ntemperatureg        	do_sampleF	input_ids   r   T)
max_lengthstopping_criteriapad_token_id	use_cacher   )getpopr	   r   shaper"   generater3   )r   inputsr1   stopgeneration_kwargsr.   r2   r   r   r   _model_multimodal_generateb   s,   


z(HFAUDIOLMQWEN._model_multimodal_generateleftstringsaudiospadding_sideleft_truncate_len
truncationc           	         sN    fdd} j stD ]}|||}q j||ddd}| j jj |S )Nc                    s    fdd|D S )Nc                    s   g | ]
}t | d jqS )z#<|audio_bos|><|AUDIO|><|audio_eos|>)r   r   ).0string)placeholderr   r   r   
<listcomp>   s    z[HFAUDIOLMQWEN.tok_batch_multimodal_encode.<locals>._replace_placeholder.<locals>.<listcomp>r   )rE   r>   r   )rE   r   _replace_placeholder   s   zGHFAUDIOLMQWEN.tok_batch_multimodal_encode.<locals>._replace_placeholderTpt)r?   textpaddingreturn_tensors)r   DEFAULT_AUDIO_PLACEHOLDERSr%   todevicer"   dtype)	r   r>   r?   r@   rA   rB   rH   rE   encodingr   rG   r   tok_batch_multimodal_encode|   s   
z)HFAUDIOLMQWEN.tok_batch_multimodal_encoderequestsdisable_tqdmc                    s|  g } fdd}t t||p jdkdd}tdd |D |dd	d
 d}|j jd d}|D ]}t|ddi\}	}
}g }|D ]}|d D ]	}||d  qFq@t|	t	sZt	|	}	|
d }d }t|t
rt|}d| v r|d}t|tr~|g}nt|t	std| n	tdt|  j jdd}|s|g}n|| d| v r|d}n j} j| } j|	|| jd}|d }d|vr|jd | |d< |d d|d< |jd|_ j|fd|i|}~tj  dd l}|   |! }t||	ddD ]'\}}||jd d  } |}||  j"#d||f| |$d q
q/|%|}|&  |S )Nc                    s      | d }t| | d fS )Nr   )
tok_encodelen)xtoksrG   r   r   _collate   s   z.HFAUDIOLMQWEN.generate_until.<locals>._collater   z5Running generate_until requests with text+audio input)totaldisabledescc                 S   s   g | ]}|j qS r   )args)rC   regr   r   r   rF      s    z0HFAUDIOLMQWEN.generate_until.<locals>.<listcomp>
gen_kwargsc                 S   s   | d S )Nr0   r   )rW   r   r   r   <lambda>   s    z.HFAUDIOLMQWEN.generate_until.<locals>.<lambda>)group_bygroup_fn)nbatch_fnstrictFaudioarrayuntilzAExpected `kwargs['until']` to be of type Union[str,list] but got z/Expected `kwargs` to be of type `dict` but got )skip_special_tokensmax_gen_toks)rA   rB   r/   r1   r0   cudar:   )re   generate_until)'r   rV   rankr   get_batched
batch_sizezipappendr   listdictcopydeepcopykeysr6   r   
ValueErrortype
tok_decodeeot_token_idrj   r1   rR   rB   r7   rN   r/   r<   torchrk   empty_cachegccollecttolist
cache_hookadd_partialupdateget_originalclose)r   rS   rT   resrY   pbarre_ordschunkschunkcontextsall_gen_kwargsaux_argumentsr?   audio_lst_dictrf   r_   rh   r   eosrj   max_ctx_lenr9   context_enccontr}   cont_toks_list	cont_tokscontextsr   rG   r   rl      s   















zHFAUDIOLMQWEN.generate_untilc                 C   s
   t dd)Nz~model type `hf-audiolm` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks zZthis is because we do not support measuring the loglikelihood a model assigns to an image.NotImplementedError)r   rS   r   r   r   loglikelihood_rolling"  s   z#HFAUDIOLMQWEN.loglikelihood_rollingc                 C   s   t d)Nz'loglikelihood' requests for model type `hf-audiolm` are not yet tested. This feature will be enabled when a loglikelihood-based multiple-choice VQA dataset is added!r   )r   rS   rT   r   r   r   loglikelihood(  s   zHFAUDIOLMQWEN.loglikelihood)r   )r   F)T)r=   NF)F)__name__
__module____qualname____doc__r   "Qwen2AudioForConditionalGenerationAUTO_MODEL_CLASS
MULTIMODALr   PreTrainedModelintr   r!   boolr'   rr   rs   r+   r<   r   r{   TensorrR   r   rl   floatr   tupler   __classcell__r   r   r   r   r
      s    
,

*
}r
   )rt   r{   r   r   r   lm_eval.api.instancer   lm_eval.api.registryr   lm_eval.models.huggingfacer   lm_eval.models.utilsr   r   lm_eval.models.utils_hfr	   rM   r
   r   r   r   r   <module>   s    