o
    5tip                     @   s   d Z ddlZddlmZ ddlmZmZ ddlZddl	m	Z	 ddl
mZ ddlmZ ddlmZ er8dd	lmZ eeZed
G dd deZdS )a  
WinML backend for lm-eval-harness with NPU/GPU/CPU support.

This backend leverages Windows Machine Learning (WinML) to run models on various
hardware backends including NPUs, GPUs, and CPUs. It's particularly useful for
running inference on Windows devices with dedicated Neural Processing Units.

Example usage:
    lm_eval --model winml --model_args pretrained=path/to/onnx/model.onnx --tasks hellaswag

    N)Path)TYPE_CHECKINGAny)tqdm)utils)
TemplateLM)register_modelInstancewinmlc                       s<  e Zd ZdZdZe	dHdeeef deeef dB dd fddZ				
	dIdede
dB de
de
ddf
 fddZdJddZdd ZdefddZdJddZdeddfddZede
fddZede
dB fdd Zede
fd!d"Z		#dKd$ed%e
dB d&edee
 fd'd(Zd)ee
 defd*d+Zd,edejfd-d.Z	/	dLd0eeeeef ee
 ee
 f  d1ed2e
dB deeeef  fd3d4Z	/dMd0ed5 d1edeeeef  fd6d7Z	/dMd0ed5 d1edee fd8d9Z 	/dMd0ed5 d1edee fd:d;Z!				<	=	>	/dNd?ed@e
dAee dB dBedCedDe
dEedefdFdGZ"  Z#S )O	WindowsMLz
    WindowsML backend for lm-eval-harness with NPU/GPU/CPU support.

    This model class provides integration with Windows Machine Learning (WindowsML)
    to enable evaluation on NPUs and other Windows-optimized hardware.
    i   Narg_dictadditional_configreturnc                 C   s:   i |pi }|rdd |  D }|| | di |S )a<  
        Override to properly merge dictionaries and avoid duplicate keyword arguments.

        Args:
            arg_dict: Dictionary containing model arguments
            additional_config: Optional dictionary containing additional configuration

        Returns:
            Instance of WindowsML class
        c                 S   s   i | ]\}}|d ur||qS N ).0kvr   r   H/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/models/winml.py
<dictcomp><   s    z1WindowsML.create_from_arg_obj.<locals>.<dictcomp>Nr   )itemsupdate)clsr   r   merged_configfiltered_additionalr   r   r   create_from_arg_obj*   s   
zWindowsML.create_from_arg_obj      @   
pretrained
max_length
batch_sizemax_batch_sizec                    s   t    |   || _|p| j| _|| _|| _|dks |dkr2t	d| d| d d| _d| _| 
  |   |   | | tdt| j d dS )a2  
        Initialize WindowsML model.

        Args:
            pretrained: Path to ONNX model file or directory containing model files
            max_length: Maximum sequence length
            batch_size: Batch size for inference
            max_batch_size: Maximum batch size for auto-batching
        r   zMWindowsML backend currently only supports batch size 1. Requested batch_size=z, max_batch_size=z. Setting both to 1.zAvailable EP devices: z execution providersN)super__init___validate_dependenciesr    _DEFAULT_MAX_LENGTHr!   r"   r#   eval_loggerwarning_fix_winrt_runtime"_register_winml_providers_to_genai"_setup_winml_devices_and_providers_load_and_compile_modelinfolenep_device_map)selfr    r!   r"   r#   kwargs	__class__r   r   r%   C   s.   

zWindowsML.__init__c              
   C   s   zddl }|| _td|j  W n ty# } ztd|d}~ww zddl}|| _td|j  W dS  tyH } ztd|d}~ww )z
        Validate that required dependencies are available.

        Raises:
            ImportError: If required dependencies are not installed
        r   NzONNX Runtime GenAI version: z]ONNX Runtime GenAI is required for WinML backend. Install with: pip install onnxruntime-genaizONNX Runtime version: zhONNX Runtime is also required for execution provider registration. Install with: pip install onnxruntime)onnxruntime_genaiogr(   r.   __version__ImportErroronnxruntimeort)r1   r6   er:   r   r   r   r&   x   s0   z WindowsML._validate_dependenciesc                 C   sH   ddl m} tt|dd}|d d }| r"|  dS dS )z
        This function removes the msvcp140.dll from the winrt-runtime package.
        So it does not cause issues with other libraries.
        r   )metadatazwinrt-runtime winrtzmsvcp140.dllN)	importlibr<   r   strdistributionlocate_fileexistsunlink)r1   r<   site_packages_pathdll_pathr   r   r   r*      s   zWindowsML._fix_winrt_runtimec              
   C   s  zYddl m  m  m  m} ddlm}m} ||jd1 |j	
 }| }|D ]}|   | j|j|j td|j d q*W d   W dS 1 sRw   Y  W dS  tyt } ztd|  W Y d}~d	S d}~w ty } ztd
|  W Y d}~d	S d}~ww )z
        Register Windows ML execution providers to ONNX Runtime GenAI.

        Returns:
            True if registration was successful, False otherwise
        r   N)InitializeOptions
initialize)optionszRegistered z to ONNX Runtime GenAITzWindows ML import error: Fz&Error registering providers to GenAI: )+winui3.microsoft.windows.ai.machinelearning	microsoftwindowsaimachinelearningEwinui3.microsoft.windows.applicationmodel.dynamicdependency.bootstraprG   rH   ON_NO_MATCH_SHOW_UIExecutionProviderCatalogget_defaultfind_all_providersensure_ready_asyncgetr6   #register_execution_provider_librarynamelibrary_pathr(   r.   r8   r)   	Exception)r1   r   rG   rH   catalog	providersproviderr;   r   r   r   r+      s8   

z,WindowsML._register_winml_providers_to_genaic              
   C   s$  zn| j  }i | _|D ]}|j}|| jvrg | j|< | j| | qtd | j D ]>\}}td|  |D ]/}z| j |j	j
j}td|jdd|d W q; tyj   td|jdd Y q;w q-W d
S  ty } ztd|  td	 i | _W Y d
}~d
S d
}~ww )zSetup execution providers using Windows ML device enumeration API.

        This method queries available devices and builds a mapping of execution providers.
        z%Available execution provider devices:zExecution Provider: z | Vendor: z<16z | Device Type: z<8z | Device Type: Unknownz&Windows ML device enumeration failed: z)Falling back to legacy provider selectionN)r:   get_ep_devicesr0   ep_nameappendr(   r.   r   OrtHardwareDeviceTypedevicetyperW   	ep_vendorrY   r)   )r1   
ep_devicesra   r^   rW   devicesdevice_typer;   r   r   r   r,      sD   




z,WindowsML._setup_winml_devices_and_providers
model_pathc              
   C   s   t |}| r|jdkr|j}n| r|}ntd| dz$td|  | j	t
|| _| j| j| _td || _W dS  ty\ } ztd| d|   d}~ww )	a!  
        Load and optionally compile ONNX model with ONNX Runtime GenAI.

        Args:
            model_path: Path to ONNX model file or directory

        Raises:
            FileNotFoundError: If model path is not found or invalid
            Exception: If model loading fails
        z.onnxzModel path z not found or invalidz,Loading model with ONNX Runtime GenAI from: z?Model and tokenizer loaded successfully with ONNX Runtime GenAIz2Failed to load model with ONNX Runtime GenAI from z: N)r   is_filesuffixparentis_dirFileNotFoundErrorr(   r.   r6   Modelr@   genai_model	Tokenizergenai_tokenizerrg   rY   error)r1   rg   input_model_pathr;   r   r   r   r-      s.   z!WindowsML._load_and_compile_modelc              
   C   s   z$t | jdr#| jj}t |dr#|jr#t|jtr|jd W S |jW S W n ty? } ztd| d W Y d}~nd}~ww td dS )	z{
        Get the end-of-text token ID.

        Returns:
            End-of-text token ID from the GenAI tokenizer
        configeos_token_idr   zError getting EOS token ID: z, using fallback value 2Nz8Could not determine EOS token ID, using fallback value 2   )	hasattrrn   rs   rt   
isinstancelistrY   r(   r)   )r1   rs   r;   r   r   r   eot_token_id  s    

zWindowsML.eot_token_idc                 C   s   z:t | jdr| jj}t |dr|jdur|jW S z| jd}t|dkr,|d W W S W W dS  ty:   Y W dS w  tyD   Y dS w )z
        Get the prefix token ID (typically BOS token).

        Returns:
            BOS token ID if available, otherwise None
        rs   bos_token_idNr=   r   )rv   rn   rs   rz   rp   encoder/   rY   )r1   rs   	empty_encr   r   r   prefix_token_id5  s$   zWindowsML.prefix_token_idc                 C   s   dS )z
        Get the maximum number of tokens to generate.

        Returns:
            Maximum generation tokens (default: 4096)
        r   r   )r1   r   r   r   max_gen_toksS  s   zWindowsML.max_gen_toksTstringleft_truncate_lenadd_special_tokensc                 C   s2   | j |}|durt||kr|| d }|S )af  
        Tokenize string and return token IDs.

        Args:
            string: Input string to tokenize
            left_truncate_len: If provided, truncate from the left to this length
            add_special_tokens: Whether to add special tokens (note: GenAI tokenizer handles this automatically)

        Returns:
            List of token IDs
        N)rp   r{   r/   )r1   r   r   r   encodingr   r   r   
tok_encode]  s   zWindowsML.tok_encodetokensc                 C   s   | j |S )z
        Decode token IDs back to text.

        Args:
            tokens: List of token IDs to decode

        Returns:
            Decoded text string
        )rp   decode)r1   r   r   r   r   
tok_decodew  s   
zWindowsML.tok_decode
input_textc           	   
   C   s  zr| j |}t|dkrtd tjdtjdW S | j	| j
}|jddd | j| j
|}|| |d}tj|tjd}t|jd	krO|d }nt|jd
krY|}ntd|j td|j dt| d |W S  ty } z	td|   d}~ww )a  
        Run inference using ONNX Runtime GenAI to get full logits sequence.

        Args:
            input_text: Input text string to compute logits for

        Returns:
            Logits matrix of shape (seq_len, vocab_size) where logits[i] contains
            predictions for the token at position i+1 given tokens[0:i+1]

        Raises:
            Exception: If inference fails
        r   z+No tokens to process; returning empty array)r   r   dtyper   Fr!   	do_samplelogits   ru   zUnexpected logits shape: zFull logits shape: z for z input tokenszGenAI inference failed: N)rp   r{   r/   r(   r)   npemptyfloat32r6   GeneratorParamsrn   set_search_options	Generatorappend_tokens
get_outputarrayshape
ValueErrordebugrY   rq   )	r1   r   input_tokensparams	generatorfull_logits_tensorlogits_arraylogits_matrixr;   r   r   r   $_run_genai_inference_for_full_logits  s2   



z.WindowsML._run_genai_inference_for_full_logitsFrequestsdisable_tqdmoverride_bsc                 C   s   t d)a  
        Stub implementation - not used since we override loglikelihood directly.
        WindowsML uses the GenAI tokenizer and overrides loglikelihood to work
        with text inputs directly, avoiding tokenization round-trip issues.

        Args:
            requests: List of tokenized requests
            disable_tqdm: Whether to disable progress bar
            override_bs: Optional batch size override

        Returns:
            Empty list (method not used)
        zUWindowsML overrides loglikelihood() directly and does not use _loglikelihood_tokens())NotImplementedError)r1   r   r   r   r   r   r   _loglikelihood_tokens  s   zWindowsML._loglikelihood_tokensr
   c                 C   s  g }t ||ddD ]O}|j\}}t|dkr|d q	z|| }|rG| |}t|dkrB| jdurB|d | jkrB|dd }t|}	n| jdurP| jgng }t|}	| |}
|
|	d }t|dkrn|d W q	| j| j}|j	ddd	 | j
| j|}|
d|	 }t|dkr|tj|tjd
 n| jdur|tj| jgtjd
 d}d}|D ]q}| }tj|tjd
}t|jdkr|ddddf }nt|jdkr|dddf }n|}t|}t|| }|tt| }|t| | }|t|7 }tt|t|krd}|tjt|gtjd
 q|||f W q	 tyY } ztd|  ddl}t|  |d W Y d}~q	d}~ww |S )a[  
        Compute log-likelihood using ONNX Runtime GenAI with teacher forcing.

        Args:
            requests: List of instances containing (context, continuation) text pairs
            disable_tqdm: Whether to disable progress bar

        Returns:
            List of tuples containing (log_likelihood, is_greedy) for each request
        zComputing log-likelihoodsdisabledescr   )        TNr   r   Fr   r   r   Tr   ru   z!Failed to compute loglikelihood: )r   F)r   argsr/   r_   r   r}   r6   r   rn   r   r   r   r   r   int32
get_logitsr   r   maxexplogsumintfloatargmaxrY   r(   r)   	tracebackr   
format_exc)r1   r   r   resultsrequestcontextcontinuation	full_textcontext_enccontext_lenfull_tokenscontinuation_tokensr   gencontext_tokenstotal_ll	greedy_oktokr   r   next_logits	max_logit
exp_logitslog_sum_explog_probr;   r   r   r   r   loglikelihood  s   









zWindowsML.loglikelihoodc              
      s   g }t ||ddD ]d}|jd }tttjtj| || j| j	dd}g }|D ]\}}	| 
|}
| 
|	}||
|f q(ddlm   fdd|D }| j|d	d
}dd |D }t|}|| | jd|f| q	|S )a  
        Compute rolling log-likelihood for perplexity using ONNX Runtime GenAI.
        Uses sliding windows to handle sequences longer than max_length.

        Args:
            requests: List of instances containing text sequences
            disable_tqdm: Whether to disable progress bar

        Returns:
            List of sum of log-likelihood values for each request
        z!Computing rolling log-likelihoodsr   r   r   )
token_listprefix_tokenmax_seq_lenr   r	   c              	      s&   g | ]\}} d i ||fdi dqS )r   r   )request_typedoc	argumentsidxr<   r   )r   ctxcontr	   r   r   
<listcomp>k  s    z3WindowsML.loglikelihood_rolling.<locals>.<listcomp>T)r   c                 S   s   g | ]}|d  qS )r   r   )r   xr   r   r   r   |  s    loglikelihood_rolling)r   r   rx   mapr   make_disjoint_windowget_rolling_token_windowsr   r}   r!   r   r_   lm_eval.api.instancer
   r   r   
cache_hookadd_partial)r1   r   r   loglikelihoodsr   r   rolling_token_windowswindow_requestsr   r   context_textcontinuation_textwindow_instances
string_nll	total_nllr   r	   r   r   >  sD   





zWindowsML.loglikelihood_rollingc                 C   s   |sg S g }t ||ddD ]^}|j\}}|d| j}|dg }|dd}	|dd}
|d	d
}|dd}z| j||||	|
||d}|| W q tyk } ztd|  |d W Y d}~qd}~ww |S )a,  
        Generate text until stopping criteria using ONNX Runtime GenAI.

        Args:
            requests: List of generation requests with context and generation kwargs
            disable_tqdm: Whether to disable progress bar

        Returns:
            List of generated text strings
        zGenerating textr   r~   untiltemperaturer   top_p      ?top_k2   r   F)r   r   r   r   zGeneration failed for request: r=   N)	r   r   rU   r~   _run_genai_generationr_   rY   r(   r)   )r1   r   r   r   r   r   
gen_kwargsr~   r   r   r   r   r   generated_textr;   r   r   r   generate_until  s8   
	zWindowsML.generate_untilr   r   r   prompt
max_tokensstop_sequencesr   r   r   r   c              
      s  z| j | j}|r!|dkr!|jt|t|t|t|dd n|jt|ddddd | j | j|}	| j|}
|		|
 g }|	
 s|	  |	
 rOn;|	d}t|
t| }t||kr|| }|| t||krrn|r| j| t fdd|D rn|	
 rF|r| j|}|r|D ]}||r|dt|  } |W S q|W S W d	S  ty } ztd
|  W Y d}~d	S d}~ww )a2  
        Run text generation using ONNX Runtime GenAI.

        Args:
            prompt: Input text to generate from
            max_tokens: Maximum number of tokens to generate
            stop_sequences: List of sequences that will stop generation
            temperature: Sampling temperature (0 = greedy, higher = more random)
            top_p: Nucleus sampling threshold
            top_k: Top-k sampling parameter
            do_sample: Whether to use sampling (if False, uses greedy decoding)

        Returns:
            Generated text string
        r   r   )r!   r   r   r   repetition_penaltyr   r   c                 3   s    | ]}| v V  qd S r   r   )r   stop_seqcurrent_textr   r   	<genexpr>  s    z2WindowsML._run_genai_generation.<locals>.<genexpr>Nr=   zGenAI generation error: )r6   r   rn   r   r   r   r   rp   r{   r   is_donegenerate_next_tokenget_sequencer/   r_   r   anyendswithrY   r(   rq   )r1   r   r   r   r   r   r   r   r   r   r   generated_tokensoutput_tokensbase	new_tokenr   r   r;   r   r   r   r     sj   	



zWindowsML._run_genai_generationr   )r   r   r   )r   N)NT)FN)F)r   Nr   r   r   F)$__name__
__module____qualname____doc__r'   classmethoddictr@   r   r   r   r%   r&   r*   boolr+   r,   r-   propertyry   r}   r~   rx   r   r   r   ndarrayr   tupler   r   r   r   r   r   __classcell__r   r   r3   r   r      s    

5
#',
6 

r
J
2
	r   )r  loggingpathlibr   typingr   r   numpyr   r   lm_evalr   lm_eval.api.modelr   lm_eval.api.registryr   r   r
   	getLoggerr   r(   r   r   r   r   r   <module>   s    
