o
    پi3                     @   s   d dl Z d dlmZmZmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZmZ er6d dlZ	d dlmZmZ G dd dedd	ZG d
d dZG dd dZ					ddedee dee dededee defddZdS )    N)TYPE_CHECKINGIteratorListOptionalTuple	TypedDictUnion)Unpack)GenerationParametersSamplingParameters)ExLlamaV2DynamicGeneratorExLlamaV2Samplerc                   @   sN   e Zd ZU eed< eeeeef   ed< ee ed< ded< ee ed< dS )ExllamaV2Params
max_tokensstop_conditionsseedzExLlamaV2Sampler.Settingsgen_settingsmax_new_tokensN)	__name__
__module____qualname__int__annotations__r   r   r   str r   r   M/home/ubuntu/.local/lib/python3.10/site-packages/outlines/models/exllamav2.pyr      s   
 r   F)totalc                   @   s2   e Zd Zdd Zdd Zdddee fdd	Zd
S )OutlinesExLlamaV2Tokenizerc                 C   s.   || _ | j  | _t| j j| _| j j| _d S N)exl2_tokenizerget_piece_to_id_dict
vocabularysetextended_piece_to_idspecial_tokenseos_token_id)self	tokenizerr   r   r   __init__   s   z#OutlinesExLlamaV2Tokenizer.__init__c                 C   s   |S r   r   )r&   tokenr   r   r   convert_token_to_string   s   z2OutlinesExLlamaV2Tokenizer.convert_token_to_string	token_idsztorch.LongTensorreturnc                 C   s*   | j jt|dd}t|tr|gS |S )NF)decode_special_tokens)r   decodetorchtensor
isinstancer   )r&   r+   decodedr   r   r   r.       s   
z!OutlinesExLlamaV2Tokenizer.decodeN)r   r   r   r(   r*   r   r   r.   r   r   r   r   r      s    r   c                   @   s   e Zd ZdZdddddefddZd	eeee f d
e	de
dee deeeeee f f f
ddZdeeee f de
fddZd	eeee f d
e	de
dee deeee f f
ddZd	eeee f d
e	de
dee deeeee f  f
ddZdS )ExLlamaV2ModelzRepresents a `exl2` model.	generatorr   r'   r   max_seq_lenc                 C   s   || _ || _|| _d S r   )r4   r'   r5   )r&   r4   r'   r5   r   r   r   r(   -   s   
zExLlamaV2Model.__init__promptsgeneration_parameterssampling_parametersexllamav2_paramsr,   c                    s|  ddl m} t|tr|g}t|\ }} du r<g  |D ]}	| jjj|	dd}
|
j	d } 
| j|  q |d< n fdd	tt|D |d< | jjjg}t|jtr^|
|j nt|jtro|jD ]}|
| qg||d
< ||d< | }|jdur|j|_|jdur|j|_|jdur|j|_||_||d< |jdkr||j }|d |j |d< t|dkr|d }||fS )z_Prepare the generation parameters.

        `exllamav2` uses different default values

        r   )r   NT)encode_special_tokensr   c                    s   g | ]} qS r   r   ).0_r   r   r   
<listcomp>T   s    z@ExLlamaV2Model.prepare_generation_parameters.<locals>.<listcomp>r   r   r      )exllamav2.generatorr   r1   r   dataclassesastupler4   r'   encodeshapeappendr5   rangelenr%   stop_atlistSettingstemperaturetop_ptop_klogits_processornum_samples)r&   r6   r7   r8   structure_logits_processorr9   r   rI   r   promptidsprompt_tokensr   r   r   r>   r   prepare_generation_parameters7   sP   











z,ExLlamaV2Model.prepare_generation_parametersoutputc                 C   s   t |tr|S t|dkr|d S |jdkrVt||jkr|S t||j dks*J t||j }g }t|jD ]}g }t|D ]}|||| |   q@|| q8|S |S )a  
        The purpose of this function is to reformat the output from exllamav2's output format to outline's output format
        For exllamav2, it mainly accepts only a list or a string(they also do cfg sampling with tuples but we will ignore this for now)
        The exllamav2's logic is
        1. If the prompt is a string, return a string. This is the same as outlines
        2. If a prompt is a list, return a list. This is not the same as outlines output in that if the list is only one element, the string is expected to be outputted.
        3. There is no such thing as num_samples, so the prompts had to be duplicated by num_samples times. Then, we had the function output a list of lists
        r@   r   )r1   r   rH   rP   rG   rF   )r&   rV   r8   num_items_per_sample
new_outputicurr_samplejr   r   r   reformat_outputu   s"   

zExLlamaV2Model.reformat_outputc              
   K   sR   |  ||||\}}	 | jj||d t|d dd|d d|d d}| ||S )Nr   r   Tr   Fr   )rR   r   r   completion_onlyr:   r   add_bosr   )rU   r4   generateminr\   )r&   r6   r7   rQ   r8   r9   rV   r   r   r   r_      s$   
zExLlamaV2Model.generatec                    s   ddl m} |||\}}i t|tr|g}t|}|d }t|D ]4\}	}
jjj	|
ddd}|||d |	 d||d |d	 ddd
}|d urP|d7 }j
|}|	|< q&dg|  dtt f fdd}| S )Nr   )ExLlamaV2DynamicJobr   TF)r:   r^   r   r   r   )	input_idsr   min_new_tokensr   r   r   token_healingr-   r@    r,   c                  3   sx    j  r:j  } | D ] }|d  }|d dkr%|dd}| |< |d r-d |< q V  j  sd S )Nserialstage	streamingtextre   eos)r4   num_remaining_jobsiterategetr\   )resultsridxri   	next_textorderr8   r&   r   r   token_generator   s   



z.ExLlamaV2Model.stream.<locals>.token_generator)rA   ra   rU   r1   r   rH   	enumerater4   r'   rD   enqueuer   )r&   r6   r7   rQ   r8   r9   ra   
batch_sizer   rp   prb   jobrf   rt   r   rq   r   stream   sB   



zExLlamaV2Model.streamN)r   r   r   __doc__r   r(   r   r   r   r
   r   r	   r   r   rU   r\   r_   r   rz   r   r   r   r   r3   *   s^    


>

"r3   T
model_pathdraft_model_pathr5   cache_q4pagedmax_chunk_sizer,   c                 C   sF  zddl m}m}m}m}	m}
 ddlm} W n ty!   tdw |	| }|dur2||_	|d |_
|  ||}|du r@d}|rJ|||dd	}n|||dd	}|j|dd
 td |
|}|rddnd}d}d}|dur|	|}||}|r|||dd	}n|||dd	}|||||||d||d	}|j}t|}t|||}|S )a2  
    Load an ExLlamaV2 model.

    Parameters
    ----------
    model_path (str)
        Path to the model directory.
    device (str)
        Device to load the model on. Pass in 'cuda' for GPU or 'cpu' for CPU
    max_seq_len (Optional[int], optional)
        Maximum sequence length. Defaults to None.
    scale_pos_emb (Optional[float], optional)
        Scale factor for positional embeddings. Defaults to None.
    scale_alpha_value (Optional[float], optional)
        Scale alpha value. Defaults to None.
    no_flash_attn (Optional[bool], optional)
        Disable flash attention. Defaults to None.
    num_experts_per_token (Optional[int], optional)
        Number of experts per token. Defaults to None.
    cache_q4 (bool, optional)
        Use Q4 cache. Defaults to False.
    tokenizer_kwargs (dict, optional)
        Additional keyword arguments for the tokenizer. Defaults to {}.
    gpu_split (str)
        "auto", or VRAM allocation per GPU in GB. Auto will use exllama's autosplit feature
    low_mem (bool, optional)
        Enable VRAM optimizations, potentially trading off speed
    verbose (bool, optional)
        Enable if you want debugging statements

    Returns
    -------
    An `ExLlamaV2Model` instance.

    Raises
    ------
    `ImportError` if the `exllamav2` library is not installed.

    r   )	ExLlamaV2ExLlamaV2CacheExLlamaV2Cache_Q4ExLlamaV2ConfigExLlamaV2Tokenizer)r   a2  The `exllamav2`, `transformers` and `torch` libraries needs to be installed in order to use `exllamav2` models. Please run `pip install transformers torch git+https://github.com/lapp0/exllamav2@sampler-logits-processor` Documentation: https://dottxt-ai.github.io/outlines/latest/reference/models/exllamav2/N   r;   T)r5   lazy)progresszLoading tokenizer...   r@   F)	modelcachedraft_modeldraft_cacher'   max_batch_sizeuse_ngram_draftr   r   )	exllamav2r   r   r   r   r   rA   r   ImportErrormax_input_lenmax_attention_sizearch_compat_overridesload_autosplitprintr5   r   r3   )r|   r}   r5   r~   r   r   r   r   r   r   r   r   configr   r   r'   r   r   r   draft_configr4   outlines_tokenizeroutlines_exl2_modelr   r   r   exl2   sd   /
r   )NNFTN)rB   typingr   r   r   r   r   r   r   r/   typing_extensionsr	   outlines.generate.apir
   r   torch.LongTensorrA   r   r   r   r   r3   r   r   boolr   r   r   r   r   <module>   s@    $ M