o
    }oi%                     @   s|   d dl mZ d dlZd dlmZmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ G d	d
 d
eZdS )    )ListN)batchfirst_value)Tensor)LLMSamplingParams)LoRARequest)ITritonDeployable)cast_outputstr_ndarray2listc                   @   s   e Zd ZdZdd ZddefddZdd	 Zed
d Z	edd Z
eedddddejfddZ					ddee dededededefddZdS ) vLLMHFExportera  
    The Exporter class uses vLLM APIs to convert a HF model to vLLM and makes the class,
    deployable with Triton server.

    Example:
        from nemo.export import vLLMHFExporter
        from nemo.deploy import DeployPyTriton

        exporter = vLLMHFExporter()
        exporter.export(model="/path/to/model/")

        server = DeployPyTriton(
            model=exporter,
            triton_model_name='model'
        )

        server.deploy()
        server.serve()
        server.stop()
    c                 C   s   d | _ d | _d S N)modellora_models)self r   P/home/ubuntu/.local/lib/python3.10/site-packages/nemo/export/vllm_hf_exporter.py__init__2   s   
zvLLMHFExporter.__init__Fenable_lorac                 C   s   t ||d| _dS )z
        Exports the HF checkpoint to vLLM and initializes the engine.
        Args:
            model (str): model name or the path
        )r   r   N)r   r   )r   r   r   r   r   r   export6   s   zvLLMHFExporter.exportc                 C   s   | j d u ri | _ || j |< d S r   )r   )r   lora_model_name
lora_modelr   r   r   add_lora_models>   s   
zvLLMHFExporter.add_lora_modelsc              
   C   sT   t ddtdt ddtjddt ddtjddt ddtjddt d	dtjddf}|S )
Npromptsnameshapedtypemax_output_lenT)r   r   r   optionaltop_ktop_ptemperature)r   bytesnpint_single)r   inputsr   r   r   get_triton_inputC   s   zvLLMHFExporter.get_triton_inputc                 C   s   t ddtdf}|S )Noutputsr   r   )r   r%   )r   r+   r   r   r   get_triton_outputN   s   z vLLMHFExporter.get_triton_outputr    r"   r#   r$   r)   c              
   K   s   zHdt |di}d|v r|d|d< d|v r |d|d< d|v r+|d|d< d|v r6|d|d< | jd	i |}t|tj}W d|iS  tyk } zdt|}t|gtj}W Y d }~d|iS d }~ww )
Ninput_textsr   r    r"   r#   r$   zAn error occurred: {0}r+   r   )	r   popforwardr
   r&   bytes_	Exceptionformatstr)r   r)   infer_inputoutput_textsoutputerrorerr_msgr   r   r   triton_infer_fnS   s&   zvLLMHFExporter.triton_infer_fn@      皙?      ?Nr-   r   c                 C   s   | j d us	J dd }|d ur,| jd u rtd|| j v s#J dt|d| j| }t||t||d}| j j|||d}	g }
|	D ]}|
|j	d j
 qC|
S )NzModel is not initialized.zNo lora models are available.zLora model was not added beforer;   )
max_tokensr$   r"   r#   )lora_requestr   )r   r   r1   keysr   r   intgenerateappendr+   text)r   r-   r    r"   r#   r$   r   r?   sampling_paramsrequest_outputr6   or   r   r   r/   i   s   	
zvLLMHFExporter.forward)F)r:   r;   r<   r=   N)__name__
__module____qualname____doc__r   boolr   r   propertyr*   r,   r   r   r&   ndarrayr9   r   r3   rA   floatr/   r   r   r   r   r      s<    


r   )typingr   numpyr&   pytriton.decoratorsr   r   pytriton.model_configr   vllmr   r   vllm.lora.requestr   nemo.deployr	   nemo.deploy.utilsr
   r   r   r   r   r   r   <module>   s   