o
    }oiXQ                     @   s`  d dl Z d dlZd dlZd dlmZmZmZmZ d dl	Z	d dl
mZ d dlZd dlmZmZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZ  d dl!m"Z" d d	l#m$Z$ d d
l%m&Z& d dl'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- e.dZ/ej0dd Z1e1Z2dZ3zd dl4m2Z2 d dl5m6Z6 W n e7y   dZ3Y nw G dd de$Z8dS )    N)IterableListOptionalUnion)RequestOutputSamplingParams)	CacheConfigDeviceConfig
LoadConfig
LoadFormat
LoRAConfigObservabilityConfigParallelConfigSchedulerConfig
VllmConfig)initialize_ray_cluster)LoRARequest)	Scheduler)	LLMEngine)ITritonDeployable)cast_output)convert_lora_nemo_to_canonicalprepare_directory_for_export)NemoModelConfig)NemoModelLoaderNeMoc                    s    fdd}|S )z*Used as batch if pytriton is not supportedc                     s    | i |S )N )argskwargsfuncr   M/home/ubuntu/.local/lib/python3.10/site-packages/nemo/export/vllm_exporter.pywrapper5   s   znoop_decorator.<locals>.wrapperr   )r    r"   r   r   r!   noop_decorator1   s   r#   T)batch)TensorFc                &   @   s  e Zd ZdZdd Z													dHd
ededededededee deee  dedede	dede
dee de	fddZdedeee  dedefddZ				dId ed!ed"e
d#ed$e
d%ee defd&d'Zd(ee fd)d*Zd(ee fd+d,Zd-ejd.edefd/d0Zed1d2 Zed3d4 Zed-ejfd5d6Zed-ejfd7d8Z	9											:	:	:	:dJd;ee d!ed#ed$e
d"e
d<eee  d=eee  d>ee d?eee  d@eee  dAee dBe	dCe	dDe	dEe	deeee  eeee   f f dFdGZdS )KvLLMExportera  
    The vLLMExporter class implements conversion from a Nemo checkpoint format to something compatible with vLLM,
    loading the model in vLLM, and binding that model to a Triton server.

    Example:
        from nemo.export.vllm_exporter import vLLMExporter
        from nemo.deploy import DeployPyTriton

        exporter = vLLMExporter()

        exporter.export(
            nemo_checkpoint='/path/to/checkpoint.nemo',
            model_dir='/path/to/temp_dir',
            model_type='llama',
        )

        server = DeployPyTriton(
            model=exporter,
            triton_model_name='LLAMA',
        )

        server.deploy()
        server.serve()
    c                 C   s   d| _ tjs
J dd S )Nr   zOnly vLLM V1 is supported)
request_idenvsVLLM_USE_V1)selfr   r   r!   __init__^   s   zvLLMExporter.__init__auto   Nr   T?nemo_checkpoint	model_dir
model_typedevicetensor_parallel_sizepipeline_parallel_sizemax_model_lenlora_checkpointsdtypeseed	log_statsweight_storagegpu_memory_utilizationquantizationdelete_existing_filesc                  C   s  t ||d t|}|dv sJ t|||d|	|
ddd||ddd}|jddr-td t||d	}|jrht	j
t	j
|jd
shtt	j
|jd
d}tj|j |dd W d   n1 scw   Y  |durw|dv rwtd d}t	j
|jd}t	j
|ot	j
|t	j
|k}|dkr|jdur| }d}n)d}d}n$|dkr| }d}n|dkrd}d}n|dkrd}d}ntd| d|rt|| n
|std|  td|dd| d}tdd|jdddtd}t|rtntjddd}| j |||j!d }|jd!krt"| dd"l#m$} |}n%|jd#kr*dd$l%m&} |}n|jd%ks8|j'd&ks8J dd'l(m)} |}t*t+|||||||t, d(||d)| _-dS )*a  
        Exports the Nemo checkpoint to vLLM and initializes the engine.

        Args:
            nemo_checkpoint (str): path to the nemo checkpoint.
            model_dir (str): path to a temporary directory to store weights and the tokenizer model.
                The temp dir may persist between subsequent export operations, in which case
                converted weights may be reused to speed up the export.
            model_type (str): type of the model, such as "llama", "mistral", "mixtral".
                Needs to be compatible with transformers.AutoConfig.
            device (str): type of the device to use by the vLLM engine.
                Supported values are "auto", "cuda", "cpu", "neuron".
            tensor_parallel_size (int): tensor parallelism.
            pipeline_parallel_size (int): pipeline parallelism.
                Values over 1 are not currently supported by vLLM.
            max_model_len (int): model context length.
            lora_checkpoints List[str]: paths to LoRA checkpoints.
            dtype (str): data type for model weights and activations.
                Possible choices: auto, half, float16, bfloat16, float, float32
                "auto" will use FP16 precision for FP32 and FP16 models,
                and BF16 precision for BF16 models.
            seed (int): random seed value.
            log_stats (bool): enables logging inference performance statistics by vLLM.
            weight_storage (str): controls how converted weights are stored:
                "file" - always write weights into a file inside 'model_dir',
                "memory" - always do an in-memory conversion,
                "cache" - reuse existing files if they are newer than the nemo checkpoint,
                "auto" - use "cache" for multi-GPU runs and "memory" for single-GPU runs.
            gpu_memory_utilization (float): The fraction of GPU memory to be used for the model
                executor, which can range from 0 to 1.
            quantization (str): quantization method that is used to quantize the model weights.
                Possible choices are None (weights not quantized, default) and "fp8".
            delete_existing_files (bool): if True, deletes all the files in model_dir.
        )r=   >   Nfp8r,   NF)
tokenizer_moder7   r8   revisioncode_revisiontokenizer_revisionr5   r<   quantization_param_pathenforce_eagerr>   z\NeMo FP8 checkpoint detected, but exporting FP8 quantized engines is not supported for vLLM.)r4   r3   zconfig.jsonw   )indent>   r,   memoryz4Setting weight_storage = "file" for FP8 quantizationfilezmodel.safetensorsTcacherH   z'Unsupported value for weight_storage: ""zUsing cached weights in       )
block_sizer;   
swap_spacecache_dtypesliding_window   r           )max_num_batched_tokensmax_num_seqsr5   num_lookahead_slotsdelay_factorenable_chunked_prefillscheduler_cls)load_formatdownload_dirmodel_loader_extra_config)r0   r6   r7   ray)RayDistributedExecutormp)MultiprocExecutorunir-   )UniProcExecutor)model_configcache_configparallel_configscheduler_configdevice_configload_configlora_configobservability_config)vllm_configexecutor_classr9   ).r   r	   r   nemo_model_configgetLOGGERwarningr   modelospathexistsjoinopenjsondumphf_text_configto_dictgetmtimedistributed_executor_backend
ValueErrorr   convert_and_store_nemo_weightsinfor   get_sliding_windowr   r5   V1Schedulerr
   r   SAFETENSORS_prepare_lora_checkpointsr7   r   )vllm.v1.executor.ray_distributed_executorr^   #vllm.v1.executor.multiproc_executorr`   
world_sizevllm.v1.executor.abstractrb   r   r   r   engine) r*   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   rg   rc   re   fsafetensors_filesafetensors_file_validsave_weightsinmemory_weight_conversionrd   rf   rh   ri   r^   rl   r`   rb   r   r   r!   exportb   s   4


	
zvLLMExporter.exportreturnc                 C   s   g | _ |sd S d}d}|D ]F}tj|std| dtj|d| }td| d| d t||dd	\}}	| j 	| |	d
 d d }
t
||
}|d7 }qt|t| j |dS )Nr   zLoRA checkpoint file 'z does not exist'lora_zConverting LoRA checkpoint 'z' into 'z'...T)	hf_formatpeftlora_tuningadapter_dimr-   )max_lora_rank	max_loras
lora_dtype)r6   rr   rs   isfileFileNotFoundErrorru   ro   r   r   appendmaxr   len)r*   r0   r6   r7   indexr   	nemo_filehf_lora_dir_ri   rankr   r   r!   r   '  s    

z&vLLMExporter._prepare_lora_checkpoints      ?rS   promptmax_output_lentemperaturetop_ktop_plora_uidc           
      C   s   |dkrd}t ||t||d}|d ur/|dkr/|t| jk r/td| |d | j| d}nd }t| j}	|  jd7  _| jj|	|||d |	S )	NrS   r   )
max_tokensr   r   r   r   LoRA_r-   )	lora_namelora_int_idlora_local_path)lora_request)	r   intr   r6   r   strr'   r   add_request)
r*   r   r   r   r   r   r   sampling_paramsr   r'   r   r   r!   _add_request_to_engineC  s   	
z#vLLMExporter._add_request_to_enginerequest_idsc              	   C   s   d gt | }dgt | }t|sE| j }|D ]'}|jsqz||j}W n	 ty0   Y qw |j||< |jd j	}|||< qt|rdd |D S )NFc                 S      g | ]}|gqS r   r   .0responser   r   r!   
<listcomp>u      z1vLLMExporter._forward_regular.<locals>.<listcomp>)
r   allr   stepfinishedr   r'   r}   outputstextr*   r   	responsesr   request_outputsrequest_outputrequest_indexoutput_textr   r   r!   _forward_regulara  s"   


zvLLMExporter._forward_regularc              	   c   s    d gt | }dgt | }t|sL| j }|D ]#}z||j}W n	 ty-   Y qw |j||< |jd j	}|||< qdd |D V  t|rd S d S )NFr   c                 S   r   r   r   r   r   r   r!   r     r   z3vLLMExporter._forward_streaming.<locals>.<listcomp>)
r   r   r   r   r   r'   r}   r   r   r   r   r   r   r!   _forward_streamingw  s    


zvLLMExporter._forward_streaminginputsr   c                 C   s   d|v rt tjj|d | d dd}nd }| j|d | d d|d | d |d | d |d	 | d |d
 | d |dS )N	lora_uidsr   zutf-8)encodingpromptszUTF-8r   r   r   r   r   r   r   r   r   r   )r   numpychardecoder   )r*   r   r   r   r   r   r!   _add_triton_request_to_engine  s   "z*vLLMExporter._add_triton_request_to_enginec                 C   s   t ddtdt ddtjddt ddtjddt ddtjddt d	dtjddt d
dtddt ddtjddt ddtjddf}|S )Nr   r   nameshaper7   r   T)r   r   r7   optionalr   r   r   r   output_generation_logitsoutput_context_logits)r%   bytesr   int_singlebool_)r*   r   r   r   r!   get_triton_input  s   
zvLLMExporter.get_triton_inputc                 C   s   t ddtdf}|S )Nr   r   r   )r%   r   )r*   r   r   r   r!   get_triton_output  s   zvLLMExporter.get_triton_outputc                 K   s`   g }t |d }t|D ]}| ||}|| q| |}dd |D }t|tj}d|iS )zS
        This function is used to perform inference on a batch of prompts.
        r   c                 S      g | ]}|d  qS r   r   r   rr   r   r!   r         z0vLLMExporter.triton_infer_fn.<locals>.<listcomp>r   )r   ranger   r   r   r   r   bytes_r*   r   r   num_requestsr   r'   r   output_tensorr   r   r!   triton_infer_fn  s   
zvLLMExporter.triton_infer_fnc                 k   sn    g }t |d }t|D ]}| ||}|| q| |D ]}dd |D }t|tj}d|iV  q dS )zG
        This function is used to perform streaming inference.
        r   c                 S   r   r   r   r   r   r   r!   r     r   z:vLLMExporter.triton_infer_fn_streaming.<locals>.<listcomp>r   N)r   r   r   r   r   r   r   r   r   r   r   r!   triton_infer_fn_streaming  s   z&vLLMExporter.triton_infer_fn_streaming@   Finput_textsstop_words_listbad_words_listno_repeat_ngram_sizetask_idsr   !prompt_embeddings_checkpoint_path	streamingoutput_log_probsr   r   c              	   C   s  |dur|g krt d|dur|g krt d|dur t d|	dur,|	g kr,t d|dur4t d|dur<t d|rBt d|rHt d	|rNt d
g }tt|D ]'}|| }|
durk|t|
k rk|
| }nd}| j||||||d}|| qV|r| |S | |S )a  
        The forward function performs LLM evaluation on the provided array of prompts with other parameters shared,
        and returns the generated texts. If 'streaming' is True, the output texts are returned incrementally
        with a generator: one token appended to each output at a time. If 'streaming' is false, the final output texts
        are returned as a single list of responses.
        Nz stop_words_list is not supportedzbad_words_list is not supportedz%no_repeat_ngram_size is not supportedztask_ids is not supportedz(prompt_embeddings_table is not supportedz2prompt_embeddings_checkpoint_path is not supportedz!output_log_probs is not supportedz)output_generation_logits is not supportedz&output_context_logits is not supportedr   )NotImplementedErrorr   r   r   r   r   r   )r*   r   r   r   r   r   r   r   r   r   r   prompt_embeddings_tabler   r   r   r   r   r   r   r   r   r'   r   r   r!   forward  sH   


zvLLMExporter.forward)r,   r-   r-   NNr,   r   Tr,   r.   NT)r   r-   rS   N)r   r-   rS   r   NNNNNNNFFFF)__name__
__module____qualname____doc__r+   r   r   r   r   boolfloatr   r   r   r   r   r   r   ndarrayr   propertyr   r   r$   r   r   r   r   r   r   r   r   r!   r&   D   s   	
	

 F

 




	


r&   )9rw   loggingos.pathrr   typingr   r   r   r   r   	vllm.envsr(   wraptvllmr   r   vllm.configr   r	   r
   r   r   r   r   r   r   vllm.executor.ray_utilsr   vllm.lora.requestr   vllm.v1.core.sched.schedulerr   r   vllm.v1.engine.llm_enginer   nemo.deployr   nemo.deploy.utilsr   nemo.export.utilsr   r   nemo.export.vllm.model_configr   nemo.export.vllm.model_loaderr   	getLoggerro   	decoratorr#   r$   use_pytritonpytriton.decoratorspytriton.model_configr%   	Exceptionr&   r   r   r   r!   <module>   s<   ,

	