o
    wia@                     @   s>  d dl Z d dlZd dlmZ d dlmZmZ d dlZd dl	Z	d dl
Z	d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZm Z  ej!dd Z"dZ#e"Z$zd dl%m$Z$m&Z& d dl'm(Z( W n e)y   dZ#Y nw e*dZ+G dd dZ,dd Z-G dd deZ.dS )    N)Path)ListOptional)Template)StrictHandling)CommonInferenceParams)InferenceRequest)	inference)ITritonDeployable)NEMO2broadcast_listcast_outputnemo_checkpoint_versionstr_ndarray2listc                    s    fdd}|S )a  A no-op decorator that returns the original function unchanged.

    Used as a fallback when pytriton's batch decorator is not available.

    Args:
        func: The function to decorate

    Returns:
        The original function without any modifications
    c                     s    | i |S )z4
        Wrapper method returning the func.
         )argskwargsfuncr   c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/deploy/nlp/megatronllm_deployable.pywrapper0   s   znoop_decorator.<locals>.wrapperr   )r   r   r   r   r   noop_decorator#   s   r   T)batchfirst_value)TensorFNeMoc                   @   s^   e Zd ZdZe									ddededed	ed
edededee dedefddZ	dS )MegatronLLMDeployz
    A factory class for creating deployable instances of Megatron LLM models.
    This class provides a method to get the appropriate deployable instance
    based on the version of the NeMo checkpoint model used.
           NFnemo_checkpoint_filepathnum_devices	num_nodestensor_model_parallel_sizepipeline_model_parallel_sizecontext_parallel_sizemax_batch_sizerandom_seedenable_flash_decodelegacy_ckptc
           
      C   s0   t | tkrt| |||||||||	d
S td)a  
        Returns the appropriate deployable instance for the given NeMo checkpoint.

        Args:
            nemo_checkpoint_filepath (str): Path to the .nemo checkpoint file.
            num_devices (int): Number of devices to use for deployment.
            num_nodes (int): Number of nodes to use for deployment.
            tensor_model_parallel_size (int): Size of the tensor model parallelism.
            pipeline_model_parallel_size (int): Size of the pipeline model parallelism.
            context_parallel_size (int): Size of the context parallelism.
            enable_flash_decode (bool): Whether to enable flash decode for inference.

        Returns:
            ITritonDeployable: An instance of a deployable class compatible with Triton inference server.
        
r   r    r!   r"   r#   r$   r%   r&   r'   r(   z&Only NeMo 2.0 checkpoint is supported.)r   r   MegatronLLMDeployableNemo2	Exceptionr)   r   r   r   get_deployableK   s   z MegatronLLMDeploy.get_deployable)	r   r   r   r   r   r   NFF)
__name__
__module____qualname____doc__staticmethodstrintr   boolr,   r   r   r   r   r   D   sB    	
r   c                 C   s
   t | S )z 
    Serializes dict to str
    )jsondumps)messagesr   r   r   dict_to_strx      
r8   c                   @   s  e Zd ZdZddddddddejddddddfd	ed
edededededededejdededede	e de
de
fddZ	d6dee de	e dee fddZdd  Zd7d!d"Zd#d$ Zd%d& Zed'd( Zed)d* Zeed+dd,d-d.dd/d0d1d2
d3ejfd4d5ZdS )8r*   a  
    Triton inference server compatible deploy class for a .nemo model file

    Args:
        nemo_checkpoint_filepath (str): path for the nemo checkpoint.
        num_devices (int): number of GPUs.
        num_nodes (int): number of nodes.
        tensor_model_parallel_size (int): tensor parallelism.
        pipeline_parallelism_size (int): pipeline parallelism.
        context_parallel_size (int): context parallelism.
        params_dtype (torch.dtype): max input length.
        inference_batch_times_seqlen_threshold (int): squence threshold.
        inference_max_seq_length (int): max_seq_length for inference. Required by MCoreEngine (>=0.12). Defaults to
        4096.
        max_batch_size (int): max batch size for inference. Defaults to 32.
        random_seed (Optional[int]): random seed for inference. Defaults to None.
        enable_flash_decode (bool): enable flash decode for inference. Defaults to False.
    Nr   i  i   r   TFr   r    r!   r"   r#   r$   expert_model_parallel_sizeexpert_tensor_parallel_sizeparams_dtype&inference_batch_times_seqlen_thresholdinference_max_seq_lengthr%   r&   r'   r(   c                 C   s   || _ tj|||||ddd|rtjnd d	}tjd|||tjdtjtjdddd}t	j
t|||	|
||||d\| _| _| _d S )NF)	r"   r#   r$   r:   r;   sequence_parallelsetup_optimizersstore_optimizer_statesckpt_load_strictnessgpuz
bf16-mixed)	precisionr<   pipeline_dtypeautocast_enabledgrad_reduce_in_fp32)acceleratordevicesr!   strategyplugins)pathtrainerr<   r=   r>   r%   r&   r'   )r   nlMegatronStrategyr   LOG_ALLTrainerMegatronMixedPrecisiontorchbfloat16r	   setup_mcore_enginer   mcore_engineinference_wrapped_modelmcore_tokenizer)selfr   r    r!   r"   r#   r$   r:   r;   r<   r=   r>   r%   r&   r'   r(   rJ   rM   r   r   r   __init__   sF   z#MegatronLLMDeployableNemo2.__init__promptsinference_paramsreturnc                 C   s$   |pt  }| jj|d|d}t|S )a^  
        Generates text based on the provided input prompts.

        Args:
            prompts (List[str]): A list of input strings.
            inference_params (Optional[CommonInferenceParams]): Parameters for controlling the inference process.
        Returns:
            List[InferenceRequest]: A list containing the generated results.
        F)r[   add_BOScommon_inference_params)r   rV   generatelist)rY   r[   r\   resultsr   r   r   r`      s   
z#MegatronLLMDeployableNemo2.generatec           	      C   s   	 t jdt jdd}t jj|dd |dkr=tdgdd}tdgdd\}}}}}t|t|t|||d	}| 	|| ndS q)
zD
        Generate function for ranks other than the rank 0.
        Tr   cudadtypedevicer   srcNdatarh   )temperaturetop_ktop_pnum_tokens_to_generatereturn_log_probs)
rS   emptylongdistributed	broadcastr   r   r3   floatr`   )	rY   messager[   rk   rl   rm   rn   	log_probsr\   r   r   r   generate_other_ranks   s    z/MegatronLLMDeployableNemo2.generate_other_ranksc                 C   sP   z| j jjj}| j jjj}t|}W n ty   tdw |j|||d}|S )zy
        Load the chat template.
        Works when model's tokenizer has chat template (typically chat models).
        zThe tokenizer does not have chat template, if you would like to evaluate chat model                              ensure your model's tokenizer has a chat template)r7   	bos_tokenadd_generation_prompt)rX   	tokenizerchat_templaterx   r   AttributeError
ValueErrorrender)rY   r7   ry   tokenizer_chat_templaterx   templaterendered_outputr   r   r   apply_chat_template   s   z.MegatronLLMDeployableNemo2.apply_chat_templatec                 C   sH   | j jjj}g }|D ]}||v r|||dd  q
|| q
|S )zV
        Removes eos token if it exists in the output, otherwise does nothing
        r   r   )rX   rz   	eos_tokenappendrsplit)rY   textr   outputtr   r   r   remove_eos_token  s   z+MegatronLLMDeployableNemo2.remove_eos_tokenc                 C   s
   t |S )z&
        Convert str to dict.
        )r5   loads)rY   json_strr   r   r   str_to_dict  r9   z&MegatronLLMDeployableNemo2.str_to_dictc                 C   s   t ddtdt ddtjddt ddtjddt ddtjddt d	dtjddt d
dtjddt ddtjddt ddtjddt ddtjddt ddtjddt ddtjddf}|S )Nr[   nameshapere   
max_lengthT)r   r   re   optionalr%   rl   rm   rk   r&   compute_logprobr   n_top_logprobsecho)r   bytesnpint_singlebool_)rY   inputsr   r   r   get_triton_input"  s   z+MegatronLLMDeployableNemo2.get_triton_inputc                 C   s*   t ddtdt ddtjdt ddtdfS )N	sentencesr   r   rv   top_logprobs)r   r   r   r   rY   r   r   r   get_triton_output3  s   z,MegatronLLMDeployableNemo2.get_triton_outputr   rl   rm   rk   r   r   r   r   r   c                    s^  i }t |d}|dd}|dd}|dd}|dd	}|d
d}|dd}	|dd}
|dd}d|	rOfdd|D }fdd|D }tj r|tj dkr|tjjtjdgtjdddd t	|dd t	|||||gdd t
|t|t||||
d}||}|rfdd|D }n	fdd|D }|}dt|tji}|rg }|D ]2}|rt|j|j    }nt|j   }t|dkr|dg q|| q|r	tdd |D  t fdd|D }||d< nt||d< |
r-g }|D ]}t|j}|| qt|tj|d < |S )!Nr[   rk   g      ?rl   r   rm   g        r      r   Fr   r   r   r   Tc                       g | ]}  |qS r   )r   .0promptr   r   r   
<listcomp>W      z>MegatronLLMDeployableNemo2.triton_infer_fn.<locals>.<listcomp>c                    r   r   )r   r   r   r   r   r   X  r   rc   rd   rg   ri   )rk   rl   rm   rn   ro   top_n_logprobsc                    s    g | ]} r|j |j n|qS r   )r   generated_textr   r	text_onlyr   r   r   u  s     c                    s   g | ]	} r	|j n|qS r   )r   r   r   r   r   r   w  s    r   c                 s   s    | ]}t |V  qd S N)lenr   arrr   r   r   	<genexpr>  s    z=MegatronLLMDeployableNemo2.triton_infer_fn.<locals>.<genexpr>c                    s(   g | ]}t j|d  t| fd dqS )r   )constant_values)r   padr   r   )max_lenr   r   r     s   ( rv   r   )r   poprS   rr   is_initializedget_world_sizers   tensorrq   r   r   r3   rt   r`   r   r   r   bytes_prompt_log_probsgenerated_log_probscpudetachnumpyr   r   maxarrayr8   generated_top_n_logprobs)rY   r   output_inferr[   rk   rl   rm   rn   rv   r   r   r   r\   rb   output_textsoutput_log_probsr   lppaddedoutput_top_n_log_probstop_n_lpr   )r   rY   r   r   triton_infer_fn;  s   
 	
 

z*MegatronLLMDeployableNemo2.triton_infer_fnr   )T)r-   r.   r/   r0   rS   rT   r2   r3   re   r   r4   rZ   r   r   r   r`   rw   r   r   r   propertyr   r   r   r   r   ndarrayr   r   r   r   r   r*      s    	

:



r*   )/r5   loggingpathlibr   typingr   r   r   r   rS   torch.distributedwraptjinja2r   +megatron.core.dist_checkpointing.validationr   /megatron.core.inference.common_inference_paramsr   )megatron.core.inference.inference_requestr   nemo.lightning	lightningrN   nemo.collections.llmr	   nemo.deployr
   nemo.deploy.utilsr   r   r   r   r   	decoratorr   use_pytritonr   pytriton.decoratorsr   pytriton.model_configr   r+   	getLoggerLOGGERr   r8   r*   r   r   r   r   <module>   s<   

4