o
    }oi                     @   s\   d Z zddlmZ ddlmZmZ W n ey   dZ Y nw ddlmZ G dd deZ	dS )	T    )ModelConfig)TritonTritonConfigF)
DeployBasec                       sr   e Zd ZdZ												
ddedededededef fddZdd Zdd Zdd Z	dd Z
  ZS )DeployPyTritona  
    Deploys any models to Triton Inference Server that implements ITritonDeployable interface in nemo.deploy.

    Example:
        from nemo.deploy import DeployPyTriton, NemoQueryLLM
        from nemo.export.tensorrt_llm import TensorRTLLM

        trt_llm_exporter = TensorRTLLM(model_dir="/path/for/model/files")
        trt_llm_exporter.export(
            nemo_checkpoint_path="/path/for/nemo/checkpoint",
            model_type="llama",
            tensor_parallelism_size=1,
        )

        nm = DeployPyTriton(model=trt_llm_exporter, triton_model_name="model_name", http_port=8000)
        nm.deploy()
        nm.run()
        nq = NemoQueryLLM(url="localhost", model_name="model_name")

        prompts = ["hello, testing GPT inference", "another GPT inference test?"]
        output = nq.query_llm(prompts=prompts, max_output_len=100)
        print("prompts: ", prompts)
        print("")
        print("output: ", output)
        print("")

        prompts = ["Give me some info about Paris", "Do you think Londan is a good city to visit?", "What do you think about Rome?"]
        output = nq.query_llm(prompts=prompts, max_output_len=250)
        print("prompts: ", prompts)
        print("")
        print("output: ", output)
        print("")

       N   @  A  0.0.0.0TFr   triton_model_nametriton_model_versioncheckpoint_pathmax_batch_size	http_port	grpc_portc                    s(   t  j|||||||||	|
||d dS )aH  
        A nemo checkpoint or model is expected for serving on Triton Inference Server.

        Args:
            triton_model_name (str): Name for the service
            triton_model_version(int): Version for the service
            checkpoint_path (str): path of the nemo file
            model (ITritonDeployable): A model that implements the ITritonDeployable from nemo.deploy import ITritonDeployable
            max_batch_size (int): max batch size
            port (int) : port for the Triton server
            address (str): http address for Triton server to bind.
        )r   r   r   modelr   r   r   address
allow_grpc
allow_http	streamingpytriton_log_verboseN)super__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   	__class__ O/home/ubuntu/.local/lib/python3.10/site-packages/nemo/deploy/deploy_pytriton.pyr   >   s   
zDeployPyTriton.__init__c              
   C   s   |    z`| jr4t| j| j| j| jd}t|d| _| jj	| j
| j| jj| jj| jjtddd W dS t| j| j| j| j| j| jd}t|d| _| jj	| j
| j| jj| jj| jjt| jdd W dS  ty~ } zd| _t| W Y d}~dS d}~ww )	z@
        Deploys any models to Triton Inference Server.
        )log_verboser   r   grpc_address)configT)	decoupled)
model_namemodel_version
infer_funcinputsoutputsr!   )http_addressr   r    r   r   r   )r   N)_init_nemo_modelr   r   r   r   r   r   r   tritonbindr   r   r   triton_infer_fn_streamingget_triton_inputget_triton_outputr   r   r   triton_infer_fnr   	Exceptionprint)r   triton_configer   r   r   deployi   sP   	
zDeployPyTriton.deployc              
   C   sX   | j du r	tdz| j   W dS  ty+ } zd| _ t| W Y d}~dS d}~ww )zE
        Starts serving the model and waits for the requests
        Ndeploy should be called first.)r*   r0   server1   )r   r3   r   r   r   r6      s   
zDeployPyTriton.servec                 C       | j du r	td| j   dS )z:
        Starts serving the model asynchronously.
        Nr5   )r*   r0   runr   r   r   r   r8         
zDeployPyTriton.runc                 C   r7   )z*
        Stops serving the model.
        Nr5   )r*   r0   stopr9   r   r   r   r;      r:   zDeployPyTriton.stop)r   NNr   r	   r
   r   TTFr   )__name__
__module____qualname____doc__strintr   r4   r6   r8   r;   __classcell__r   r   r   r   r      s<    &+/
r   N)
use_pytritonpytriton.model_configr   pytriton.tritonr   r   r0   nemo.deploy.deploy_baser   r   r   r   r   r   <module>   s   