o
    }oi(                     @   s(  d dl Z d dlZd dlZd dlZd dlmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ G dd deZe Ze ZG d	d
 d
e	ZG dd deZG dd deZeddd Zeddd Zdd Zdd Zdd ZeddefddZdd  Zed!defd"d#Z dS )$    N)FastAPIHTTPException)	BaseModelmodel_validator)BaseSettings)NemoQueryLLMPyTorch)loggingc                       sJ   e Zd ZU dZeed< eed<  fddZedd Z	edd	 Z
  ZS )
TritonSettingsz[
    TritonSettings class that gets the values of TRITON_HTTP_ADDRESS and TRITON_PORT.
    _triton_service_port_triton_service_ipc              
      sl   t t|   zttjdd| _tjdd| _W d S  t	y5 } zt
d| W Y d }~d S d }~ww )NTRITON_PORTi@  TRITON_HTTP_ADDRESSz0.0.0.0zQAn exception occurred trying to retrieve set args in TritonSettings class. Error:)superr	   __init__intosenvirongetr
   r   	Exceptionr   error)selfr   	__class__ e/home/ubuntu/.local/lib/python3.10/site-packages/nemo/deploy/service/fastapi_interface_to_pytriton.pyr       s   zTritonSettings.__init__c                 C      | j S )zA
        Returns the port number for the Triton service.
        )r
   r   r   r   r   triton_service_port)      z"TritonSettings.triton_service_portc                 C   r   )z@
        Returns the IP address for the Triton service.
        )r   r   r   r   r   triton_service_ip0   r   z TritonSettings.triton_service_ip)__name__
__module____qualname____doc__r   __annotations__strr   propertyr   r   __classcell__r   r   r   r   r	      s   
 	
r	   c                   @   s\   e Zd ZU dZeed< dZeed< dZe	ed< dZ
e	ed< d	Zeed
< edddd ZdS )BaseRequesta  
    Common parameters for completions and chat requests for the server.

    Attributes:
        model (str): The name of the model to use for completion.
        max_tokens (int): The maximum number of tokens to generate in the response.
        temperature (float): Sampling temperature for randomness in generation.
        top_p (float): Cumulative probability for nucleus sampling.
        top_k (int): Number of highest-probability tokens to consider for sampling.
    modeli   
max_tokensg      ?temperatureg        top_pr   top_kafter)modec                 C   s(   | j dkr| jdkrtd d| _| S )z(Validate parameters for greedy decoding.r   zOBoth temperature and top_p are 0. Setting top_k to 1 to ensure greedy sampling.   )r+   r,   r   warningr-   r   r   r   r   set_greedy_paramsN   s   
zBaseRequest.set_greedy_paramsN)r    r!   r"   r#   r%   r$   r*   r   r+   floatr,   r-   r   r2   r   r   r   r   r(   <   s   
 r(   c                   @   s2   e Zd ZU dZeed< dZeed< dZe	ed< dS )CompletionRequesta4  
    Represents a request for text completion.

    Attributes:
        prompt (str): The input text to generate a response from.
        logprobs (int): Number of log probabilities to include in the response, if applicable.
        echo (bool): Whether to return the input text as part of the response.
    promptNlogprobsFecho)
r    r!   r"   r#   r%   r$   r6   r   r7   boolr   r   r   r   r4   W   s
   
 	r4   c                   @   s   e Zd ZU dZee ed< dS )ChatCompletionRequesta  
    Represents a request for chat completion.

    Attributes:
        messages (list[dict]): A list of message dictionaries for chat completion.
        logprobs (bool): Whether to return log probabilities for output tokens.
        top_logprobs (int): Number of log probabilities to include in the response, if applicable.
            logprobs must be set to true if this parameter is used.
    messagesN)r    r!   r"   r#   listdictr$   r   r   r   r   r9   f   s   
 
r9   z
/v1/healthc                   C   s   ddiS )z
    Health check endpoint to verify that the API is running.

    Returns:
        dict: A dictionary indicating the status of the application.
    statusokr   r   r   r   r   health_checkt   s   r?   z/v1/triton_healthc               
      s   dt j dtt j d} td|   ztj| dd}|jdkr(dd	iW S t	d
dd tj
yE } zt	d
dt| dd}~ww )aB  
    This method exposes endpoint "/triton_health" which can be used to verify if Triton server is accessible while
    running the REST or FastAPI application.
    Verify by running: curl http://service_http_address:service_port/v1/triton_health and the returned status should
    inform if the server is accessible.
    http://:z/v2/health/readyz+Attempting to connect to Triton server at:    )timeout   r=   z$Triton server is reachable and readyi  zTriton server is not ready)status_codedetailzCannot reach Triton server: N)triton_settingsr   r%   r   r   inforequestsr   rE   r   RequestException)
triton_urlresponseer   r   r   check_triton_health   s   	

rN   c                 C   sL   t | tjr
|  S t | trdd |  D S t | tr$dd | D S | S )z1
    Convert NumPy arrays in output to lists
    c                 S   s   i | ]	\}}|t |qS r   convert_numpy).0kvr   r   r   
<dictcomp>   s    z!convert_numpy.<locals>.<dictcomp>c                 S   s   g | ]}t |qS r   rO   )rQ   ir   r   r   
<listcomp>   s    z!convert_numpy.<locals>.<listcomp>)
isinstancenpndarraytolistr<   itemsr;   )objr   r   r   rP      s   

rP   c                 C   s.   t | |d}|j||||||||	d|
d
}|S )zn
    run_in_executor doesn't allow to pass kwargs, so we have this helper function to pass args as a list
    )url
model_namei,  )
promptsr+   r-   r,   compute_logprob
max_lengthapply_chat_templaten_top_logprobsinit_timeoutr7   )r   	query_llm)r]   r)   r_   r+   r-   r,   r`   ra   rb   rc   r7   nqoutputr   r   r   _helper_fun   s   rh   c                    sr   ddl }ddl}| }|j }||t| |||||||||	|
I dH }W d   |S 1 s2w   Y  |S )z
    Sends requests to `NemoQueryLLMPyTorch.query_llm` in a non-blocking way, allowing the server to process
    concurrent requests. This way enables batching of requests in the underlying Triton server.
    r   N)asyncio
concurrentget_event_loopfuturesThreadPoolExecutorrun_in_executorrh   )r]   r)   r_   r+   r-   r,   r`   ra   rb   rc   r7   ri   rj   looppoolresultr   r   r   query_llm_async   s0   
rr   z/v1/completions/requestc                    sZ  dt j dt j }td|   | j}t| jts | jg}t|| j	|| j
| j| j| jduo4| jdk| jd| j| jdI dH }t|}|d d d	 d d |d d d	< | jdur| jdkr|d d d
 d d |d d d
 d< |d d d
 d d |d d d
 d< | jr|d d d
 d dd nd|d d d
< td|  |S )z]
    Defines the completions endpoint and queries the model deployed on PyTriton server.
    r@   rA   	Request: Nr   Fr]   r)   r_   r+   r-   r,   r`   ra   rb   rc   r7   choicestextr6   token_logprobstop_logprobsOutput: )rG   r   r   r   rH   r5   rW   r;   rr   r)   r+   r-   r,   r6   r*   r7   rP   insert)rs   r]   r_   rg   output_serializabler   r   r   completions_v1   sP   $
r}   c                 C   s
   t | S )z 
    Serializes dict to str
    )jsondumps)r:   r   r   r   dict_to_str  s   
r   z/v1/chat/completions/c                    s  dt j dt j }td|   | j}t| jts | jg}t|g}t	|| j
|| j| j| jd| jddddI dH }d|d	 d
 d d|d	 d
 d< d|d< d|d	 d
 d< |d	 d
 d= t|}|d	 d
 d d d
 d
 |d	 d
 d d< td|  |S )zb
    Defines the chat completions endpoint and queries the model deployed on PyTriton server.
    r@   rA   rt   FTNru   	assistantrv   r   rw   )rolecontentmessagezchat.completionobjectr6   r   rz   )rG   r   r   r   rH   r:   rW   r;   r   rr   r)   r+   r-   r,   r*   rP   )rs   r]   r_   json_promptsrg   r|   r   r   r   chat_completions_v1  s@   
"r   )!r~   r   numpyrX   rI   fastapir   r   pydanticr   r   pydantic_settingsr   nemo.deploy.nlpr   
nemo.utilsr   r	   apprG   r(   r4   r9   r   r?   rN   rP   rh   rr   postr}   r   r   r   r   r   r   <module>   s6   
 


 )*