o
    }oiG[                     @   s   d dl Z d dlZd dlmZ d dlmZmZ d dlZd dl	m
Z
 dZz
d dlmZmZ W n ey7   dZY nw G dd	 d	eZG d
d deZG dd deZG dd deZdS )    N)ABC)ListOptional)str_list2numpyT)DecoupledModelClientModelClientFc                   @   s   e Zd ZdZdd ZdS )NemoQueryLLMBasez
    Abstract base class for querying a Large Language Model (LLM).

    Args:
    url (str): The URL of the inference server.
    model_name (str): The name of the model to be queried.
    c                 C   s   || _ || _d S )Nurl
model_nameselfr
   r    r   M/home/ubuntu/.local/lib/python3.10/site-packages/nemo/deploy/nlp/query_llm.py__init__(   s   
zNemoQueryLLMBase.__init__N)__name__
__module____qualname____doc__r   r   r   r   r   r      s    r   c                "       s   e Zd ZdZ fddZ															ddee dee d	ee	 d
ee
 dee	 dee	 dee dee dee deee  dee
 dee
 dedee
 de	dee f ddZ  ZS )NemoQueryLLMPyTorcha  
    Sends a query to Triton for LLM inference

    Example:
        from nemo.deploy import NemoTritonQueryLLMPyTorch

        nq = NemoTritonQueryLLMPyTorch(url="localhost", model_name="GPT-2B")

        prompts = ["hello, testing GPT inference", "another GPT inference test?"]
        output = nq.query_llm(
            prompts=prompts,
            max_length=100,
            top_k=1,
            top_p=0.0,
            temperature=0.0,
        )
        print("prompts: ", prompts)
    c                       t  j||d d S Nr	   superr   r   	__class__r   r   r   A      
zNemoQueryLLMPyTorch.__init__NF      N@prompts
use_greedytemperaturetop_ktop_prepetition_penaltyadd_BOS	all_probscompute_logprobend_strings
min_length
max_lengthapply_chat_templaten_top_logprobsinit_timeoutechoc                 C   s<  t |}d|i}|durtj|j|tjd|d< |dur(tj|j|tjd|d< |dur8tj|j|tjd|d< |durHtj|j|tjd|d< |durXtj|j|tjd|d< |durhtj|j|tjd|d	< |durxtj|j|tjd|d
< |	durtj|j|	tjd|d< |
durt |
|d< |durtj|j|tjd|d< |durtj|j|tjd|d< |durtj|j|tjd|d< |durtj|j|tjd|d< |durtj|j|tjd|d< t| j| j	|dd}|j
d$i |}|jjd j}d}d| v r	|d }d}d| v r|d }|tjkrd| v r(|d }n		 W d   dS tj|dd}dtt  dtt | j	d|igd}|duri |d d d < ||d d d  d!< |durd"d# |D }||d d d  d< |W  d   S |d W  d   S 1 sw   Y  dS )%a  
        Query the Triton server synchronously and return a list of responses.

        Args:
            prompts (List(str)): list of sentences.
            use_greedy (bool): use greedy sampling, effectively the same as top_k=1
            temperature (float): A parameter of the softmax function, which is the last layer in the network.
            top_k (int): limits us to a certain number (K) of the top tokens to consider.
            top_p (float): limits us to the top tokens within a certain probability mass (p).
            repetition_penalty (float): penalty applied to repeated sequences, 1.0 means no penalty.
            add_BOS (bool): whether or not to add a BOS (beginning of sentence) token.
            all_probs (bool): when using compute_logprob, returns probabilities for all tokens in vocabulary.
            compute_logprob (bool): get back probabilities of all tokens in the sequence.
            end_strings (List(str)): list of strings which will terminate generation when they appear in the output.
            min_length (int): min generated tokens.
            max_length (int): max generated tokens.
            apply_chat_template (bool): applies chat template if its a chat model. Default: False
            init_timeout (flat): timeout for the connection.
        r   Ndtyper   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r-   iX  )init_timeout_sinference_timeout_sr   	log_probstop_logprobs	sentencesUnknown output keyword.bytesutf-8cmpl-text_completiontextidobjectcreatedmodelchoicesr@   logprobstoken_logprobsc                 S   s   g | ]	}t |d  qS )r   )jsonloads).0top_log_probr   r   r   
<listcomp>   s    z1NemoQueryLLMPyTorch.query_llm.<locals>.<listcomp>r   )r   npfullshapebool_singleint_r   r
   r   infer_batchmodel_configoutputsr/   keysbytes_chardecodeastypeinttime)r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   inputsclientresult_dictoutput_typelog_probs_outputtop_log_probs_outputoutputr4   openai_responsen_log_probs_outputr   r   r   	query_llmI   s|   &



&&zNemoQueryLLMPyTorch.query_llm)NNNNNNNNNNNFNr   Nr   r   r   r   r   r   strr   boolfloatrV   ra   __classcell__r   r   r   r   r   -   sf    	

r   c                       s   e Zd ZdZ fddZ													ddee dee dee	 d	ee
 d
ee	 dee	 dee dee dee dee deee  dee
 dee
 de	fddZ  ZS )NemoQueryLLMHFa  
    Sends a query to Triton for LLM inference

    Example:
        from nemo.deploy import NemoQueryLLMHF

        nq = NemoQueryLLMHF(url="localhost", model_name="GPT-2B")

        prompts = ["hello, testing GPT inference", "another GPT inference test?"]
        output = nq.query_llm(
            prompts=prompts,
            max_length=100,
            top_k=1,
            top_p=0.0,
            temperature=0.0,
        )
        print("prompts: ", prompts)
    c                    r   r   r   r   r   r   r   r      r   zNemoQueryLLMHF.__init__Nr   r   r   r    r!   r"   r#   r$   r%   output_logitsoutput_scoresr'   r(   r)   r,   c                 C   s  t |}d|i}|durtj|j|tjd|d< |dur(tj|j|tjd|d< |dur8tj|j|tjd|d< |durHtj|j|tjd|d< |durXtj|j|tjd|d< |durhtj|j|tjd|d	< |durxtj|j|tjd|d
< |	durtj|j|	tjd|d< |
durtj|j|
tjd|d< |durt ||d< |durtj|j|tjd|d< |durtj|j|tjd|d< t| j| j	|d{}|j
di |}|jjd j}|tjkr;d| v r|d }n		 W d   dS tj|dd}dtt  dtt | j	d|igd}|	r$d|v r$|d |d< |
r2d|v r2|d |d< |W  d   S |d W  d   S 1 sJw   Y  dS )a]  
        Query the Triton server synchronously and return a list of responses.

        Args:
            prompts (List[str]): list of sentences.
            use_greedy (Optional[bool]): use greedy sampling, effectively the same as top_k=1
            temperature (Optional[float]): A parameter of the softmax function, which is the last layer in the network.
            top_k (Optional[int]): limits us to a certain number (K) of the top tokens to consider.
            top_p (Optional[float]): limits us to the top tokens within a certain probability mass (p).
            repetition_penalty (Optional[float]): penalty applied to repeated sequences, 1.0 means no penalty.
            add_BOS (Optional[bool]): whether or not to add a BOS (beginning of sentence) token.
            all_probs (Optional[bool]): when using compute_logprob, returns probabilities for all tokens in vocabulary.
            output_logits (Optional[bool]): whether to return logits for each token
            output_scores (Optional[bool]): whether to return scores for each token
            end_strings (Optional[List[str]]): list of strs which will stop generation when they appear in the output.
            min_length (Optional[int]): min generated tokens.
            max_length (Optional[int]): max generated tokens.
            init_timeout (float): timeout for the connection.
        r   Nr.   r   r    r!   r"   r#   r$   r%   rh   ri   r'   r(   r)   r0   r   r4   r5   r6   r7   r8   r9   r:   r;   logitsscoresr   )r   rH   rI   rJ   rK   rL   rM   r   r
   r   rN   rO   rP   r/   rR   rQ   rS   rT   rU   rV   rW   )r   r   r   r    r!   r"   r#   r$   r%   rh   ri   r'   r(   r)   r,   rX   rY   rZ   r[   r^   r4   r_   r   r   r   ra      sd   $


&zNemoQueryLLMHF.query_llm)NNNNNNNNNNNNr   rb   r   r   r   r   rg      sZ    	

rg   c                       s   e Zd ZdZ fddZ																					ddeded	ed
ededededefddZ											dddZ  Z	S )NemoQueryLLMa  
    Sends a query to Triton for LLM inference

    Example:
        from nemo.deploy import NemoQueryLLM

        nq = NemoQueryLLM(url="localhost", model_name="GPT-2B")

        prompts = ["hello, testing GPT inference", "another GPT inference test?"]
        output = nq.query_llm(
            prompts=prompts,
            max_output_len=100,
            top_k=1,
            top_p=0.0,
            temperature=0.0,
        )
        print("prompts: ", prompts)
    c                    r   r   r   r   r   r   r   r   E  r   zNemoQueryLLM.__init__Nr   Fr   r#   r$   r%   r&   openai_format_responseoutput_context_logitsoutput_generation_logitsc                 C   s  t |}d|i}|durtj|j|tjd|d< |dur(tj|j|tjd|d< |dur8tj|j|tjd|d< |durHtj|j|tjd|d< |	durXtj|j|	tjd|d< |
durhtj|j|
tjd|d	< |durrt ||d
< |dur|t ||d< |durtj|j|tjd|d< |durtj|d}t|jd t|gf||d< |durtj|d}t|jd t|f||d< |durtj|j|tj	d|d< |durtj|j|tjd|d< |durtj|j|tj	d|d< |durtj|j|tj	d|d< |durtj|j|tj	d|d< |durt ||d< |dur/tj|j|tj	d|d< |dur@tj|j|tj	d|d< t
| j| j|d}|jd%i |}|jjd j}|tjkrd| v rk|d }nd| v rw|d }n		 W d   dS tj|dd}|rdtt  dtt | jd |igd!}|r|d" |d# d d"< |r|d$ |d# d d$< |W  d   S |W  d   S |d W  d   S 1 sw   Y  dS )&a  
        Query the Triton server synchronously and return a list of responses.

        Args:
            prompts (List(str)): list of sentences.
            max_output_len (int): max generated tokens.
            top_k (int): limits us to a certain number (K) of the top tokens to consider.
            top_p (float): limits us to the top tokens within a certain probability mass (p).
            temperature (float): A parameter of the softmax function, which is the last layer in the network.
            random_seed (int): Seed to condition sampling.
            stop_words_list (List(str)): list of stop words.
            bad_words_list (List(str)): list of bad words.
            no_repeat_ngram_size (int): no repeat ngram size.
            task_id (str): downstream task id if virtual tokens are used.
            init_timeout (flat): timeout for the connection.
            openai_format_response: return response similar to OpenAI API format
            output_generation_logits: return generation logits from model on PyTriton
        r   Nr.   min_output_lenmax_output_lenr!   r"   r    random_seedstop_words_listbad_words_listno_repeat_ngram_sizer7   r   task_id	lora_uidsr   r#   r$   r%   r&   r'   ro   rp   rj   rP   r4   r5   r6   r8   r9   r:   r;   generation_logitsr@   context_logitsr   )r   rH   rI   rJ   rM   rL   rS   encodelenrK   r   r
   r   rN   rO   rP   r/   rR   rQ   rT   rU   rV   rW   )r   r   rt   ru   rv   rq   rr   r!   r"   r    rs   rw   rx   r   r#   r$   r%   r&   r'   r,   rn   ro   rp   rX   rY   rZ   r[   r^   r4   r_   r   r   r   ra   K  s   , 







&zNemoQueryLLM.query_llm                    ?c                 c   s8   t |}d|i}|durtj|j|tjd|d< |dur)tj|j|tjd|d< |dur9tj|j|tjd|d< |durItj|j|tjd|d< |	durYtj|j|	tjd|d< |durstj|d	}t|jd
 t|f||d< |durtj|d	}t|jd
 t|f||d< |durtj|j|tjd|d< |
durtj|
d	}
t|jd
 t|
gf|
|d< |durtj|d	}t|jd
 t|f||d< t	| j
| j|d6}|jdi |D ]$}|jjd
 j}|tjkrtj|d dd	}|V  q|d V  qW d   dS 1 sw   Y  dS )aS  
        Query the Triton server using streaming.

        Args:
            prompts (List(str)): list of sentences.
            max_output_len (int): max generated tokens.
            top_k (int): limits us to a certain number (K) of the top tokens to consider.
            top_p (float): limits us to the top tokens within a certain probability mass (p).
            temperature (float): A parameter of the softmax function, which is the last layer in the network.
            random_seed (int): Seed to condition sampling.
            stop_words_list (List(str)): list of stop words.
            bad_words_list (List(str)): list of bad words.
            no_repeat_ngram_size (int): no repeat ngram size.
            task_id (str): downstream task id if virtual tokens are used.
            init_timeout (flat): timeout for the connection.
        r   Nr.   rr   r!   r"   r    rs   r7   r   rt   ru   rv   rw   rx   rj   rP   r6   r   )r   rH   rI   rJ   rM   rL   rS   r{   r|   r   r
   r   rN   rO   rP   r/   rR   rT   rU   )r   r   rt   ru   rv   rr   r!   r"   r    rs   rw   rx   r,   rX   rY   partial_result_dictr[   r4   r   r   r   query_llm_streaming  sH     $z NemoQueryLLM.query_llm_streaming)NNNNNNNNNNNNNNNNNr   FFF)NNNr}   r~   r   r   NNNr   )
r   r   r   r   r   rd   re   ra   r   rf   r   r   r   r   rm   1  sl    	
 rm   )rC   rW   abcr   typingr   r   numpyrH   nemo.deploy.utilsr   use_pytritonpytriton.clientr   r   	Exceptionr   r   rg   rm   r   r   r   r   <module>   s"    x