o
    ci*                     @   s`  d dl mZmZ d dlmZmZmZ	m
ZmZ d dlmZ d dlmZ d dlmZ er0d dlmZ eddG d	d
 d
eZeddG dd de	ZeddG dd deZeddG dd deZeddG dd deZ
edddddd
dee ddfddZeddd#ddZeddG dd deZeddG d d! d!eZg d"ZdS )$    )TYPE_CHECKINGOptional)CloudMirrorConfig	LLMConfigLLMServingArgs
LoraConfigModelLoadingConfig)	LLMServer)	LLMRouter)	PublicAPI)Applicationalpha)	stabilityc                   @      e Zd ZdZdS )r   z1The configuration for starting an LLM deployment.N__name__
__module____qualname____doc__ r   r   J/home/ubuntu/.local/lib/python3.10/site-packages/ray/serve/llm/__init__.pyr          r   c                   @   r   )r   z=The configuration for starting an LLM deployment application.Nr   r   r   r   r   r   "   r   r   c                   @   r   )r   z+The configuration for loading an LLM model.Nr   r   r   r   r   r   )   r   r   c                   @   r   )r   z@The configuration for mirroring an LLM model from cloud storage.Nr   r   r   r   r   r   0   r   r   c                   @   r   )r   z5The configuration for loading an LLM model with LoRA.Nr   r   r   r   r   r   7   r   r   N)name_prefix
llm_configr   returnr   c                C   s   ddl m} || |dS )a  Helper to build a single vllm deployment from the given llm config.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig, build_llm_deployment

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    model_id="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Build the deployment
            llm_app = build_llm_deployment(llm_config)

            # Deploy the application
            model_handle = serve.run(llm_app)

            # Querying the model handle
            import asyncio
            model_handle = model_handle.options(stream=True)
            async def query_model(model_handle):
                from ray.serve.llm.openai_api_models import ChatCompletionRequest

                request = ChatCompletionRequest(
                    model="qwen-0.5b",
                    messages=[
                        {
                            "role": "user",
                            "content": "Hello, world!"
                        }
                    ]
                )

                resp = model_handle.chat.remote(request)
                async for message in resp:
                    print("message: ", message)

            asyncio.run(query_model(model_handle))

    Args:
        llm_config: The llm config to build vllm deployment.
        name_prefix: Optional prefix to be used for the deployment name.

    Returns:
        The configured Ray Serve Application for vllm deployment.
    r   )build_llm_deployment)r   r   ) ray.llm._internal.serve.buildersr   )r   r   r   r   r   r   r   C   s   ?r   llm_serving_argsc                 C   s   ddl m} || dS )a  Helper to build an OpenAI compatible app with the llm deployment setup from
    the given llm serving args. This is the main entry point for users to create a
    Serve application serving LLMs.


    Examples:
        .. code-block:: python
            :caption: Example usage in code.

            from ray import serve
            from ray.serve.llm import LLMConfig, LLMServingArgs, build_openai_app

            llm_config1 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            llm_config2 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-1.5b",
                    model_source="Qwen/Qwen2.5-1.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Deploy the application
            llm_app = build_openai_app(
                LLMServingArgs(
                    llm_configs=[
                        llm_config1,
                        llm_config2,
                    ]
                )
            )
            serve.run(llm_app)


            # Querying the model via openai client
            from openai import OpenAI

            # Initialize client
            client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

            # Basic completion
            response = client.chat.completions.create(
                model="qwen-0.5b",
                messages=[{"role": "user", "content": "Hello!"}]
            )

        .. code-block:: yaml
            :caption: Example usage in YAML.

            # config.yaml
            applications:
            - args:
                llm_configs:
                    - model_loading_config:
                        model_id: qwen-0.5b
                        model_source: Qwen/Qwen2.5-0.5B-Instruct
                      accelerator_type: A10G
                      deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
                    - model_loading_config:
                        model_id: qwen-1.5b
                        model_source: Qwen/Qwen2.5-1.5B-Instruct
                      accelerator_type: A10G
                      deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
              import_path: ray.serve.llm:build_openai_app
              name: llm_app
              route_prefix: "/"


    Args:
        llm_serving_args: The list of llm configs or the paths to the llm config to
            build the app.

    Returns:
        The configured Ray Serve Application router.
    r   )build_openai_app)r   )r   r   )r   r   r   r   r   r      s   c
r   c                   @   r   )r	   aB  The implementation of the vLLM engine deployment.

    To build a Deployment object you should use `build_llm_deployment` function.
    We also expose a lower level API for more control over the deployment class
    through `as_deployment` method.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig, LLMServer

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    served_model_name="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=8,
                    )
                ),
            )

            # Build the deployment directly
            LLMDeployment = LLMServer.as_deployment(llm_config.get_serve_options())
            llm_app = LLMDeployment.bind(llm_config)

            model_handle = serve.run(llm_app)

            # Query the model via `chat` api
            from ray.serve.llm.openai_api_models import ChatCompletionRequest
            request = ChatCompletionRequest(
                model="llama-3.1-8b",
                messages=[
                    {
                        "role": "user",
                        "content": "Hello, world!"
                    }
                ]
            )
            response = ray.get(model_handle.chat(request))
            print(response)
    Nr   r   r   r   r   r	      s    0r	   c                   @   r   )r
   a  The implementation of the OpenAI compatiple model router.

    This deployment creates the following endpoints:
      - /v1/chat/completions: Chat interface (OpenAI-style)
      - /v1/completions: Text completion
      - /v1/models: List available models
      - /v1/models/{model}: Model information


    Examples:
        .. testcode::
            :skipif: True


            from ray import serve
            from ray.serve.llm import LLMConfig, LLMServer, LLMRouter
            from ray.serve.llm.openai_api_models import ChatCompletionRequest


            llm_config1 = LLMConfig(
                model_loading_config=dict(
                    served_model_name="llama-3.1-8b",  # Name shown in /v1/models
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=8,
                    )
                ),
            )
            llm_config2 = LLMConfig(
                model_loading_config=dict(
                    served_model_name="llama-3.2-3b",  # Name shown in /v1/models
                    model_source="meta-llama/Llama-3.2-3b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=8,
                    )
                ),
            )

            # Deploy the application
            vllm_deployment1 = LLMServer.as_deployment(llm_config1.get_serve_options()).bind(llm_config1)
            vllm_deployment2 = LLMServer.as_deployment(llm_config2.get_serve_options()).bind(llm_config2)
            llm_app = LLMRouter.as_deployment().bind([vllm_deployment1, vllm_deployment2])
            serve.run(llm_app)
    Nr   r   r   r   r   r
   )  s    1r
   )	r   r   r   r   r   r   r   r	   r
   )r   r   r   r   )typingr   r   -ray.llm._internal.serve.configs.server_modelsr   _CloudMirrorConfigr   
_LLMConfigr   _LLMServingArgsr   _LoraConfigr   _ModelLoadingConfig2ray.llm._internal.serve.deployments.llm.llm_serverr	   
_LLMServer2ray.llm._internal.serve.deployments.routers.routerr
   
_LLMRouterray.util.annotationsr   ray.serve.deploymentr   strr   r   __all__r   r   r   r   <module>   sB    Cl46