o
    `Û·iú1  ã                   @   s  d dl mZmZmZ d dlmZ d dlmZm	Z
mZmZ d dlmZ d dlmZ d dlmZ d dlmZ er<d dlmZ ed	d
G dd„ de
ƒƒZ	ed	d
G dd„ deƒƒZed	d
G dd„ deƒƒZed	d
G dd„ deƒƒZed	d
G dd„ deƒƒZeddddG dd„ deƒƒZeddddG dd„ deƒƒZed	d
ddddd œd!dd"ee d#ee d$ee d%eee  d&d'fd(d)„ƒZed	d
d*ed&d'fd+d,„ƒZ ed	d
d-ed&d'fd.d/„ƒZ!ed	d
ddd0œd!dd"ee d$ee d&d'fd1d2„ƒZ"ed	d
d3ed&d'fd4d5„ƒZ#g d6¢Z$dS )7é    )ÚTYPE_CHECKINGÚOptionalÚType)Ú
Deprecated)ÚCloudMirrorConfigÚ	LLMConfigÚ
LoraConfigÚModelLoadingConfig)ÚLLMServingArgs)ÚOpenAiIngress)Ú	LLMServer)Ú	PublicAPI)ÚApplicationÚalpha)Ú	stabilityc                   @   ó   e Zd ZdZdS )r   z1The configuration for starting an LLM deployment.N©Ú__name__Ú
__module__Ú__qualname__Ú__doc__© r   r   úL/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/serve/llm/__init__.pyr       ó    r   c                   @   r   )r
   z=The configuration for starting an LLM deployment application.Nr   r   r   r   r   r
   '   r   r
   c                   @   r   )r	   z+The configuration for loading an LLM model.Nr   r   r   r   r   r	   .   r   r	   c                   @   r   )r   z@The configuration for mirroring an LLM model from cloud storage.Nr   r   r   r   r   r   5   r   r   c                   @   r   )r   z5The configuration for loading an LLM model with LoRA.Nr   r   r   r   r   r   <   r   r   zray.serve.llm.LLMServerz"ray.serve.llm.deployment.LLMServerF)ÚoldÚnewÚerrorc                   @   ó   e Zd ZdS )r   N©r   r   r   r   r   r   r   r   H   s    r   zray.serve.llm.LLMRouterz#ray.serve.llm.ingress.OpenAIIngressc                   @   r   )Ú	LLMRouterNr   r   r   r   r   r   O   s    r   N)Úname_prefixÚbind_kwargsÚoverride_serve_optionsÚdeployment_clsÚ
llm_configr    r!   r"   r#   Úreturnr   c                C   s   ddl m} || ||||dS )aq  Helper to build a single vllm deployment from the given llm config.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig, build_llm_deployment

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    model_id="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Build the deployment
            llm_app = build_llm_deployment(llm_config)

            # Deploy the application
            model_handle = serve.run(llm_app)

            # Querying the model handle
            import asyncio
            model_handle = model_handle.options(stream=True)
            async def query_model(model_handle):
                from ray.serve.llm.openai_api_models import ChatCompletionRequest

                request = ChatCompletionRequest(
                    model="qwen-0.5b",
                    messages=[
                        {
                            "role": "user",
                            "content": "Hello, world!"
                        }
                    ]
                )

                resp = model_handle.chat.remote(request)
                async for message in resp:
                    print("message: ", message)

            asyncio.run(query_model(model_handle))

    Args:
        llm_config: The llm config to build vllm deployment.
        name_prefix: Optional prefix to be used for the deployment name.
        bind_kwargs: Optional kwargs to pass to the deployment.
        override_serve_options: Optional serve options to override the original serve options based on the llm_config.
        deployment_cls: Optional deployment class to use.

    Returns:
        The configured Ray Serve Application for vllm deployment.
    r   )Úbuild_llm_deployment)r$   r    r!   r"   r#   )Ú+ray.llm._internal.serve.core.server.builderr&   )r$   r    r!   r"   r#   r&   r   r   r   r&   ]   s   Gûr&   Úllm_serving_argsc                 C   ó   ddl m} || dS )a†  Helper to build an OpenAI compatible app with the llm deployment setup from
    the given llm serving args. This is the main entry point for users to create a
    Serve application serving LLMs.


    Examples:
        .. code-block:: python
            :caption: Example usage in code.

            from ray import serve
            from ray.serve.llm import LLMConfig, LLMServingArgs, build_openai_app

            llm_config1 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            llm_config2 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-1.5b",
                    model_source="Qwen/Qwen2.5-1.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Deploy the application
            llm_app = build_openai_app(
                LLMServingArgs(
                    llm_configs=[
                        llm_config1,
                        llm_config2,
                    ]
                )
            )
            serve.run(llm_app)


            # Querying the model via openai client
            from openai import OpenAI

            # Initialize client
            client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

            # Basic completion
            response = client.chat.completions.create(
                model="qwen-0.5b",
                messages=[{"role": "user", "content": "Hello!"}]
            )

        .. code-block:: yaml
            :caption: Example usage in YAML.

            # config.yaml
            applications:
            - args:
                llm_configs:
                    - model_loading_config:
                        model_id: qwen-0.5b
                        model_source: Qwen/Qwen2.5-0.5B-Instruct
                      accelerator_type: A10G
                      deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
                    - model_loading_config:
                        model_id: qwen-1.5b
                        model_source: Qwen/Qwen2.5-1.5B-Instruct
                      accelerator_type: A10G
                      deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
              import_path: ray.serve.llm:build_openai_app
              name: llm_app
              route_prefix: "/"


    Args:
        llm_serving_args: A dict that conforms to the LLMServingArgs pydantic model.

    Returns:
        The configured Ray Serve Application router.
    r   )Úbuild_openai_app©Úbuilder_config)Ú,ray.llm._internal.serve.core.ingress.builderr*   )r(   r*   r   r   r   r*   ±   s   b
r*   Úpd_serving_argsc                 C   r)   )a$
  Build a deployable application utilizing P/D disaggregation.


    Examples:
        .. code-block:: python
            :caption: Example usage in code.

            from ray import serve
            from ray.serve.llm import LLMConfig, build_pd_openai_app

            config = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Deploy the application
            llm_app = build_pd_openai_app(
                dict(
                    prefill_config=config,
                    decode_config=config,
                )
            )

            serve.run(llm_app)


            # Querying the model via openai client
            from openai import OpenAI

            # Initialize client
            client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

            # Basic completion
            response = client.chat.completions.create(
                model="qwen-0.5b",
                messages=[{"role": "user", "content": "Hello!"}]
            )

        .. code-block:: yaml
            :caption: Example usage in YAML.

            # config.yaml
            applications:
            - args:
                prefill_config:
                    model_loading_config:
                        model_id: qwen-0.5b
                        model_source: Qwen/Qwen2.5-0.5B-Instruct
                    accelerator_type: A10G
                    deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
                decode_config:
                    model_loading_config:
                    model_id: qwen-1.5b
                    model_source: Qwen/Qwen2.5-1.5B-Instruct
                    accelerator_type: A10G
                    deployment_config:
                    autoscaling_config:
                        min_replicas: 1
                        max_replicas: 2
              import_path: ray.serve.llm:build_pd_openai_app
              name: llm_app
              route_prefix: "/"


    Args:
        pd_serving_args: The dictionary containing prefill and decode configs. See PDServingArgs for more details.

    Returns:
        The configured Ray Serve Application router.
    r   )Úbuild_pd_openai_app)r.   )Ú?ray.llm._internal.serve.serving_patterns.prefill_decode.builderr/   )r.   r/   r   r   r   r/     s   S
r/   )r    r"   c                C   s   ddl m} || ||dS )aq  Build a data parallel attention LLM deployment.

    Args:
        llm_config: The LLM configuration.
        name_prefix: The prefix to add to the deployment name.
        override_serve_options: The optional serve options to override the
            default options.

    Returns:
        The Ray Serve Application for the data parallel attention LLM deployment.
    r   )Úbuild_dp_deployment)r$   r    r"   )Ú>ray.llm._internal.serve.serving_patterns.data_parallel.builderr1   )r$   r    r"   r1   r   r   r   r1   t  s   ýr1   Údp_serving_argsc                 C   r)   )a@  Build an OpenAI compatible app with the DP attention deployment
    setup from the given builder configuration.

    Args:
        dp_serving_args: The configuration for the builder. It has to conform
            to the DPOpenAiServingArgs pydantic model.

    Returns:
        The configured Ray Serve Application.
    r   )Úbuild_dp_openai_appr+   )r2   r4   )r3   r4   r   r   r   r4   ‘  s   
r4   )r   r
   r	   r   r   r&   r*   r/   r1   r4   r   r   )%Útypingr   r   r   Úray._common.deprecationr   Ú/ray.llm._internal.serve.core.configs.llm_configr   Ú_CloudMirrorConfigr   Ú
_LLMConfigr   Ú_LoraConfigr	   Ú_ModelLoadingConfigr-   r
   Ú_LLMServingArgsÚ,ray.llm._internal.serve.core.ingress.ingressr   Ú_OpenAiIngressÚ.ray.llm._internal.serve.core.server.llm_serverr   Ú
_LLMServerÚray.util.annotationsr   Úray.serve.deploymentr   r   ÚstrÚdictr&   r*   r/   r1   r4   Ú__all__r   r   r   r   Ú<module>   s„    ÿý	úÿýüû
úùShYüÿýüû