o
    `۷i1                     @   s  d dl mZmZmZ d dlmZ d dlmZm	Z
mZmZ d dlmZ d dlmZ d dlmZ d dlmZ er<d dlmZ ed	d
G dd de
Z	ed	d
G dd deZed	d
G dd deZed	d
G dd deZed	d
G dd deZeddddG dd deZeddddG dd deZed	d
ddddd d!dd"ee d#ee d$ee d%eee  d&d'fd(d)Zed	d
d*ed&d'fd+d,Z ed	d
d-ed&d'fd.d/Z!ed	d
ddd0d!dd"ee d$ee d&d'fd1d2Z"ed	d
d3ed&d'fd4d5Z#g d6Z$dS )7    )TYPE_CHECKINGOptionalType)
Deprecated)CloudMirrorConfig	LLMConfig
LoraConfigModelLoadingConfig)LLMServingArgs)OpenAiIngress)	LLMServer)	PublicAPI)Applicationalpha)	stabilityc                   @      e Zd ZdZdS )r   z1The configuration for starting an LLM deployment.N__name__
__module____qualname____doc__ r   r   L/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/serve/llm/__init__.pyr           r   c                   @   r   )r
   z=The configuration for starting an LLM deployment application.Nr   r   r   r   r   r
   '   r   r
   c                   @   r   )r	   z+The configuration for loading an LLM model.Nr   r   r   r   r   r	   .   r   r	   c                   @   r   )r   z@The configuration for mirroring an LLM model from cloud storage.Nr   r   r   r   r   r   5   r   r   c                   @   r   )r   z5The configuration for loading an LLM model with LoRA.Nr   r   r   r   r   r   <   r   r   zray.serve.llm.LLMServerz"ray.serve.llm.deployment.LLMServerF)oldnewerrorc                   @      e Zd ZdS )r   Nr   r   r   r   r   r   r   r   H   s    r   zray.serve.llm.LLMRouterz#ray.serve.llm.ingress.OpenAIIngressc                   @   r   )	LLMRouterNr   r   r   r   r   r   O   s    r   N)name_prefixbind_kwargsoverride_serve_optionsdeployment_cls
llm_configr    r!   r"   r#   returnr   c                C   s   ddl m} || ||||dS )aq  Helper to build a single vllm deployment from the given llm config.

    Examples:
        .. testcode::
            :skipif: True

            from ray import serve
            from ray.serve.llm import LLMConfig, build_llm_deployment

            # Configure the model
            llm_config = LLMConfig(
                model_loading_config=dict(
                    model_id="llama-3.1-8b",
                    model_source="meta-llama/Llama-3.1-8b-instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1,
                        max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Build the deployment
            llm_app = build_llm_deployment(llm_config)

            # Deploy the application
            model_handle = serve.run(llm_app)

            # Querying the model handle
            import asyncio
            model_handle = model_handle.options(stream=True)
            async def query_model(model_handle):
                from ray.serve.llm.openai_api_models import ChatCompletionRequest

                request = ChatCompletionRequest(
                    model="qwen-0.5b",
                    messages=[
                        {
                            "role": "user",
                            "content": "Hello, world!"
                        }
                    ]
                )

                resp = model_handle.chat.remote(request)
                async for message in resp:
                    print("message: ", message)

            asyncio.run(query_model(model_handle))

    Args:
        llm_config: The llm config to build vllm deployment.
        name_prefix: Optional prefix to be used for the deployment name.
        bind_kwargs: Optional kwargs to pass to the deployment.
        override_serve_options: Optional serve options to override the original serve options based on the llm_config.
        deployment_cls: Optional deployment class to use.

    Returns:
        The configured Ray Serve Application for vllm deployment.
    r   )build_llm_deployment)r$   r    r!   r"   r#   )+ray.llm._internal.serve.core.server.builderr&   )r$   r    r!   r"   r#   r&   r   r   r   r&   ]   s   Gr&   llm_serving_argsc                 C      ddl m} || dS )a  Helper to build an OpenAI compatible app with the llm deployment setup from
    the given llm serving args. This is the main entry point for users to create a
    Serve application serving LLMs.


    Examples:
        .. code-block:: python
            :caption: Example usage in code.

            from ray import serve
            from ray.serve.llm import LLMConfig, LLMServingArgs, build_openai_app

            llm_config1 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            llm_config2 = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-1.5b",
                    model_source="Qwen/Qwen2.5-1.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Deploy the application
            llm_app = build_openai_app(
                LLMServingArgs(
                    llm_configs=[
                        llm_config1,
                        llm_config2,
                    ]
                )
            )
            serve.run(llm_app)


            # Querying the model via openai client
            from openai import OpenAI

            # Initialize client
            client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

            # Basic completion
            response = client.chat.completions.create(
                model="qwen-0.5b",
                messages=[{"role": "user", "content": "Hello!"}]
            )

        .. code-block:: yaml
            :caption: Example usage in YAML.

            # config.yaml
            applications:
            - args:
                llm_configs:
                    - model_loading_config:
                        model_id: qwen-0.5b
                        model_source: Qwen/Qwen2.5-0.5B-Instruct
                      accelerator_type: A10G
                      deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
                    - model_loading_config:
                        model_id: qwen-1.5b
                        model_source: Qwen/Qwen2.5-1.5B-Instruct
                      accelerator_type: A10G
                      deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
              import_path: ray.serve.llm:build_openai_app
              name: llm_app
              route_prefix: "/"


    Args:
        llm_serving_args: A dict that conforms to the LLMServingArgs pydantic model.

    Returns:
        The configured Ray Serve Application router.
    r   )build_openai_appbuilder_config),ray.llm._internal.serve.core.ingress.builderr*   )r(   r*   r   r   r   r*      s   b
r*   pd_serving_argsc                 C   r)   )a$
  Build a deployable application utilizing P/D disaggregation.


    Examples:
        .. code-block:: python
            :caption: Example usage in code.

            from ray import serve
            from ray.serve.llm import LLMConfig, build_pd_openai_app

            config = LLMConfig(
                model_loading_config=dict(
                    model_id="qwen-0.5b",
                    model_source="Qwen/Qwen2.5-0.5B-Instruct",
                ),
                deployment_config=dict(
                    autoscaling_config=dict(
                        min_replicas=1, max_replicas=2,
                    )
                ),
                accelerator_type="A10G",
            )

            # Deploy the application
            llm_app = build_pd_openai_app(
                dict(
                    prefill_config=config,
                    decode_config=config,
                )
            )

            serve.run(llm_app)


            # Querying the model via openai client
            from openai import OpenAI

            # Initialize client
            client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

            # Basic completion
            response = client.chat.completions.create(
                model="qwen-0.5b",
                messages=[{"role": "user", "content": "Hello!"}]
            )

        .. code-block:: yaml
            :caption: Example usage in YAML.

            # config.yaml
            applications:
            - args:
                prefill_config:
                    model_loading_config:
                        model_id: qwen-0.5b
                        model_source: Qwen/Qwen2.5-0.5B-Instruct
                    accelerator_type: A10G
                    deployment_config:
                        autoscaling_config:
                            min_replicas: 1
                            max_replicas: 2
                decode_config:
                    model_loading_config:
                    model_id: qwen-1.5b
                    model_source: Qwen/Qwen2.5-1.5B-Instruct
                    accelerator_type: A10G
                    deployment_config:
                    autoscaling_config:
                        min_replicas: 1
                        max_replicas: 2
              import_path: ray.serve.llm:build_pd_openai_app
              name: llm_app
              route_prefix: "/"


    Args:
        pd_serving_args: The dictionary containing prefill and decode configs. See PDServingArgs for more details.

    Returns:
        The configured Ray Serve Application router.
    r   )build_pd_openai_app)r.   )?ray.llm._internal.serve.serving_patterns.prefill_decode.builderr/   )r.   r/   r   r   r   r/     s   S
r/   )r    r"   c                C   s   ddl m} || ||dS )aq  Build a data parallel attention LLM deployment.

    Args:
        llm_config: The LLM configuration.
        name_prefix: The prefix to add to the deployment name.
        override_serve_options: The optional serve options to override the
            default options.

    Returns:
        The Ray Serve Application for the data parallel attention LLM deployment.
    r   )build_dp_deployment)r$   r    r"   )>ray.llm._internal.serve.serving_patterns.data_parallel.builderr1   )r$   r    r"   r1   r   r   r   r1   t  s   r1   dp_serving_argsc                 C   r)   )a@  Build an OpenAI compatible app with the DP attention deployment
    setup from the given builder configuration.

    Args:
        dp_serving_args: The configuration for the builder. It has to conform
            to the DPOpenAiServingArgs pydantic model.

    Returns:
        The configured Ray Serve Application.
    r   )build_dp_openai_appr+   )r2   r4   )r3   r4   r   r   r   r4     s   
r4   )r   r
   r	   r   r   r&   r*   r/   r1   r4   r   r   )%typingr   r   r   ray._common.deprecationr   /ray.llm._internal.serve.core.configs.llm_configr   _CloudMirrorConfigr   
_LLMConfigr   _LoraConfigr	   _ModelLoadingConfigr-   r
   _LLMServingArgs,ray.llm._internal.serve.core.ingress.ingressr   _OpenAiIngress.ray.llm._internal.serve.core.server.llm_serverr   
_LLMServerray.util.annotationsr   ray.serve.deploymentr   r   strdictr&   r*   r/   r1   r4   __all__r   r   r   r   <module>   s    	
ShY