o
    ۷iZ!                     @   s   d dl Z d dlmZmZ d dlmZ d dlmZ d dlZddl	m
Z
 ddlmZ e
dZd	eejeeef fd
dZG dd deZeG dd dZeG dd dZdS )    N)	dataclassfield)Enum)Optional   )logging)tracedContinuousBatchingLoggerreturnc                  C   s   t j r(t d} t j  t j  t j| j}t j| }t j	| }n+t j
j rHt j
j rHt d} t j }|t j  }d}nt d} d }d}d}| |||fS )Ncudampsr   cpu)torchr   is_availabledeviceempty_cachesynchronizeget_device_propertiestotal_memorymemory_reservedmemory_allocatedbackendsr   is_builtdriver_allocated_memoryrecommended_max_memory)r   r   reserved_memoryallocated_memory r   j/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/generation/continuous_batching/requests.pyget_device_and_memory_breakdown   s"   






r   c                   @   s,   e Zd ZdZdZdZdZdZdZdZ	dZ
d	S )
RequestStatusz5Status of a generation request through its lifecycle.pending
prefillingprefilling_splitsplit_pending_remainderdecodingfinishedfailedN)__name__
__module____qualname____doc__PENDING
PREFILLINGPREFILLING_SPLITSPLIT_PENDING_REMAINDERDECODINGFINISHEDFAILEDr   r   r   r   r    5   s    r    c                   @   s   e Zd ZU dZeed< eedZee	 ed< eedZ
ee	 ed< eedZee ed< dZee ed< ejZeed	< eejdZeed
< dS )GenerationOutputa5  Tracks the output of a generation request.

    Attributes:
        request_id (str): The ID of the generation request.
        prompt_ids (list[int]): The IDs of the prompt tokens.
        generated_tokens (list[int]): The generated tokens.
        logprobs (list[float]): The log probabilities of the generated tokens.
        error (Optional[str]): Any error message associated with the request. When None, the request was successful.
        status (RequestStatus): The status of the request.
        created_time (float): The time the request was created.
    
request_iddefault_factory
prompt_idsgenerated_tokenslogprobsNerrorstatuscreated_time)r(   r)   r*   r+   str__annotations__r   listr7   intr8   r9   floatr:   r   r    r,   r;   timer<   r   r   r   r   r3   A   s   
 r3   c                   @   sT  e Zd ZU dZeed< dZeee	  ed< dZ
eee	  ed< eedZee	 ed< eedZee	 ed< d	Ze	ed
< d	Ze	ed< ejZeed< dZe	ed< dZe	ed< eejdZeed< dZee ed< dZeeef ed< edefddZejdefddZdd Zde	fddZde	fddZ e!d e	de"fd!d"Z#d#d$ Z$d%d& Z%dS )'RequestStateay  Tracks the state of a generation request through its lifecycle.

    Attributes:
        request_id (str): The ID of the generation request.
        full_prompt_ids (list[int] | None): The tokens IDs of the full prompt.
        prompt_ids (list[int] | None): The tokens IDs currently being processed.
        remaining_prompt_ids (list[int]): The tokens IDs remaining to be processed (for split requests).
        static_outputs (list[int]): The generated tokens.
        allocated_blocks (int): The number of blocks allocated to the request.
        position_offset (int): The current position in the sequence for position_ids.
        status (RequestStatus): The status of the request: can be one of PENDING, PREFILLING, PREFILLING_SPLIT,
                                SPLIT_PENDING_REMAINDER, DECODING, FINISHED, FAILED
        max_new_tokens (int): The maximum number of new tokens to generate.
        eos_token_id (int): The ID of the end-of-sequence token.
        created_time (float): The time the request was created.
        error (Optional[str]): Any error message associated with the request. When None, has had no error yet.
    r4   Nfull_prompt_idsr7   r5   remaining_prompt_idsstatic_outputsr   allocated_blocksposition_offset_status   max_new_tokenseos_token_idr<   r:   )rL   rL   lifespanr
   c                 C      | j S )N)rI   selfr   r   r   r;   {      zRequestState.statusvaluec                 C   sL   | j tjkrt df| _n|tjkr!| jd t f| _|   || _ d S )NrL   r   )rI   r    r,   rB   rN   r1   log_end_of_request)rQ   rS   r   r   r   r;      s   

c                 C   s`   t | j}|  }| jd | j }| jd | j }td| j d|d|d|d|
 d S )Nr      zRequest z finished: prefill_len = z decode_len = z start_time = z end_time = )lenrD   generated_lenrN   r<   loggerinfor4   )rQ   prefill_len
decode_len
start_timeend_timer   r   r   rT      s   
"zRequestState.log_end_of_requestc                 C   rO   )zCGet the current length of the sequence (prompt + generated tokens).)rH   rP   r   r   r   current_len   rR   zRequestState.current_lenc                 C   s
   t | jS )z*Get the number of tokens generated so far.)rV   rF   rP   r   r   r   rW      s   
zRequestState.generated_lentoken_idc                 C   s`   | j tjkrdS || jko| jdk}|  | jk}|r|r$| j|g |s(|r.tj| _ dS dS )zUpdate the request with a newly generated token and check for completion.

        Args:
            token_id: The token ID to add to the output sequence

        Returns:
            bool: True if the request is now complete, False otherwise
        FrL   T)	r;   r    r0   rM   rW   rK   rF   extendr1   )rQ   r_   is_eos
is_max_lenr   r   r   update_with_token   s   zRequestState.update_with_tokenc              
   C   s~   d| j  d| j d|   dt| j dt| j d| j dt| j d| j d	| j	 g	}d
d
| d S )Nzrequest_id=zstatus=zout_tokens=zquery_length=zremaining_tokens=z
kv_length=zfull_prompt_length=zallocated_blocks=zgenerated_tokens=zRequestState(
	z,
	z
))r4   rI   rW   rV   r7   rE   rH   rD   rG   rF   join)rQ   msgr   r   r   __repr__   s   




zRequestState.__repr__c                 C   s   t | j| j| j| jg | jdS )z7Convert the request state to a GenerationOutput object.)r4   r7   r;   r8   r9   r:   )r3   r4   rD   r;   rF   r:   rP   r   r   r   to_generation_output   s   z!RequestState.to_generation_output)&r(   r)   r*   r+   r=   r>   rD   r   r?   r@   r7   r   rE   rF   rG   rH   r    r,   rI   rK   rM   rB   r<   rA   r:   rN   tuplepropertyr;   setterrT   r^   rW   r   boolrc   rf   rg   r   r   r   r   rC   X   s4   
 	rC   )rB   dataclassesr   r   enumr   typingr   r   utils.loggingr   utils.metricsr   	getLoggerrX   rh   r   r@   r   r    r3   rC   r   r   r   r   <module>   s   
