o
    ei8                     @   s   d dl Z d dlmZmZ d dlmZ d dlZddlmZm	Z	 ddl
mZ ddlmZ e r1d dlZdZed	Zd
eejeeef fddZG dd deZeG dd dZeG dd dZdS )    N)	dataclassfield)Enum   )is_psutil_availableis_torch_xpu_available)logging)tracedContinuousBatchingLoggerreturnc                  C   s,  t j r(t d} t j  t j  t j| j}t j| }t j	| }nht
 rNt d} t j  t j  t j| j}t j| }t j	| }nBt jj rnt jj rnt d} t j }|t j  }d}n"t d} t rt j}t  j}|}ntd d}d}d}| |||fS )Ncudaxpumpsr   cpuzCannot get memory breakdown on CPU without psutil: returning 0 for all memory values. Please install psutil to get an actual memory breakdown.)torchr   is_availabledeviceempty_cachesynchronizeget_device_propertiestotal_memorymemory_reservedmemory_allocatedr   r   backendsr   is_builtdriver_allocated_memoryrecommended_max_memoryr   psutilvirtual_memorytotalProcessmemory_inforssloggererror)r   r   reserved_memoryallocated_memory r(   r/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/generation/continuous_batching/requests.pyget_device_and_memory_breakdown$   s>   










r*   c                   @   s,   e Zd ZdZdZdZdZdZdZdZ	dZ
d	S )
RequestStatusz5Status of a generation request through its lifecycle.pending
prefillingprefilling_splitsplit_pending_remainderdecodingfinishedfailedN)__name__
__module____qualname____doc__PENDING
PREFILLINGPREFILLING_SPLITSPLIT_PENDING_REMAINDERDECODINGFINISHEDFAILEDr(   r(   r(   r)   r+   K   s    r+   c                   @   s   e Zd ZU dZeed< eedZee	 ed< eedZ
ee	 ed< eedZee ed< dZedB ed< ejZeed	< eejdZeed
< dZeeef ed< dZee dB ed< defddZdS )GenerationOutputa  Tracks the output of a generation request.

    Attributes:
        request_id (str): The ID of the generation request.
        prompt_ids (list[int]): The IDs of the prompt tokens.
        generated_tokens (list[int]): The generated tokens.
        logprobs (list[float]): The log probabilities of the generated tokens.
        error (Optional[str]): Any error message associated with the request. When None, the request was successful.
        status (RequestStatus): The status of the request.
        created_time (float): The time the request was created.
        lifespan (tuple[float, float]): The time the request was no longer pending and the time the request finished.
    
request_iddefault_factory
prompt_idsgenerated_tokenslogprobsNr%   statuscreated_timer
   r
   lifespan
timestampsr   c                 C   s   | j tjkS N)rE   r+   r<   selfr(   r(   r)   is_finishedp   s   zGenerationOutput.is_finished)r3   r4   r5   r6   str__annotations__r   listrB   intrC   rD   floatr%   r+   r7   rE   timeperf_counterrF   rH   tuplerI   boolrM   r(   r(   r(   r)   r>   W   s   
 r>   c                   @   s  e Zd ZU dZeed< ee ed< dZe	ed< dZ
eed< eedZee ed	< eedZee ed
< eedZee ed< dZeed< dZeed< ejZeed< dZedB ed< dZeed< dZe	ed< eejdZeed< dZedB ed< dZeeef ed< eedZee ed< dZeed< dZ eed< dd Z!e"defd d!Z#e#j$d"efd#d!Z#e"dee dB fd$d%Z%d&d' Z&defd(d)Z'defd*d+Z(e)d,ede	fd-d.Z*d/d0 Z+d1d2 Z,d3edd fd4d5Z-d8d6d7Z.dS )9RequestStateaC  Tracks the state of a generation request through its lifecycle.

    Attributes:
        request_id (str): The ID of the generation request.
        initial_tokens (list[int]): The initial prompt tokens.
        num_children (int): The number of children requests
        full_prompt_ids (list[int] | None): The tokens IDs of the full prompt.
        prompt_ids (list[int] | None): The tokens IDs currently being processed.
        remaining_prompt_ids (list[int]): The tokens IDs remaining to be processed (for split requests).
        static_outputs (list[int]): The generated tokens.
        allocated_blocks (int): The number of blocks allocated to the request.
        position_offset (int): The current position in the sequence for position_ids.
        status (RequestStatus): The status of the request: can be one of PENDING, PREFILLING, PREFILLING_SPLIT,
                                SPLIT_PENDING_REMAINDER, DECODING, FINISHED, FAILED
        max_new_tokens (int | None): The maximum number of new tokens to generate.
        eos_token_id (int): The ID of the end-of-sequence token.
        streaming (bool): Whether to stream tokens as they're generated
        created_time (float): The time the request was created.
        error (Optional[str]): Any error message associated with the request. When None, has had no error yet.
    r?   initial_tokensFrecord_timestampsr   num_childrenr@   tokens_to_processremaining_prefill_tokensrC   allocated_blocksposition_offset_status   Nmax_new_tokensr
   eos_token_id	streamingrF   r%   rG   rH   _timestamps_true_initial_tokens_new_tokens_limitc                 C   s    | j d u r
d| _d S | j | _d S )Nrf   )ra   rg   rK   r(   r(   r)   __post_init__   s    zRequestState.__post_init__r   c                 C      | j S rJ   )r_   rK   r(   r(   r)   rE         zRequestState.statusvaluec                 C   sL   | j tjkrt df| _n|tjkr!| jd t f| _|   || _ d S )Nr
   r   )r_   r+   r7   rS   rT   rH   r<   log_end_of_request)rL   rk   r(   r(   r)   rE      s   

c                 C   s   | j r| jS d S rJ   )rY   rd   rK   r(   r(   r)   rI      s   zRequestState.timestampsc                 C   s`   t | j}|  }| jd | j }| jd | j }td| j d|d|d|d|
 d S )Nr      Request z finished: prefill_len = z decode_len = z start_time = z end_time = )lenrX   generated_lenrH   rF   r$   infor?   )rL   prefill_len
decode_len
start_timeend_timer(   r(   r)   rl      s   
"zRequestState.log_end_of_requestc                 C   ri   )zCGet the current length of the sequence (prompt + generated tokens).)r^   rK   r(   r(   r)   current_len   rj   zRequestState.current_lenc                 C   s
   t | jS )z*Get the number of tokens generated so far.)ro   rC   rK   r(   r(   r)   rp      s   
zRequestState.generated_lentoken_idc                 C   s   | j tjkrdS | jr| jt  || jko| jdk}| 	 d }|s*|| j
k r4|| jd< |d7 }ntd| j d|  | j  |sL|| j
krRtj| _ dS dS )zUpdate the request with a newly generated token and check for completion.

        Args:
            token_id: The token ID to add to the output sequence

        Returns:
            bool: True if the request is now complete, False otherwise
        Fr
   rm   rn   z generated a useless token: T)rE   r+   r;   rY   rd   appendrS   rT   rb   rp   rg   rC   r$   warningr?   popr<   )rL   rw   is_eosrv   r(   r(   r)   update_and_check_completion   s   


z(RequestState.update_and_check_completionc              
   C   s~   d| j  d| j d|   dt| j dt| j d| j dt| j d| j d	| j	 g	}d
d
| d S )Nzrequest_id=zstatus=zout_tokens=zquery_length=zremaining_tokens=z
kv_length=zfull_prompt_length=zallocated_blocks=zgenerated_tokens=zRequestState(
	z,
	z
))r?   r_   rp   ro   r[   r\   r^   rX   r]   rC   join)rL   msgr(   r(   r)   __repr__   s   




zRequestState.__repr__c                 C   sx   | j r| j d tkr| j   | jr'| j| jd | j  | _ | jd| j | _t| j| j| j g | j| j| j	| j
| jd	S )z7Convert the request state to a GenerationOutput object.r
   N)	r?   rB   rC   rD   r%   rE   rF   rH   rI   )rC   TMP_TOKEN_IDrz   re   rX   r>   r?   r%   rE   rF   rH   rI   rK   r(   r(   r)   to_generation_output   s    
z!RequestState.to_generation_outputnew_request_idc                 C   s   t  }tdi d|d| jd| jd| jdd d| jdd d| jdd d| jd	| j	d
| j
d| jd| jd| jd|d|dfdg d| jd| j}|S )ziFork the request into a new request with the same state expect for request_id, created_time and lifespan.r?   rX   rZ   r[   Nr\   rC   r]   r^   r_   ra   rb   rc   rF   rH   r
   rd   r%   rY   r(   )rS   rT   rW   rX   rZ   r[   r\   rC   r]   r^   rE   ra   rb   rc   r%   rY   )rL   r   tnew_requestr(   r(   r)   fork  sJ   	
zRequestState.forkc              
   C   s   | j r| j d tkr| j   | jdu rdn| jt| j  }t| j| j| j  | j| j	| j| j  || j
| jd}| jt| j |_|S )a  Creates an equivalent new request by removing the generated tokens and adding them to the initial prompt. The
        created request has THE SAME request_id. Notably, we can retrieve the original request from the created one with
        the _true_initial_tokens attribute.r
   N)r?   rX   rZ   rY   r[   ra   rb   rc   )rC   r   rz   ra   ro   rW   r?   rX   rZ   rY   rb   rc   re   )rL   ra   	new_stater(   r(   r)   !create_equivalent_initial_request'  s   



z.RequestState.create_equivalent_initial_request)r   rW   )/r3   r4   r5   r6   rN   rO   rP   rQ   rY   rV   rZ   r   r[   r\   rC   r]   r^   r+   r7   r_   ra   rb   rc   rS   rT   rF   rR   r%   rH   rU   rd   re   rg   rh   propertyrE   setterrI   rl   rv   rp   r	   r|   r   r   r   r   r(   r(   r(   r)   rW   t   sJ   
 	#rW   )rS   dataclassesr   r   enumr   r   utilsr   r   utils.loggingr   utils.metricsr	   r   r   	getLoggerr$   rU   r   rQ   r*   r+   r>   rW   r(   r(   r(   r)   <module>   s"   
'