o
    ॵi                     @   s<   d dl mZmZ d dlmZ d dlmZ G dd deZdS )    )ListUnion)InferFramework)is_vllm_availablec                	       sn   e Zd Z			ddedededef fdd	Zd
eee eee  f dee fddZdefddZ	  Z
S )VllmautoN   model_id_or_dirdtypequantizationtensor_parallel_sizec                    sV   t  | t stdddlm} tds|dv rd}|| j||d|d| _	d	S )
a  
        Args:
            dtype: The dtype to use, support `auto`, `float16`, `bfloat16`, `float32`
            quantization: The quantization bit, default None means do not do any quantization.
            tensor_parallel_size: The tensor parallel size.
        zLInstall vllm by `pip install vllm` before using vllm to accelerate inferencer   )LLM   )bfloat16r   float16T)r
   r   trust_remote_coder   N)
super__init__r   ImportErrorvllmr   r   check_gpu_compatibility	model_dirmodel)selfr	   r
   r   r   r   	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/pipelines/accelerate/vllm.pyr   	   s"   zVllm.__init__promptsreturnc           	      K   s   | dd}| dd}| dd}| dd}|s"|dkr"d|d< |r.|t|d	  |d
< |r4||d
< d	dlm} |di |}t|d	 trUdd | jj||dD S dd | jj||dD S )zGenerate tokens.
        Args:
            prompts(`Union[List[str], List[List[int]]]`):
                The string batch or the token list batch to input to the model.
            kwargs: Sampling parameters.
        	do_sampleNnum_beamr   
max_lengthmax_new_tokensTuse_beam_searchr   
max_tokens)SamplingParamsc                 S      g | ]}|j d  jqS r   outputstext.0outputr   r   r   
<listcomp>B       z!Vllm.__call__.<locals>.<listcomp>)sampling_paramsc                 S   r'   r(   r)   r,   r   r   r   r/   G   r0   )prompt_token_idsr1   r   )poplenr   r&   
isinstancestrr   generate)	r   r   kwargsr    r!   r"   r#   r&   r1   r   r   r   __call__%   s.   
zVllm.__call__
model_typec                    s   t  fdddD S )Nc                    s   g | ]}|   v qS r   )lower)r-   r   r:   r   r   r/   M   r0   z-Vllm.model_type_supported.<locals>.<listcomp>)llamabaichuaninternlmmistralaquilabloomfalcongptmptoptqwenrA   )any)r   r:   r   r<   r   model_type_supportedL   s   zVllm.model_type_supported)r   Nr   )__name__
__module____qualname__r6   intr   r   r   r9   rI   __classcell__r   r   r   r   r      s"    
'r   N)typingr   r   $modelscope.pipelines.accelerate.baser   modelscope.utils.import_utilsr   r   r   r   r   r   <module>   s    