o
    Ti*v                     @   s`  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d	d
lmZ d	dlmZ d	dlmZmZ d	dlmZ d	dlmZ d	dlmZ d	dlm Z m!Z!m"Z"m#Z# d dl$m%Z% d	dl&m'Z' d	dl(m)Z) d	dl*m+Z+ d	dl,m-Z-m.Z.m/Z/m0Z0 d	dl1m2Z2 d	dl3m4Z4 da5d dl m6Z6 dZ7G dd de	Z8dS )    N)comm)log_dist)Module)version)TorchCheckpointEngine)SynchronizedWallClockTimer)is_compile_supported   )SDLoaderFactory)WeightQuantization)replace_transformer_layergeneric_injection)init_distributed)PipelineModule)has_moe_layers)LinearAllreduceLinearLayer	NormalizeReplaceWithTensorSlicing)get_accelerator)TransformerPolicy)AutoTP)generic_policies)build_bloom_alibi_tensorbuild_mpt_atten_bias_tensorbuild_mpt_alibi_tensorget_alibi_mask)DeepSpeedSelfAttention)DeepSpeedTransformerInferenceF)nnzmodel-forward-inferencec                       s   e Zd ZdZdZdZ fddZdd Zd<ddZd	d
 Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd=ddZdd  Zd=d!d"Zd>d#d$Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 Zd5d6 Ze  ! i fd?d8d9Z"e#d7e$fd:d;Z%  Z&S )@InferenceEngineNc                    s  da t   tjdur|   || _|| _| | t	| jdr%| j
| _t	| jdr0| jjt_|jt  vrGtd|j dt   d|j| _|jj| _|jj| _d| _d| _d| _d| _d	| _t | _ d}| !| d	| _"g | _#| js|j$r| %  | js|j$s|jj&dkr| '  | (  t  d
kr|j)rt*+t,j-t*+dksJ d|jr| .| | jrt/j0| j1 d|j_&| j1 | _n|jj&dkr| 2| | j|j_t3| jt,j4j5rt6| j\}}nd	}|rt/0 dkr| 7|j8j9 | jrZ|j$rJ d| j: D ]M\}}t;|t,j4j5sJ | dt3|t<r)|f|_=n||_=dd | j> D }|j=D ] t? fdd|D sOtd  dq9| @|| qn7|j$rd| @| n-|jj&dkrtAB|}	tCd|	 |	D ]\}}t3|t<r|f|_=n||_=| @|| qwt D }
t	| jdo| jjEjFdk}|r| jjG|
d n
|jHs| jI|
 |jj&dkrt J It D }t/K|d t L|M  |jj&dkr|j)rJ d| N| j| _Od	| _PdS )zg
        Args:
            model: torch.nn.Module
            config: DeepSpeedInferenceConfig
        TNgenerateconfigz
Data type z is not supported by z accelerator   Fcudaz1.10zEIf you want to use cuda graph, please upgrade torch to at least v1.10)groupzDCannot use both user specified injection policy and kernel injectionz% is not a subclass of torch.nn.Modulec                 S   s   g | ]\}}|qS  r&   ).0name_r&   r&   N/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/inference/engine.py
<listcomp>   s    z,InferenceEngine.__init__.<locals>.<listcomp>c                 3   s    | ]}|  V  qd S N)endswith)r'   r(   policyr&   r*   	<genexpr>   s    z+InferenceEngine.__init__.<locals>.<genexpr>zInjection policy layer'z' not valid.zAutoTP: devicemetar1   r   z1Cuda graph is not supported for model parallelism)QDS_INFERENCE_ENABLEDsuper__init__r   	workspacedestroymodule_config_get_model_config_generatehasattr	_generater!   r"   r   hf_model_configdtyper   supported_dtypes
ValueErrordevice_nameinjection_policyinjection_dicttensor_paralleltp_groupmp_groupmpuquantize_merge_countquantization_scalesep_groupexpert_mp_groupcuda_graph_createdr   checkpoint_engine_init_quantization_settingmodel_profile_enabled_model_timesreplace_with_kernel_injectremove_mask_prepare_for_bloomtp_sizebuild_alibi_tensorbuild_attn_biasenable_cuda_graphpkg_versionparsetorch__version___convert_to_dtypedistget_world_sizeget_model_parallel_group_create_model_parallel_group
isinstancer   r   r   _create_ep_parallel_groupmoemoe_expertsitems
issubclassstrinjection_policy_tuplenamed_modulesany_apply_injection_policyr   	tp_parserprintcurrent_device_namer1   typeto_emptykeep_module_on_hosttoget_rng_state	broadcastset_rng_statecpu_local_cuda_graph_usedlocal_cuda_graph_is_compiled)selfmodelr"   quantization_settingrc   r)   client_modulerC   layer_namesparser_dictr1   is_meta_device
_rng_state	__class__r.   r*   r6   -   s   















zInferenceEngine.__init__c                 C   s*   dt _dt_t j rt j  d t _d S )Nr   )r   layer_idr   
num_layersr7   is_allocatedrelease_workspacerz   r&   r&   r*   r8      s
   


zInferenceEngine.destroyTc                 C   sL   | j s| jjs| j| j | j| j d| _ || _| jr$t	 | _
d S d S )NT)rP   r:   rW   r9   register_forward_pre_hook_pre_forward_hookregister_forward_hook_post_forward_hookuse_cuda_eventsr   timers)rz   r   r&   r&   r*   profile_model_time   s   z"InferenceEngine.profile_model_timec                 C   s*   |j d u rt| jdd | _ d S |j | _ d S )Nr"   )r"   getattrr9   )rz   r"   r&   r&   r*   r;      s   *z*InferenceEngine._get_model_config_generatec                 C   s4   t | jdrt | jjdrdd | jj_d S d S d S )Ntransformer_prepare_attn_maskc                 _   s   | S r,   r&   )attention_maskargskwargsr&   r&   r*   <lambda>   s    z?InferenceEngine.remove_mask_prepare_for_bloom.<locals>.<lambda>)r<   r9   r   r   r   r&   r&   r*   rS      s
   z-InferenceEngine.remove_mask_prepare_for_bloomc                 C   s   t | jdr't | jjdrt| jj_t | jjdr'| jjj| jj_t| jjj_t | jdrDt | jjdrF| jjj	| jj_
t	| jjj_	d S d S d S )Nr   rU   r   r{   r   )r<   r9   r   r   rU   r   build_mpt_alibi_tensor_origr   r{   r   get_alibi_mask_origr   r&   r&   r*   rU      s   
z"InferenceEngine.build_alibi_tensorc                 C   sB   t | jdrt | jjdr| jjj| jj_t| jjj_d S d S d S )Nr   
_attn_bias)r<   r9   r   r   _attn_bias_origr   r   r   r&   r&   r*   rV      s   zInferenceEngine.build_attn_biasc                 O   s0   | j r| t  d S t   t | _d S r,   )r   r   INFERENCE_MODEL_TIMERstartr   synchronizetime_start)rz   r9   inputsr   r&   r&   r*   r      s   
z!InferenceEngine._pre_forward_hookc                 C   s\   | j r| t  | tjdd}nt   t | _| j| j	 d }| j
| d S )NT)reset     @@)r   r   r   stopelapsedr   r   r   _endr   rQ   append)rz   r9   inputoutputelapsed_timer&   r&   r*   r      s   

z"InferenceEngine._post_forward_hookc                 C   sf   t jd u r-t  ttdd}t | dd t|j	j
D }t|| _| jt _d S t j| _d S )N
LOCAL_RANK0c                 S   s   g | ]}|qS r&   r&   )r'   ir&   r&   r*   r+      s    z@InferenceEngine._create_model_parallel_group.<locals>.<listcomp>)r    inference_mp_groupr   intosgetenvr   
set_devicerangerE   rT   r]   	new_grouprG   )rz   r"   
local_rankranksr&   r&   r*   r`      s   
z,InferenceEngine._create_model_parallel_groupc                    sB  i | _ i | _t|tu r|n|g}|D ]}| j |d i | j|d i q| j  D ]st  }t|D ].   }t krGt n}tt||| }t	|}t
 |v re| j |i q7t krt | }t  }	t|D ]!  fddt|	D }
t	|
}t
 |
v r| j|i q|q+d S )Nc                    s   g | ]} |  qS r&   r&   )r'   nrr   moe_ep_sizer&   r*   r+     s    z=InferenceEngine._create_ep_parallel_group.<locals>.<listcomp>)rK   rL   ro   listupdatekeysr]   r^   r   r   get_rank)rz   rd   enum_ep_groupsep_cntsizer   	_ep_groupnum_expert_mp_groupsexpert_mp_sizeexpert_mp_comm_ranks_expert_mp_groupr&   r   r*   rb     s4   

z)InferenceEngine._create_ep_parallel_groupc                 C   sb   d| _ d| _d| _t|tu r|\| _| _n|d ur|| _td| j  d| j d| j dg d S )N   Fr#   zquantize_bits = z mlp_extra_grouping = z, quantize_groups = r   )quantize_bitsmlp_extra_groupingquantize_groupsro   tupler   )rz   r|   r&   r&   r*   rO     s"   
z*InferenceEngine._init_quantization_settingc              
      s   t jjjjd_g   fddtjtjtj	t
tidfdd	| d }| D ]\}}d|v sGd	|v sGd
|v rI|}q7|d urdt|drft|jdrh|jjjrj||j_d S d S d S d S d S )N)rG   mp_sizec                    s  ||i dg g  f}t | drK| jjjr'tjjjtj| jjdd| jjj	d| _d|v r<j
j| jj||d  dd| _n<j
| jj||d  | _n-| jjjjrhtjjjtj| jjjdd| jjjj	d| j_j
| jjj||d  | j_|d	 jv rt | d
r| jjjjrtjjjtj| jjjdd| jjjj	d| j_j
| jj||d	  | j_d S | jjjrtjjjtj| jjdd| jjj	d| _||d	  }|t  }j
| j|| _d S d S )NTweightrv   r3   )datarequires_gradquery_key_value   )
num_splitsbiasnorm)r<   r   r   is_metarZ   r   	parameter	Parameter
empty_liker   
mp_replacestrided_copycopyr   key_listr   rr   r   rn   )r9   
state_dictprefixr   r   )
error_msgsrz   r&   r*   load2  sL   





 


"
z8InferenceEngine.load_model_with_checkpoint.<locals>.load r   c                    s   |   D ]l\}}|jv r^|| d  t fddjD s qtt| dkrRt| d  dkrRt|jj	dkrRt
|jj	d |jj|jd}t| || |j|| d  q||dkrf|n|| d |d  qd S )N.c                 3   s    | ]} |v V  qd S r,   r&   )r'   itemchecking_keyr&   r*   r0   e  s    z\InferenceEngine.load_model_with_checkpoint.<locals>.load_module_recursive.<locals>.<genexpr>r   r#   )dimr?   eps)named_childrenr   rj   r   lenr   
parametersnumelr   ds_shaper   r?   r   setattrsd)r9   r   levelr(   child)layer_policiesr   load_module_recursiverz   r   r*   r   a  s   
,&zIInferenceEngine.load_model_with_checkpoint.<locals>.load_module_recursivezword_embeddings.zembed_tokens.zwte.lm_headr   )r   r   )r   rG   r:   rE   rT   r   r   Linear	Embedding	LayerNormr   r   named_parametersr<   r   r   r   )rz   r_moduleembedding_weightnpr&   )r   r   r   r   rz   r*   load_model_with_checkpoint-  s2   (z*InferenceEngine.load_model_with_checkpointc                 C   s`   |j }|d urt|| jnd }t| j|j|jd t| jt	j
jr.t|| j||| j d S d S )N)r?   rW   )
checkpointr
   get_sd_loader_jsonrN   r   r9   r?   rW   ra   rZ   r   r   r   r"   )rz   r"   r}   checkpoint_dirr   r&   r&   r*   rk   z  s   z'InferenceEngine._apply_injection_policyc                 C   s.   | j ||dd}dd l}||}|  |S )N*)mp_placeholderr   )_get_ckpt_nameglobsort)rz   checkpoints_pathtagckpt_file_patternr  
ckpt_filesr&   r&   r*   _get_all_ckpt_names  s
   
z#InferenceEngine._get_all_ckpt_namesc                 C   sJ   |d ur|}n| j d u rdn| j  }d|}tj|d| d }|S )Nr   z{:02d}mp_rank_z_model_states.pt)rH   get_model_parallel_rankformatr   pathjoin)rz   r  r  r  mp_rank_strmp_rank	ckpt_namer&   r&   r*   r    s   

zInferenceEngine._get_ckpt_namec              	   C   s:  t | jt}|rtdt |tsQtj|rQ|d u rCtj|d}tj	|rCt
|d}|  }W d    n1 s>w   Y  | ||}t|| j}nt|| j}|d }	t|	tu rtj|	d ddd| _t| j | _| | j td	t|	D ]0}
t rt dkrtd
|
 d tj|	|
 t  ! dd| _t| j | _| | j qd S | j"d u rdn| j"# }|j| j$j%j&||| j$j'tj(u | j)| j*d\}}	}|\| _+| _,t-| j\}}|rddl.m/} d}t |	d tsd}|j0|||	| 1|	 || j| j"| jd | jj2|	| 1|	 |d d S )Nz=pipeline parallelism is currently not supported in inference.latestrcheckpointsr   rv   F)map_locationweights_onlyr#   zloading checkpoint ())is_pipe_parallelquantizer   r   )DeepSpeedEnginenum_expertsT)r   old_moe_loadr{   rH   rN   )r   strict)3ra   r9   r   RuntimeErrordictr   r  isdirr  isfileopenreadstripr	  r
   get_sd_loaderrN   r   ro   r   rZ   r   r   r   r   r   r   r   r]   is_initializedr   rm   r   rB   rH   r  r:   rE   rT   r?   int8r   r   rJ   rI   r   deepspeed.runtime.enginer  load_moe_state_dict_choose_module_keyload_state_dict)rz   load_dirload_module_strictr  r  latest_pathfd	ckpt_list	sd_loaderr   r   r  	load_pathquantize_configrc   r)   r  r  r&   r&   r*   _load_checkpoint  sf   
z InferenceEngine._load_checkpointc                 C   sL   d|v rd|v rJ dd|v sd|v sJ dd|v rdS d|v r$dS d S )Nr9   r{   zFcheckpoint has both 'model' and 'module' keys, not sure how to proceedzMcheckpoint contains neither 'model' or 'module' keys, not sure how to proceedr&   )rz   r   r&   r&   r*   r*    s   z"InferenceEngine._choose_module_keyc                 C   sh   t | jtjjs
d S 	 |jtjkr| j  d S |jtjkr%| j  d S |jtjkr2| j  d S d S r,   )ra   r9   rZ   r   r   r   r   model_quantizerD   r   r   rJ   r?   halfbfloat16float)rz   r"   	quantizerr{   r&   r&   r*   r\     s   z!InferenceEngine._convert_to_dtypec                 O   s   t   }|t    t  | tdD ]
}| j|i |}qW d    n1 s-w   Y  t   | t   | _|| _	|| _
t  | j | j| j	i | j
| _W d    n1 scw   Y  d| _d S )Nr   T)r   Streamwait_streamcurrent_streamstreamr   r9   create_graph_cuda_graphsstatic_inputsstatic_kwargscapture_to_graphstatic_outputrM   )rz   r   r   cuda_streamr   retr&   r&   r*   _create_cuda_graph  s   

z"InferenceEngine._create_cuda_graphc                 O   st   t t|D ]}t|| r| j| ||  q|D ]}t|| r/| j| ||  qt | j	 | j
S r,   )r   r   rZ   	is_tensorr@  copy_rA  r   replay_graphr?  rC  )rz   r   r   r   kr&   r&   r*   _graph_replay  s   zInferenceEngine._graph_replayc                 C   sH   | j sJ d| j}| jjrt| jdkrtdt| j g | _|S )Nzmodel profiling is not enabledr   zModel times are empty and cuda graph is enabled. If this is a GPT-style model this combo is not supported. If this is a BERT-style model this is a bug, please report it. Model type is: )rP   rQ   r:   rW   r   rA   ro   r9   )rz   model_timesr&   r&   r*   rL    s   zInferenceEngine.model_timesc                 C   s$   t D ]}| }||r dS qdS )NTF)r   match_replaced)rz   r9   r/   r&   r&   r*   _module_match  s   
zInferenceEngine._module_matchc                 C   sL   t |tjjr	dS d}|j D ]}t||}| |r#t|dr#d}q|S )NFrW   T)	ra   rZ   r   r   __dict__r   r   rN  r<   )rz   r9   sub_module_cuda_graphr(   
sub_moduler&   r&   r*   rw     s   
z&InferenceEngine._local_cuda_graph_usedc                 O   s   d}| j rt  dkr| jjrt   t }t  dkrD| jjrD| jsD| jr3| j	|i |}n| j
|i | | j	|i |}n| j|i |}| j rf| jjrft   t | d }| j| |S )zExecute forward propagation

        Arguments:
            *inputs: Variable length input list
            **kwargs: variable length keyword arguments
        Nr$   r   )rP   r   rB   r:   rW   r   r   rx   rM   rK  rF  r9   rQ   r   )rz   r   r   r   outputsdurationr&   r&   r*   forward*  s   

zInferenceEngine.forwardc                 O   s   t | jdr| j  d}d|v r|d }t|dd}d|v r#|d }|dkr+tdd|v rV|d  dkrV|d D ]}|jd }|| jjkrUt	d	| d
| jj dq;| jj
|i |S )Nreset_cacher#   generation_config	num_beamszDeepSpeed does not support `num_beams` > 1, if this is important to you please add your request to: https://github.com/deepspeedai/DeepSpeed/issues/2506	input_idsr	   r   zInput with size z exceeds maximum length of z?. Please increase max_tokens in the DeepSpeed Inference Config.)r<   r9   rU  r   NotImplementedErrorr   shaper:   max_out_tokensr  r!   )rz   r   r   rW  
gen_configinput_tensortensor_lengthr&   r&   r*   r=   G  s&   

zInferenceEngine._generatereturnc                 C   sB   t  std| jrdS dtjj_| jjdd|i| d| _dS )zL
        Compile the module using the specified backend and kwargs.
        z4compile is not supported in your version of PyTorch.NFbackendTr&   )	r   r  ry   	deepspeedutilsnvtxenable_nvtxr9   compile)rz   r`  compile_kwargsr&   r&   r*   re  `  s   

zInferenceEngine.compilec                 C   s   | j S r,   )ry   r   r&   r&   r*   is_compiledo  s   zInferenceEngine.is_compiled)Tr,   )TN)r_  N)'__name__
__module____qualname__r   inference_ep_grouprL   r6   r8   r   r;   rS   rU   rV   r   r   r`   rb   rO   r   rk   r	  r  r4  r*  r\   rF  rK  rL  rN  rw   rT  r=   r   get_compile_backendre  propertyboolrg  __classcell__r&   r&   r   r*   r    (   sD     



M

;	
r    )9rZ   r   r   ra  r   r]   deepspeed.utils.loggingr   torch.nn.modulesr   	packagingr   rX   ;deepspeed.runtime.checkpoint_engine.torch_checkpoint_enginer   deepspeed.utils.timerr   deepspeed.runtime.compilerr   runtime.state_dict_factoryr
   runtime.weight_quantizerr   module_injectr   r   	comm.commr   piper   	moe.utilsr   r   r   r   r   deepspeed.acceleratorr   module_inject.policyr   module_inject.auto_tpr   module_inject.replace_policyr   !module_inject.auto_tp_model_utilsr   r   r   r   &ops.transformer.inference.ds_attentionr   1model_implementations.transformers.ds_transformerr   r4   r   r   r    r&   r&   r&   r*   <module>   s:   