o
    TiQ                     @   s   d dl Z d dlmZ d dlmZ d dlmZ ddlmZ ddl	m
Z
mZ d dlmZ d dlZd dlZd dlZd d	lmZ d d
lmZ d dl mZ d dlmZ d dlmZmZmZmZ ddlmZ zd dl Z e j!j"j#j$Z$W n   dZ$Y G dd deZ%dS )    N)DeepSpeedInferenceConfig)replace_policies)policy_to_ds_container   )DeepSpeedEngine)TLinearget_inactive_params)GatheredParameters)comm)get_accelerator)nn)logger)LinearLayer	NormalizeEmbeddingLayerOPTEmbedding   )WorkspaceOpc                       s   e Zd ZdZdZ fddZdd Zdd Zd	d
 Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zd'ddZdd Zdd Z fd d!Zd( fd#d$	Zd) fd%d&	Z  ZS )*DeepSpeedHybridEnginez,DeepSpeed engine for training and inference.Nc                    s   t  j||fi | t  t  }t|d t |	  | j
jjdk| _| j
jj| _g | _g | _g | _|   d | _d| _d| _d | _d| _d| _d | _d| _d| _t | _d S )Nr      F) super__init__r   get_rng_statetocurrent_device_namedist	broadcastset_rng_statecpu_configzero_configstage
Z3_enabledhybrid_enginepin_parametersgather_all_layers_inference_containers_orig_modules
_orig_fwdscreate_inference_module_t_start_total_latency_iters_training_start_time_generate_latency_training_latency_total_batch_size_gather_latencyis_lora_fusedr   	workspace)selfargsmodelkwargs
_rng_state	__class__ S/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/runtime/hybrid_engine.pyr   "   s(   zDeepSpeedHybridEngine.__init__c                    s   d fdd	  | d S )Nc                    sd   |   D ]+\}}|jtjjfv r&|tjju s|tjju r&t| |t|| q |t| |d q| S )N)	prev_type)	named_childrenr:   torchr   Linear
ModuleListsetattrr   type)r_moduleparent_typer=   namechild_replace_linear_layerr;   r<   rI   C   s   zQDeepSpeedHybridEngine.convert_to_linear_transposed.<locals>._replace_linear_layer)NNr;   )r4   r6   r;   rH   r<   convert_to_linear_transposedA   s   	z2DeepSpeedHybridEngine.convert_to_linear_transposedc              	   C   s   ||dd}| j jrtj}n| j jrtj}ntj}t|td|| j j	j
| j j	j
ddt| jdr3| jjnd ||d}| jd ur^t| jdrQ|| j | j  n|| j | j  n
|| j j	j| j |jdd |  |  |j| jd |S )	NT)	inference)set_empty_paramsdtypemax_out_tokensmin_out_tokenstransposed_modeconfig)policyrQ   model_configlayer_idrG   get_model_parallel_world_size)enable_training)r"   )r   fp16_enabledr?   float16bfloat16_enabledbfloat16float32r   r   r#   rN   hasattrmodulerQ   mpuset_tensor_parallel_configrU   get_model_parallel_group$get_tensor_model_parallel_world_sizeget_tensor_model_parallel_groupinference_tp_sizemp_groupinitialize_tensorscreate_ds_model_configcreate_moduleset_params_wo_copyr"   )r4   
orig_layer
policy_clsrT   rR   inference_dtype
_containerr;   r;   r<   new_inference_containerN   sB   
z-DeepSpeedHybridEngine.new_inference_containerc              
   C   s   i | _ tD ]/}|d }t|jtr#|jD ]}| j || j|fi qq|jd ur4| j |j| j|fi q| j tjt	ftj
tftjtfttfi d S N)inference_policiesr   
isinstance_orig_layer_classlistupdaterm   r   r@   r   	Embeddingr   	LayerNormr   OPTLearnedPositionalEmbeddingr   )r4   plcy_orig_layer_classr;   r;   r<   populate_all_inference_policiest   s    


z5DeepSpeedHybridEngine.populate_all_inference_policiesc                 C      | j |   d S rn   )r&   	fuse_lorar4   rT   r;   r;   r<   _fuse_lora_layer      z&DeepSpeedHybridEngine._fuse_lora_layerc                 C   "   t t| jD ]}| | qd S rn   )rangelenlayer_paramsr~   r}   r;   r;   r<   fuse_lora_weight      z&DeepSpeedHybridEngine.fuse_lora_weightc                 C   r{   rn   )r&   unfuse_lorar}   r;   r;   r<   _unfuse_lora_layer   r   z(DeepSpeedHybridEngine._unfuse_lora_layerc                 C   r   rn   )r   r   r   r   r}   r;   r;   r<   unfuse_lora_weight   r   z(DeepSpeedHybridEngine.unfuse_lora_weightc              	   C   sp   t t| jD ].}t| j| }t| j| }|| t| | | W d    n1 s0w   Y  qd S rn   )r   r   r   r   layer_lora_paramsextendr	   r   )r4   rT   non_active_paramsnon_active_lora_paramsr;   r;   r<   unfuse_lora_weight_non_pinned   s   

z3DeepSpeedHybridEngine.unfuse_lora_weight_non_pinnedc                 C   sV   | j jjr%| j }|s'td t  t	 
  | j }|s)tdd S d S d S )NzJUnable to acquire workspace on first attempt, emptying cache and retrying.z%Unable to retake inference workspace.)r   r#   release_inference_cacher3   retake_workspacer   warninggccollectr   empty_cacheRuntimeError)r4   retake_successr;   r;   r<   retake_inference_cache   s   




z,DeepSpeedHybridEngine.retake_inference_cachec              	   O   s  | j d u r t|dkr|d jd n|d jd }|t  | _ t | _| jr| jr| j	j
jdkrg }| jD ]}|t|  q:| j	j
j}tt| j| }t|D ]}g }	g }
t|| tt| j|d | dD ]"}|| j| d d  |	t| j|  |	t| j|  qqt|	9 t|| tt| j|d | dD ]}t| jdkr| | | jd ur| j| j| jdd qW d    n1 sw   Y  qYt   t! "  t | j | _#t|dkr|d jn|d j}t$j%|d | j	j
j f|dd   t|dkr|d j&n|d j&t|dkr(|d j'n|d j'd}t|dkr<|d ( n|d ( }tj)||| j*d t|dkr]|g|dd  R }n||d< | +  t|}	t|	 | j,|i |}W d    n	1 sw   Y  tt| jD ]
}| j| -  qtj.| j*d}||d | |d |d   }nt| j/}t| j}
||
 t|4 t | j | _#t| jdkr| 0  | +  | j,|i |}t| jdkr| 1  W d    n	1 sw   Y  n4t| jdkr| js| 0  | +  | j,|i |}t| jdkr5| js.| 1  n| 2  d	| _3| j	j
j4rI| j56  t   t! "  t | j | j# | _7|S )
Nr   	input_idsr      T)reversed_dim)rM   device)groupF)8r0   r   shaper   get_world_sizetime_t0r"   r%   r   r#   rc   _other_layersr   rr   
parameterstp_gather_partition_sizemathceilr   r   minr   r   r	   all_lora_paramsr~   r^   r&   apply_tensor_parallelism
mp_replacer   r   r   r   r1   r?   zerosrM   r   
contiguousall_gather_into_tensorrd   r   	_generaterelease_memoryget_rankall_layers_paramsr   r   r   r2   r   r3   release_workspacer.   )r4   inputsr7   bsznon_tp_paramsother_layerpartition_sizelayer_groupslgr   r   rT   input_shapeoutput
input_contgenerate_ret_valsranknon_active_layersr;   r;   r<   generate   s   







  &
"






zDeepSpeedHybridEngine.generater   c                 C   s  |  D ]\}}|j| jv r| j|j d | jkr{| j| j|j d || j|j d | | j| | j|j | j	| j| 
  | j| j|   | jg  | j| D ]}| j| |d d  | j|d d  q]|d7 }q| j|j d tkr| j| j|j d |d dd n| j| j|j d |jt|dr|jnd d | j| | j|j q| j||d qd S )	Nr   r   T)r]   rd   skip_partitionbias)weightr   )rT   )r>   r:   ro   rm   r&   appendr'   r(   forwardr   get_all_paramslora_paramsget_lora_paramsr   r   r   r   r   r   r\   r   _orig_modules_others_orig_fwds_otherscreate_inference_containers)r4   r]   rT   rF   rG   
lora_paramr;   r;   r<   r     s6   

z1DeepSpeedHybridEngine.create_inference_containersc                 C   s  g | _ g | _g | _g | _g | _g | _g | _| jjj	dkr| j
d u rrt }t }|| jjj	 }|| jjj	 }t|D ]5}tt|| jjj	 |d | jjj	 d}t|}||v rp|| _ddlm} || j| jjj	ddd| _q;n+t| j
dr}| j
 n| j
 | _ddlm} || j| jjj	ddd| _nd | _d | _|   t| j | _| | j t| jdkr| jj| _ | j| j_t!! | _"d S )Nr   r   )ReplaceWithTensorSlicing)rd   mp_sizeout_dimin_dimr`   )#r   r   r   r   r   r   r   r   r#   rc   r^   r   r   r   r   rr   	new_grouprd   deepspeed.module_injectr   r   r\   r`   rb   rz   r]   r   r   r   r   r&   r   r   r   r   )r4   global_rank
world_sizemp_group_idnum_mp_groupsranksrd   r   r;   r;   r<   r)   1  sd   




z-DeepSpeedHybridEngine.create_inference_modulec                    s    fdd}|S )Nc                     s   t j  }t j  }|| t|/ tjdkr3js'   tjd kr3d_j	  j
j| i |W  d    S 1 sIw   Y  d S )Nr   r   T)r   r   r   r   r	   r   r   r2   r~   r&   r]   r   )r   r7   r   r   rT   r4   r;   r<   run_forwardl  s   


$z9DeepSpeedHybridEngine._zero3_forward.<locals>.run_forwardr;   )r4   rT   r   r;   r   r<   _zero3_forwardj  s   z$DeepSpeedHybridEngine._zero3_forwardc           	         s  | j d urt | j  }| j| | _| jd | _t r"t dkr| jd urCdd| | j dd}dd| j| j  | j d}nd}d}|| j| j	  }t
d|dd	d
| jdd| j| d dd| jdd| j| d dd	 d| j	dd| j	| d dd d|dd|| d dd | |  t | _ d| _	t   t| jdkrtt| j| jD ]\}\}}| jr| js| ||_n|jj|_|  q| jr| jrt| j| jD ]\}}|j|_q| jrt  t   | j d u rt | _ d S d S )Nr   r   z|CurSamplesPerSec=z.2f z|AvgSamplesPerSec= z|E2E latency=zs z|Gather latency=zs (d   z%) |Generate time=z%) z|Training time=z|Others=z (z%))r*   r   r+   r,   r   is_initializedr   r0   r.   r/   printr1   r   evalr   r&   	enumeratezipr'   r"   r%   r   r   r]   transform_for_inferencer   r   r   r   r   r   )	r4   latencycur_samples_p_secavg_samples_p_secothersiorig_moduleinference_containerinference_layerr9   r;   r<   r   }  s\   

  





zDeepSpeedHybridEngine.evalTc                    s~   |r.t | jdkr.t| j| j| jD ]\}}}|  ||_qt| j| jD ]\}}||_q&t	 
| |r=t | _d S d S )Nr   )r   r'   r   r&   r(   transform_for_trainingr   r   r   r   trainr   r-   )r4   moder   r   orig_fwdr9   r;   r<   r     s   
zDeepSpeedHybridEngine.trainc                    sj   t  j|d t| jdkr| js| jD ]}|  q| jd ur3|  jt | j 7  _t | _d S d S )N)	lr_kwargsr   )	r   stepr   r&   r"   reset_paramsr-   r/   r   )r4   r   r   r9   r;   r<   r     s   


zDeepSpeedHybridEngine.step)r   )Trn   )__name__
__module____qualname____doc__inference_mp_groupr   rJ   rm   rz   r~   r   r   r   r   r   r   r   r)   r   r   r   r   __classcell__r;   r;   r9   r<   r      s(    &	
j9*r   )&r?   deepspeed.inference.configr   &deepspeed.module_inject.replace_policyr   deepspeed.module_inject.utilsr   enginer   utilsr   r   deepspeed.runtime.zeror	   r   r   r   	deepspeedr
   r   deepspeed.acceleratorr   r   deepspeed.utilsr   deepspeed.module_inject.layersr   r   r   r   .ops.transformer.inference.op_binding.workspacer   transformersmodelsoptmodeling_optrv   r   r;   r;   r;   r<   <module>   s,   