o
    Ti,                     @   s`  d dl Z d dlZd dlZd dlZd dlm  mZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ ddlmZmZ ddlmZmZmZ dd	lmZmZ d d
lmZ d dlmZ d dlmZm Z m!Z!m"Z" ddl#m$Z$ d dl%Z%ddl&m'Z' d dl(Z(dd Z)G dd dZ*dd Z+d&ddZ,da-dd Z.d'ddZ/d(ddZ0dd l1m2Z2 d dl3Z3d!d" Z4d)d$d%Z5dS )*    N)DeepSpeedDiffusersAttention)"DeepSpeedDiffusersTransformerBlock)Diffusers2DTransformerConfig)get_accelerator   )replace_policiesgeneric_policies)AutoTPReplaceWithTensorSlicingLoading)TensorParallelOcShardConv2dTensorParallelIcShardConv2d)is_autotp_training_mode)comm)set_num_kv_heads
set_n_embdset_num_attention_headsset_tp_grain_size)load_model_with_checkpoint)policy_to_ds_containerc                 C   sz   ddl m} ddlm} d}|  D ](\}}|j|v r:||d 7 }| D ]\}}|j|u r6||7 }  |S q% |S q|S )Nr   )supported_modelsr   )
ModuleList .)
containersr   torch.nnr   named_children	__class__)replaced_moduler   r   transformer_namencnamechild r$   Z/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/replace_module.pyget_transformer_name   s"   

r&   c                   @   s    e Zd Zd
ddZdddZd	S )GroupQuantizerTr      r   c                 C   s   || _ || _|| _|| _d S N)
group_sizenum_bitsq_int8
num_groups)selfr,   r*   r+   r-   r$   r$   r%   __init__.   s   
zGroupQuantizer.__init__c           	         s  | j r|stjj|dd}td|_|S d| j | jdkr"| jn|jd | j	 |
t  }|d  tj dddd  tj dddd  t  d	  } |   d d d   |j
tj }tjj|dd}|j|j| d |d
fddtdD   fddtdD  fddtdD fddtdD }tj| d|d |d gdd
d |_|S )NF)requires_gradr      r   Tdimkeepdim       @)r4   c                    s    g | ]} |  d  qS )r2   )reshape
contiguous.0i)inputs_splitr-   r$   r%   
<listcomp>E   s     z+GroupQuantizer.quantize.<locals>.<listcomp>c                    (   g | ]}t j | d ddd  qS r   Tr3   r   )torchminfloatr9   
input_flatr$   r%   r=   F      ( c                    r>   r?   )r@   maxrB   r9   rC   r$   r%   r=   G   rE   c                    s:   g | ]}t |   |  d    dqS )r6   r   )r@   rF   abssqueeze	unsqueezer9   )	input_max	input_minq_ranger$   r%   r=   H   s    2)r,   r@   nn	Parameteremptyscaler+   r-   shaper*   tor   current_device_namer7   r8   rA   rB   rF   rG   roundclampint8splitrangecatrH   rI   )	r.   inputsqkvcountparallel_dimrP   inputs_qoutscale1r$   )rD   rJ   rK   r<   r-   rL   r%   quantize5   s6   

 "(zGroupQuantizer.quantizeN)Tr   r(   r   )Tr   r   )__name__
__module____qualname__r/   ra   r$   r$   r$   r%   r'   ,   s    
r'   c                 C   s&   t D ]}| }|| r|  S qd S r)   )r   match)modulepolicyr$   r$   r%   _module_matchP   s   
rh   Tc                    s:  fdd}dd }t | tjjrd S tjtjfvrtdz#dd l}t|j	j
dr1|j	j
j}n|j	jj}|j	j
j}||||iW n tyM   i Y nw dd	lm} || j|d
}	t| d|	 | j D ]4}
t| |
}t|}|d ur fdd  || |j||d
}td|
 dt|  t| |
| qfd S )Nc                    s   | | }|d u r| S t|dkr|\}}}}}n	|\}}	}
}}}}tj|| ddd}t|}dd }t|dkrB||j|j_nd |_||j|j_||	j|j_||
j|j	_d |_
||j|j_|jj|jt   |S )N   Fi   )hidden_sizeheadsdtypetriangular_maskingmax_out_tokensc                 S   sV   |   } | d| dd  d | | jd | jd } | t   | S )Nr2   )r8   r7   copy_	transposerQ   rR   r   rS   )datar$   r$   r%   rq   l   s
   "z:generic_injection.<locals>.replace_attn.<locals>.transpose)	attentionlentransformer_inferenceDeepSpeedInferenceConfigr   rr   	attn_qkvwattn_qwattn_kwattn_vw	attn_qkvbattn_owattn_obrp   rR   r   rS   )r#   rg   policy_attnqkvwr|   r}   rj   rk   qwkwvwconfigattn_modulerq   )rl   r$   r%   replace_attnZ   s2   
z'generic_injection.<locals>.replace_attnc                 S   s   t  }t| |S r)   )r   r   )r#   rg   r   r$   r$   r%   replace_attn_block   s   
z-generic_injection.<locals>.replace_attn_blockz*Generic injection only supported with FP16r   CrossAttentionr1   )DSClipEncoder)enable_cuda_graphtext_encoderc                    sF   |   D ]\}} || |jv r |j ||}t| || qd S r)   )r   r   setattr)rf   rg   r"   r#   r   )_replace_modulenew_policiesr$   r%   r      s   

z*generic_injection.<locals>._replace_modulez**** found and replaced z w. )
isinstancer@   rM   Modulefloat16half
ValueError	diffusershasattrmodelsrs   r   attention_processor	AttentionBasicTransformerBlockImportError/model_implementations.transformers.clip_encoderr   r   r   __dict__keysgetattrrh   applyprinttype)rf   rl   r   r   r   r   cross_attentionattention_blockr   
cg_encoderr"   
sub_modulerg   
new_moduler$   )r   rl   r   r%   generic_injectionX   s@   &

	

r   c           *         s  j tjk	d	 d}d}d}tjjjjddK	fdd	dLfd	d
	dMfdd	}fdd}	fdd|durjs|d }
tjt	|
dt	|
 dd}t
t	|
D ]}tjj|
| }t||j|d}|d t  qbdt|v r|t t  n	t||jd}js|	|}t	d}t rt ndt rt nd}|dur%jr%tjsJ dtjj dt }|d }
t|
t u r|
d n|
|!dd}|!dt	}|!d |}|!d!j |dkrGt|
t"u rGtjt	|
dt	|
 dd}t
t	|
D ]#}tj#tj |
| d"dd#g}t$|||||td$ |d q"nt	| }| }t%|| }t%|t&d| | | }tj|d| dd}t
|D ]=}|d || |  fd%d&t
|D }d'd& |D }t$|||||t%|| td$ d(d& |D }t  qsd)|
v rtjt	|
d) dt	|
d)  dd}t
t	|
d) D ]@}|d  rtj |
d) | n|
d) | }tj#|d"dd#g}t$|||||t%|| td$ d*d& |D }t  q|	| t'd+| d,t |  d- t( sJj)durJdd.l*m+} ddl,}d/}|du r_d0}zdd1l-m.}  t/|| rQd2}W n t0y^   d0}Y nw |d3 }t rlt1  t2|d4}!|!g}tj3j)d5d6 t rt dkrt'd7 t4|fd8d9t |5 6 D j) d:|!  tj7d;tj8d<tjd=tj9d>i}"|:|j) |fd?d&t
|D d@dAd|"j  dB}#t;j) dCdD}$|$<|# W d   n	1 sw   Y  |5 
|= D ]\}%}&t>|&dEr|&|&j?g
|%< qt"
@ }'t	|'| d }(t
|D ])})t4|
fdFd9|'|)|( |)d |(  D j) dG|dHdI|)dHdJ q |S )Na   Replace bert-style transformer layers with DeepSpeed's transformer layer
    Arguments:
        orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for,
            e.g., transformers.models.bert.modeling_bert.BertLayer or transformers.BertLayer
        model (torch.nn.Module): user's nn.module representing their model
        checkpoint_dict: Dictionary for checkpoint passed from the Inference Engine
        config: top-level DS Inference config defined in inference/config.py
        model_config: HuggingFace model config passed from the inference/engine.py
    Returns:
        Updated nn.module with replaced transformer layers
    Nr2   )mp_groupmp_sizeFr   c                    s   || |d}|j s jrJ dddlm} d}t| dr)t| j|r)| jj}d}t| || d}	|		| |	
 jj jj |	   jtjtjtjfv rT|	  td	}
|	|
 |	  |	  |	  |	 |	  td u ry|	a|	jS )
N)	inferencez;cuda graph is not supported with this model, please disabler   )MoEFmlpT)rg   r   model_configlayer_idr#   r,   )cuda_graph_supportedr   deepspeed.moe.layerr   r   r   r   num_expertsr   set_moeset_tensor_parallel_configtensor_paralleltp_sizetp_groupinitialize_tensorsrl   r@   r   bfloat16rV   convert_to_required_dtyper'   set_quantization_configcreate_ds_model_configcreate_modulerq   apply_tensor_parallelismcopy_data_to_new_modulecontainer_grf   )r#   
policy_clsrm   r   r   rg   r   moer   
_container	quantizer)r   r   
mp_replacera   r$   r%   replace_with_policy   s:   



z6replace_transformer_layer.<locals>.replace_with_policyr   c           	         s  t | ||| j}| jj jj tdr9dt| v r'|j	}ntdr3|j
}n|}n|}t| d }ddg}|D ]}t|rVt|}|d kr\ nqJt| tdrmttd t jj |  d|v sd|v r|| S || S )	Nvision_configMllamaVisionEncoderLayertext_confign_embdrj   num_attention_headslm_head	embed_out)r	   keep_module_on_hostr   r   r   r   r   strget_model_num_kv_headsr   r   r   r   r   r   r   tp_grain_sizeupdate_linear_policies_replace_last_linear_moduler   )	rf   all_reduce_linearsprefix
state_dict_autotpnum_kv_headsr   multi_query_n_embd_namesr"   )r   linear_layer_settingr   orig_layer_implr$   r%   replace_wo_policy  s8   







z4replace_transformer_layer.<locals>.replace_wo_policyc                    sR   d}|r| | j }|S t s jr| | j d|d}|S | |||d}|S )NFT)r   r   r   r   )rm   r   replace_with_kernel_inject)r#   _policyr   r   r   trainingr   )r   r   r   r$   r%   
replace_fn>  s   z-replace_transformer_layer.<locals>.replace_fnc                    s  t  rd S d }|  D ]\}}d|v sd|v sd|v r|}q|d ur6t| dr6t| jdr6| jjjr6|| j_t| drRt| jdrRt| jtjj	rR | ddd} | S t| drnt| j
drnt| j
tjj	rn | d	dd} | S t| d
rt| jdr | jddd} | S )Nzword_embeddings.zembed_tokens.zwte.r   weight)r   r   r   )r   language_model)r   named_parametersr   r   r   is_metar   r@   rM   Linearr   r   )rf   embedding_weightr    p)r   r$   r%   set_lm_headQ  s4   

z.replace_transformer_layer.<locals>.set_lm_headc           
         s   dg}dg}|   D ]3\}}|  D ]$\}}||v r&t|||}	t|||	 ||v r6t|||}	t|||	 q ||| q
d S )Nconv1conv2)r   r   r   r   )
modelrank
world_sizeshard_oc_nameshard_ic_namer"   sub_ml_namel_sub_mTPConv2d)conv2d_parallel_shard_weightsr$   r%   r   h  s*   z@replace_transformer_layer.<locals>.conv2d_parallel_shard_weightscheckpointszLoading z checkpoint shards)totaldesc)r   
orig_classr   _replace_policy
checkpointr   Yuanr   r   r   r   r   z0Meta Tensor checkpoint loading not supported in z
 containertpparallelizationppr   r   base_dircpumap_locationweights_only)	containerc                    s2   g | ]} rt j |  n|  qS r$   )ospathjoin)r:   j)	base_dir1
ckpt_index	ckpt_listr$   r%   r=     s    $z-replace_transformer_layer.<locals>.<listcomp>c                 S   s   g | ]
}t j|d ddqS )r  Fr  )r@   load)r:   	ckpt_filer$   r$   r%   r=     s    c                 S      g | ]}d qS r)   r$   r:   _r$   r$   r%   r=         non_tpc                 S   r  r)   r$   r  r$   r$   r%   r=     r  z checkpoint loading time at rank z: z sec)OrderedDictr(   ds_model)BloomForCausalLMbloomr   z	non-tp.ptT)exist_okzSaving tp-sharded checkpointsc                    s   i | ]\}} |vr||qS r$   r$   )r:   kv)r   r$   r%   
<dictcomp>  s    z-replace_transformer_layer.<locals>.<dictcomp>/float32r   rV   r   c                    s0   g | ]}t  D ]}d |dd|ddqqS )tp_0>2dr  .pt)rX   )r:   mr)r   r$   r%   r=     s   0 )r  r   g      ?)r   r   r   versionr   r   rl   z/ds_inference_config.jsonwrP   c                    s>   i | ]}|v r|t  | d r |  | jgn | qS )rP   )r   rP   )r:   r  )rep_sdr   r$   r%   r    s    
(z/tp_r  r  r   )Fr   )r   N)r   r   N)Arl   r@   rV   r
   r   r   r   r   tqdmrt   rX   r  r  r  r   replace_moduleinjection_policy_tupleupdategccollectr   distget_rankget_world_sizer'   is_initializedr   ckpt_load_enabledr   rb   timer   dictgetlistr  r   intrF   r   r   save_mp_checkpoint_pathcollectionsr  json(transformers.models.bloom.modeling_bloomr  r   r   barrierr&   makedirssaver   itemsr  r   r   dumpsopenwriter   r   rP   r   )*r   r   checkpoint_dictr   r   micro_batch_sizeseed
local_rankr   r   r   pbarr;   checkpoint_filer   r   r   
start_time	ckpt_typeckpt_mp_sizesdnum_checkpointstp_split_size	sd_offsetsd_count
ckpt_filessdsr  r  r8  num_partitions	ckpt_namer  non_tp_ckpt_namedtype_reprsckpt_configcfgr    r   r   partition_sizer!  r$   )r
  r  r  r   r   r   r   r   r   ra   r%  r   r   r   r   r%   replace_transformer_layer   sb  93


 








rX  Fc                    s"    fdd}t |tj|ddS )a    Revert DeepSpeed's transformer layer back to original bert-style transformer layer
    Arguments:
        orig_layer_impl (torch.nn.Module): the original transformer layer implementation that was replaced,
            e.g., transformers.models.bert.modeling_bert.BertLayer or transformers.BertLayer
        model (torch.nn.Module): user's nn.module representing their model
        config (dict): model config containing hidden size, attention heads, etc.
    Returns:
        Updated nn.module with original bert-style transformer layers
    c                    s   }| j j}| jj}tj|ddd\}}}tj|ddd\}	}
}||jjjj_|	|jjjj	_||jjj
j_|
|jjj
j	_||jjjj_||jjjj	_| jj|jjjj_| jj|jjjj	_| jj}| jj}rs||jj_||jj	_n||jjjj_||jjjj	_| jj}| jj}r||jjj_||jjj	_n||jjj_||jjj	_| jj|jjj_| jj|jjj	_| jj}| jj}r||jj_||jj	_|S ||jjj_||jjj	_|S )N   r   )axis)rw   rr   r{   r@   chunkrs   r.   queryr   biaskeyvaluer|   outputdenser}   attn_nwattn_nbPostAttentionLayerNorm	LayerNorminter_winter_bintermediate	dense_actoutput_woutput_bnorm_wnorm_bPreAttentionLayerNorm)r#   r   r   orig_moduler   qkvbr   r   r   qbkbvb	attn_ln_w	attn_ln_b
inter_ff_w
inter_ff_btransformer_ln_wtransformer_ln_br   r   prelnr$   r%   r   1  sL   


z,revert_transformer_layer.<locals>.replace_fnNr   )r'  	deepspeedDeepSpeedTransformerLayer)r   r   r   r{  r   r$   rz  r%   revert_transformer_layer&  s   4r~  c                 C   s   d}|dur| drddlm} ||}ntj|ddd}i }|dur.||||fi n.tD ]+}|d}	t|jt	rL|jD ]}
||
||fi q?q0|jdur[||j||fi q0t
| dkshJ dt| ||d	\}}	|S )
a   Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``.
    Arguments:
        model (torch.nn.Module): the model to augment
        orig_class (torch.nn.Module): the module to search for
        replace_fn (method): a method to convert instances of ``orig_class`` to the
                             desired type and return a new instance.
    Returns:
        A modified ``model``.
    Nz.safetensorsr   )	load_filer  Fr  zNo default policy found! Please specify your policy injection_policy (like {BertLayer:HFBEertLayerPolicy}).You can find some samples here: https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py)r   )endswithsafetensors.torchr  r@   r  r)  r   r   _orig_layer_classr4  rt   r=  r   )r   r   r   r   r   rJ  r  rg   plcyr  orig_layer_classr   r$   r$   r%   r'  k  s.   




r'  r1   )PipelineModulec                 C   s   t | } td| }|d u rtd| }|d u rtd| }|d ur2| D ]}td|r1 dS q&|d urA|d dv rAdS dS )	Nz: (.*?)Modelz: (.*?)Stackz
(.*?)Modelz	^model[.]Fr   )r  optT)r   researchre   r   grouplower)r   r   r^  itemr$   r$   r%   skip_level_0_prefix  s   r  r   c           
   	      s>  |   D ]\}}|j|v rC||j d |||j d ||| |d}t| || t| tr>t| ds6J d|| j| j| < |d7 }q|| d  t	|rf|durft
 fd	d
|D ret||  nqt|jdkrx|durxt||  t|||dkrt| |r|n|| d ||d |d\}	}qtjj| _| |fS )a   Traverse model's children recursively and apply any transformations in ``policies``.
    Arguments:
        model (torch.nn.Module): model to augment
        policies (dict): Mapping of source class to replacement function.
    Returns:
        Modified ``model``.
    r   r2   r   forward_funcsz8we require pipe-module to have the list of fwd_functionsr   r   Nc                 3   s    | ]} |v V  qd S r)   r$   )r:   r  checking_keyr$   r%   	<genexpr>  s    z"_replace_module.<locals>.<genexpr>)r   level_idr   )r   r   r   r   r  r   r  fwd_mapr   is_load_moduleanyr  rt   _buffersload_bufferr   r  ru   DeepSpeedTransformerInferencereset_cache)
r   policiesr   r   r  r   r"   r#   r   r  r$   r  r%   r     sH   




	r   )NT)Fr)   )r   r   r   N)6r  r@   r&  r|  deepspeed.ops.transformeropstransformerru   7deepspeed.ops.transformer.inference.diffusers_attentionr   ?deepspeed.ops.transformer.inference.diffusers_transformer_blockr   <deepspeed.ops.transformer.inference.diffusers_2d_transformerr   deepspeed.acceleratorr   replace_policyr   r   auto_tpr	   r
   r   layersr   r   deepspeed.module_inject.layersr   r   r,   deepspeed.module_inject.tp_shardr   r   r   r   load_checkpointr   r1  utilsr   r*  r&   r'   rh   r   r   rX  r~  r'  piper  r  r  r   r$   r$   r$   r%   <module>   sB   $
\  
q
E&