o
    i5                     @   s  d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlZd dlmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z'm(Z( d dl)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7m8Z8m9Z9 d dl:m;Z; d dl<m=Z= zd dl>m?Z? W n e@y   dZ?Y nw zd dlAmBZB W n e@y   dZBY nw z
d dlCmDZDmEZE W n e@y   d\ZDZEY nw e FeGZHd5ddZId5ddZJd5ddZKG d d! d!ejLZM	$d6d%d&ZNG d'd( d(eMZOG d)d* d*eMe;ZPd+d, ZQd-eeeRejSf  d.efd/d0ZTd1d2 ZUd3d4 ZVdS )7    N)OrderedDict
namedtuple)Sequence)partial)DictList	rearrange)
GPT2Config)remap_state_dict_hf_bigcode)remap_state_dict_hf_falcon)remap_state_dict_hf_gpt_neox)remap_state_dict_hf_gptj)remap_state_dict_hf_llama)remap_state_dict_hf_opt)BlockParallelBlock)GPT2EmbeddingsParallelGPT2Embeddings)MHAParallelMHA)FusedMLPGatedMlpMlpParallelFusedMLPParallelGatedMlpParallelMLP)
sqrelu_fwd)
all_gatherall_gather_rawget_dim_for_local_ranksync_shared_params)GenerationMixin)state_dict_from_pretrained)ColumnParallelLinear)FusedDenseSqreluDense)layer_norm_fnRMSNormNNc                 C   s  ||d}t | d| j| j }t | ddsdnd}| jsdn||  }|t | dd9 }| jr<|d us4J |t|d  }t | d	d}	|	rL|d u sLJ d
t | dd}
t | dd}tt | dd| }t | dd}t | dd }t | dd}t | dd}t | dd}t | dd}t | dd}|s|d u sJ d|d u rtnt}|d u r||	dni }|d ur|t | dddni }t | dd }t	|f| j||
|| j
|d||||||||d|||}|S )Ndevicedtypehead_dimmup_scale_qk_dot_by_dFg      ?      ?mup_attn_multiplier   attn_dwconvz.TensorParallel MHA does not support dwconv yetqkv_proj_biasTout_proj_biasrotary_emb_fraction        rotary_emb_baseg     @rotary_emb_scale_baserotary_emb_interleaved	use_alibiwindow_size)r;   use_flash_attnfused_bias_fcz)TensorParallel MHA requires fused_bias_fc)r=   dwconvsequence_parallelprocess_groupr?   	n_head_kv)	num_headsnum_heads_kvr2   r3   dropoutsoftmax_scalecausal	layer_idxrotary_emb_dimr6   r7   r8   r9   r:   r<   )getattrhidden_sizenum_attention_headsscale_attn_weightsscale_attn_by_inverse_layer_idxfloatintr   r   r   
attn_pdrop)configrH   rA   r*   r+   factory_kwargsr,   attn_scale_powerrF   r>   r2   r3   rI   r6   r7   r8   r9   r:   r<   r=   mha_clsserial_kwargsparallel_kwargsrD   	mixer_cls rY   K/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/models/gpt.pycreate_mixer_cls>   sv   

r[   c                 C   s  ||d}t | dd}t | dd}t | dd}|r | jdv s J t | dd}	|	r1| jd	ks1J d
|	r7|r7J |s|	s| jdv sBJ | jdv r| jdkrOtjn
| jdkrWtjntj}
|d u r`tnt}|d uro|t | dddni }t | dd}t|f| j	|
|||d||}|S | jdkrttj
dd}
n| jd	krt}
n| jdv rdnd}ttj|d}
|d u rtnt}|d ur|t | dddni }t|f| j	|
||d||}|S t | dd}t|tr|d usJ || }|r.td u rtd| jdv rdn| j}
|d u r	tnt}|d ur|t | dddni }t|f| j	|
|||d||}|S |	rR|d ur=|s=J d td usDJ ttf| j	|d!|}|S td")#Nr)   mlp_fc1_biasTmlp_fc2_bias	fused_mlpF)gelu_new	gelu_fastgelu_approxgelu_pytorch_tanhrelusqrelufused_dense_sqrelu_denserd   zMfused_dense_sqrelu_dense only supports approximate activation_function sqrelu
gelur_   r`   ra   rb   rc   rd   gluswiglugeglurh   ri   rj   rh   ri   r?   r@   mlp_multiple_of   )hidden_features
activationbias1bias2multiple_ofrc   )inplace)r_   r`   ra   rb   tanhnone)approximate)rn   ro   rp   rq   mlp_checkpoint_lvlr   zfused_dense is not installedra   )rn   ro   checkpoint_lvlrp   rq   z<Tensor Parallel is not implemented for FusedDenseSqreluDense)rn   rx   zMLP type not supported)rJ   activation_functionFsigmoidsilurg   r   r   r   n_innerrc   r   r   r   
isinstancer   r   ImportErrorr   r%   RuntimeError)rR   rH   rA   r*   r+   rS   r\   r]   r^   re   ro   mlp_clsrW   rl   rv   rw   rY   rY   rZ   create_mlp_cls{   s   



W


7



r   c                 C   s:  ||d}t | dd}t| |fd|i|}t| |fd|i|}t | dd}	t|	s-tjntfd| ji|}
t | dd}|d u sE|d	krH| jn| j	}t | d
d}t | dd}|sut
| j|||
||| jt | dd||on|d u|d ud}n#|syJ t| j|||
|| jt | ddt | dd||o|d u|d ud}||_|S )Nr)   r?   TrA   rms_normFepsresidual_in_fp32r   prenormparallel_blockfused_dropout_add_ln)norm_clsr   resid_dropout1resid_dropout2r   r   r?   mark_shared_paramsparallel_block_tied_norm)r   r   r   	tied_normr   r   r?   r   )rJ   r[   r   r   nn	LayerNormr'   layer_norm_epsilonresid_pdrop
embd_pdropr   rK   r   rH   )rR   rH   rA   r*   r+   rS   r?   rX   r   use_rms_normr   r   r   r   r   blockrY   rY   rZ   create_block  s\   





r   c                       s:   e Zd ZdZ fddZeddddddd	d
Z  ZS )GPTPreTrainedModelzAn abstract class to handle weights initialization and
    a simple interface for dowloading and loading pretrained models.
    c                    s6   t    t|tstd| jj| jj|| _d S )NzParameter config in `{}(config)` should be an instance of class `GPT2Config`. To create a model from a Google pretrained model use `model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`)	super__init__r~   r
   
ValueErrorformat	__class____name__rR   )selfrR   inputskwargsr   rY   rZ   r   <  s   


zGPTPreTrainedModel.__init__TNr0   r   )strictr*   r+   
world_sizerankc                O   s.  | |g|R ||d|	}
t |d|d}|dr t||}n^|dr+t||}nS|ds5|dr;t||}nC|dsJ|dsJ|d	rPt||}n.|d
r[t||}n#|drft||}n|dsp|drvt||}nt	d| d|dkrt
||||}|
j||d}t| |
S )z
        Instantiate a GPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
        Download and cache the pre-trained model file if needed.
        r)   cpugpt2zfacebook/optzEleutherAI/gpt-j-ztogethercomputer/GPT-JT-zEleutherAI/gpt-neox-zEleutherAI/pythia-z"togethercomputer/RedPajama-INCITE-ztiiuae/falcon-zmeta-llama/Llama-zbigcode/z	WizardLM/zModel z not supportedr0   r   )r#   
startswithremap_state_dict_hf_gpt2r   r   r   r   r   r   NotImplementedErrorshard_state_dict_tpload_state_dictloggerinfo)cls
model_namerR   r   r*   r+   r   r   argsr   model
state_dictload_returnrY   rY   rZ   from_pretrainedH  s:   




z"GPTPreTrainedModel.from_pretrained)r   
__module____qualname____doc__r   classmethodr   __classcell__rY   rY   r   rZ   r   7  s    r   {Gz?r.   Tc           	   	   C   s   t |}t| tjr8tjj| j|| d t| jdi }|	d|i t
| jd| | jd ur7tj| j nt| tjrGtjj| j|d |rh|  D ]\}}|dv rgtjj|d|| t d|  d qMd S d S )N)std_optimlr_multiplier)zout_proj.weightz
fc2.weightr5      )meanr   )mathsqrtr~   r   Linearinitnormal_weightrJ   updatesetattrbiaszeros_	Embeddingnamed_parameters)	modulen_layerinitializer_rangemup_width_scalerescale_prenorm_residualmup_init_scale	optim_cfgnameprY   rY   rZ   _init_weights|  s(   

r   c                       s@   e Zd Zddef fddZdd ZdddZdd	d
Z  ZS )GPTModelNrR   c              
      s  t    ||d| _t dd| _ jdv sJ t dd}t j| | }t dd| _	t d	d
| _
t dd| _t dd
}t dd }t dd
| _d u rht j| jfd|i| _nt j| jf| jd| _t fddt jD | _t dd}	|	dkr| jdd  D ]}
| jd jj|
j_qt dd
| _| jrtd u rtd| jrt j| _|stjnt }| jfd j!i| _"d ur| j"# D ]}d|_$| jrd|_%q| &t't( j j)t ddd | *  d S )Nr)   r?   Trf   pad_vocab_size_multipler0   mup_embeddings_multiplierr.   r   Fr   r   word_embed_proj_dimr   r@   c                    s"   g | ]}t  f|d qS ))rH   rA   )r   ).0irR   rS   rA   rY   rZ   
<listcomp>  s    z%GPTModel.__init__.<locals>.<listcomp>r4   r5   r   r   zTriton is not installedr   r   r   r   r   )+r   r   rA   rJ   r?   ry   r   ceil
vocab_sizeembeddings_multiplierr   r   r   r   rK   max_position_embeddings
embeddingsr   r   
ModuleListrangenum_hidden_layerslayersmixer
rotary_embr   r&   r   Dropoutr   drop_fr   r'   r   ln_f
parameters_shared_params_sequence_parallelapplyr   r   r   tie_weights)r   rR   rA   r*   r+   r   r   r   r   r4   layerr   r   r   r   rZ   r     s   


zGPTModel.__init__c                 C   s   | j d urt| | j  d S d S N)rA   r!   r   rY   rY   rZ   r     s   
zGPTModel.tie_weightsc                    s    fddt | jD S )Nc                    s*   i | ]\}}||j  fd iqS r+   )allocate_inference_cache)r   r   r   
batch_sizer+   r   
max_seqlenrY   rZ   
<dictcomp>  s    z5GPTModel.allocate_inference_cache.<locals>.<dictcomp>)	enumerater   r   r   r   r+   r   rY   r   rZ   r     s   z!GPTModel.allocate_inference_cachec                 C   s  | j d ur| jrddini }| j|fd|i|}| jdkr#|| j }| jr(d }d }| j d ur9| jr9d|jd ini }|d urC||d< | jD ]$}	| jrd| jsX|	|||d\}}qF|	||||d\}}}qF|	||d}qF| jr| js| 	|}
| js|d ur|
| n|
}n| 	|}|d ur||
 | n|
| }| 
|j| j
jjd	}|S t|| j
j| j
j|| jsd n|| j
j| jr| j	jnd
dt| j
td	}|S )Ncombine_batch_seqlen_dimTposition_idsr.   seqlenr0   inference_params)mixer_kwargsr   r5   F)residualx1r   	dropout_pr   is_rms_norm)rA   r?   r   r   r   shaper   r   r   r   r   tor   r+   r&   r   r   trainingr   r~   r'   )r   	input_idsr   r  embedding_kwargshidden_stateshidden_states2r  r  r   droppeddropped2rY   rY   rZ   forward  sd   





zGPTModel.forwardNNNr   r(   )	r   r   r   r
   r   r   r   r  r   rY   rY   r   rZ   r     s
    ^
r   c                       sN   e Zd Zddef fddZdd ZdddZdd
dZd fdd	Z  Z	S )GPTLMHeadModelNrR   c                    sd  ||d}t  | || _t|fd|i|| _t|dd| _t|dd}t|dd}t|j	| | }t|d	d }	|	d u rC|j
n|	}
|	d urXtj|j
|
fd
di|| _nd | _t|dd}t|dd}|| | _|d u r~tj|
|fd
|i|| _ntd u rtdt|
||f|t|ddd|| _t|dd| _| tt|j|j|d |   d S )Nr)   rA   tie_word_embeddingsTlm_head_biasFr   r0   r   r   r   r.   mup_output_multiplierz fused_dense_lib is not installedr?   )r   r?   	norm_headr   )r   r   rA   r   transformerrJ   r  r   r   r   n_embdr   r   project_outoutput_scalelm_headr$   r   r  r   r   r   r   r   r   )r   rR   rA   r*   r+   rS   r  r   r   r   	embed_dimr   r  r   rY   rZ   r   B  sR   


zGPTLMHeadModel.__init__c                 C   s4   | j r| jjjj| j_| jd urt| | j d S d S r   )r  r  r   word_embeddingsr   r  rA   r!   r   rY   rY   rZ   r   p  s
   
zGPTLMHeadModel.tie_weightsc                 K   s   | j j||fd|i|S )Nr+   )r  r   r   rY   rY   rZ   r   v  s   z'GPTLMHeadModel.allocate_inference_cacher   c                 C   s4  |j dksJ d|j |j\}}| j|||d}|dur'|j dks'J d|dkr6|dd| df }| jdur@| |}| jdkrJ|| j }| jsS| |}n"t| jj	}	t
| jtrk| jjrkt|| jj}tj||	| jjd	}t
| jtr|durt|| jj\}}
t|d
|d}tddg}||dS )ac  
        input_ids: (batch, seqlen) int tensor
        inference_params: for generation. Adapted from Megatron-LM (and Apex)
        https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
        num_last_tokens: if > 0, only return the logits for the last n tokens
        r   z<Expected `input_ids` to have shape [b, slen], but got shape )r   r  N   z5sequence_parallel is not supported in generation moder   r.   )r   z(n b) ... d -> b ... (n d))bCausalLMOutputlogits)r!  )ndimr  r  r  r  r  r  rz   	normalizer   r~   r$   r?   r   rA   linearr   r   r	   r   )r   r
  r   r  num_last_tokensr  slenr  	lm_logitslm_head_weight_r   rY   rY   rZ   r  {  s4   






zGPTLMHeadModel.forwardTc                    s.  d|v rt | jj}|d|d  d}|d|d  d}||d< ||d< tt|D ]N}|d| d}|d| d	}||d| d< ||d| d< |d
kr||d|d  d}|d|d  d}||d| d< ||d| d	< q.|d}|d}||d< ||d< t j||dS )Nztransformer.ln_0.weighttransformer.layers.r0   z.norm2.weightz.norm2.biasztransformer.ln_f.weightztransformer.ln_f.biasz.norm1.weightz.norm1.biasr   ztransformer.ln_0.biasz!transformer.layers.0.norm1.weightztransformer.layers.0.norm1.biasr   )lenr  r   popreversedr   r   r   )r   r   r   n_layers	ln_weightln_biaslr   rY   rZ   r     s,   

zGPTLMHeadModel.load_state_dictr  r   )NNr   )T)
r   r   r   r
   r   r   r   r  r   r   rY   rY   r   rZ   r  A  s    .

$r  c                    s  t |dd}t|j| | }| dksJ |j dks!J |jdur)|jnd|j }| dks6J |jt |d|j}|  fdd}dfd	d
	}	fdd}
 fdd}|| d d| v rt|| d d| v r}|	| d t|jD ]n}|| d| d || d| d |	| d| d d dkr| 	d| dd |j
dv r|
| d| d |
| d| d n|| d| d || d| d |	| d| d dkr| 	d| dd q| S )zConvert the state_dict of a standard GPT model to the state_dict of a GPT model
    with tensor parallel.

    This function modifies state_dict in place.
    r   r0   r   N   rB   c                    sB   || v r| | }|j d  }| |  d |  | |< d S d S )Nr   r0   )r  r   keyxdimr   r   rY   rZ   shard_first_dim  s
    z,shard_state_dict_tp.<locals>.shard_first_dimc                    sh   || v r2| | fddt D  t fddd fD \}}d||f | |< d S d S )Nc                    s    g | ]}t d | qS r;   )r    sizer   
local_rank)rr   r   r5  rY   rZ   r     s    z?shard_state_dict_tp.<locals>.shard_last_dim.<locals>.<listcomp>c                 3   s     | ]}t  d | V  qd S r   )sum)r   pos)dim_each_rankrY   rZ   	<genexpr>      z>shard_state_dict_tp.<locals>.shard_last_dim.<locals>.<genexpr>r0   .)r   tuple)r   r4  rr   begendr7  )r?  rr   r5  rZ   shard_last_dim  s   "z+shard_state_dict_tp.<locals>.shard_last_dimc                    s^   || v r-| | }|j d  d }tt|dddd d  |  d | f d| |< d S d S )Nr   r   z(two o) ... -> two o ...twor0   ztwo o ... -> (two o) ...)r  r	   r3  r7  rY   rZ   shard_gatedmlp_fc1_dim  s   (z3shard_state_dict_tp.<locals>.shard_gatedmlp_fc1_dimc           	         s4  || v rfddt D }fddt D }t|d  }t|d d  }t|d  }t|d d  }krbt| | ddd}t|d d |  |  f d| |< d S t| | d	d
  d}ttj||| || |  | |  |  gddd| |< d S d S )Nc                       g | ]}t  |qS rY   r    r;  n_headr   rY   rZ   r         
zBshard_state_dict_tp.<locals>.shard_qkv_headdim.<locals>.<listcomp>c                    rI  rY   rJ  r;  rB   r   rY   rZ   r     rM  r0   (three d) ... -> three d ...r  threethree d ... -> (three d) ....(nheadqkv headdim) ... -> nheadqkv headdim ...r   )nheadqkvr   r6  .nheadqkv headdim ... -> (nheadqkv headdim) ...)r   r=  r	   torchcat)	r   r4  n_head_each_rankn_head_kv_each_rank
beg_n_head
end_n_headbeg_n_head_kvend_n_head_kvr5  r,   rL  rB   r   r   rY   rZ   shard_qkv_headdim  sZ   

z.shard_state_dict_tp.<locals>.shard_qkv_headdim-transformer.embeddings.word_embeddings.weightlm_head.weight1transformer.embeddings.position_embeddings.weightr*  .mixer.Wqkv.weight.mixer.Wqkv.bias.mixer.out_proj.weight)rr   z.mixer.out_proj.biasrk   .mlp.fc1.weight.mlp.fc1.bias.mlp.fc2.weightz.mlp.fc2.bias)r0   )rJ   r   r   r   rK   r}   rL  r   r   r,  ry   )r   rR   r   r   r   r   	inner_dimr  r8  rE  rH  r`  r   rY   r_  rZ   r     sJ   
	
/


r   state_dictsrR   c                    s  t | | d  }t dd}t j| |  dks"J  j dks+J  jdur3 jnd j }| dks@J  j j dksJJ  j j fdd}dd	d
} fdd}dd }| d 	 }	|| |	d d|	v r~|| |	d d|	v r|| |	dd  j
dv r|nt|dd}
t jD ]A}|| |	d| d || |	d| d || |	d| dd |
| |	d| d || |	d| dd || |	d| dd q|	S )a-  Convert the list of sharded state_dict of a GPT model with tensor parallel to
    the state_dict of a standard GPT model.

    This function is meant to be the "reverse" of shard_state_dict_tp.

    Precondition:
        - state_dicts should be ordered in the same way as the shards were created.
    r   r   r0   Nr2  c                    sF   | d   j d  krdnd}tj fdd| D |d| < d S )Nr   r0   c                       g | ]}|  qS rY   rY   r   sr4  rY   rZ   r   F      zKcombine_state_dicts_tp.<locals>.combine_word_embeddings.<locals>.<listcomp>rU  )r  rW  rX  rk  r   r4  r6  )r   r   ro  rZ   combine_word_embeddingsD  s   "$z7combine_state_dicts_tp.<locals>.combine_word_embeddingsr;   c                    s0    |v rt j fdd| D |d| < d S d S )Nc                    rl  rY   rY   rm  ro  rY   rZ   r   J  rp  z?combine_state_dicts_tp.<locals>.combine_dim.<locals>.<listcomp>rU  )rW  rX  rq  rY   ro  rZ   combine_dimH  s   $z+combine_state_dicts_tp.<locals>.combine_dimc                    s*  j td |v rkr( fdd| D }ttj|ddd| < d S fddtD fddtD  fd	dt| D }tjfd
dt|D dd}tjfddt|D dd}tjfddt|D dd}tj|||gdd}t|d| < d S d S )NrB   c                       g | ]}t |  d ddqS )rO  r  rP  r   rm  ro  rY   rZ   r   Q  s    zGcombine_state_dicts_tp.<locals>.combine_qkv_headdim.<locals>.<listcomp>r0   rU  rR  c                    rI  rY   rJ  r;  rK  rY   rZ   r   V  rM  c                    rI  rY   rJ  r;  rN  rY   rZ   r   Z  rM  c                    s.   g | ]\}}}t | d |d|   dqS )rS  r   )rT  headdimr   )r   rn  rank_n_headrank_n_head_kv)ru  r4  rY   rZ   r   ^  s    
c                    s    g | ]\}}|d  |  qS r   rY   r   r   r5  )rY  rY   rZ   r   i  s     r   c                    s,   g | ]\}}| |  | |   qS rY   rY   rx  rY  rZ  rY   rZ   r   k  s    c                    s(   g | ]\}}| | |  d  qS r   rY   rx  ry  rY   rZ   r   u  s    rV  )rL  rJ   r	   rW  rX  r   zipr   )rk  r   r4  xswqwkwvwqkv)rR   ru  r   )r4  rL  rY  rB   rZ  rZ   combine_qkv_headdimL  sR   
 
z3combine_state_dicts_tp.<locals>.combine_qkv_headdimc                    s:    |v r fdd| D }t tj|ddd| < d S d S )Nc                    rt  )z(two d) ... -> two d ...r   rF  r   rm  ro  rY   rZ   r     s    zEcombine_state_dicts_tp.<locals>.combine_gated_mlp.<locals>.<listcomp>r0   rU  ztwo d ... -> (two d) ...)r	   rW  rX  )rk  r   r4  r{  rY   ro  rZ   combine_gated_mlp  s   z1combine_state_dicts_tp.<locals>.combine_gated_mlpra  rb  rc  rk   rU  r*  rd  re  rf  rg  rh  ri  r9  )r+  keysrJ   r   r   r   rK   r}   rL  copyry   r   r   r   )rk  rR   r  r   rj  rr  rs  r  r  r   mlp_combine_fnr   rY   )rR   ru  r   r   rZ   combine_state_dicts_tp.  sH   	
8

r  c           
   	      s  dd t fdd|  D } | d}t|dd}t|j| | }t|ddd||j	d  f| d	< | d	 | d
< dd t fdd|  D } t
|jD ](}| d| d}| | d| d< | d| d}| | d| d< qSdd t fdd|  D } t
|jD ]2}| d| dd  | d| d}| | d| d< | d| d}	|	 | d| d< qdd  t  fdd|  D } | S )Nc                 S      t dd| S Nz^wpe.z+transformer.embeddings.position_embeddings.resubro  rY   rY   rZ   key_mapping_pos_emb     z5remap_state_dict_hf_gpt2.<locals>.key_mapping_pos_embc                 3        | ]\}} ||fV  qd S r   rY   r   kvr  rY   rZ   r@    rA  z+remap_state_dict_hf_gpt2.<locals>.<genexpr>z
wte.weightr   r0   r   ra  rb  c                 S       t dd| } t dd| } | S )Nz^ln_f.(weight|bias)transformer.ln_f.\1z^h.(\d+).ln_(1|2).(weight|bias)ztransformer.layers.\1.norm\2.\3r  ro  rY   rY   rZ   key_mapping_ln     z0remap_state_dict_hf_gpt2.<locals>.key_mapping_lnc                 3   r  r   rY   r  r  rY   rZ   r@    rA  zh.z.mlp.c_fc.weightr*  rg  z.mlp.c_proj.weightri  c                 S   r  )Nz^h.(\d+).mlp.c_fc.biasz"transformer.layers.\1.mlp.fc1.biasz^h.(\d+).mlp.c_proj.biasz"transformer.layers.\1.mlp.fc2.biasr  ro  rY   rY   rZ   key_mapping_mlp  r  z1remap_state_dict_hf_gpt2.<locals>.key_mapping_mlpc                 3   r  r   rY   r  r  rY   rZ   r@    rA  z
.attn.biasz.attn.c_attn.weightrd  z.attn.c_proj.weightrf  c                 S   r  )Nz^h.(\d+).attn.c_attn.biasz%transformer.layers.\1.mixer.Wqkv.biasz^h.(\d+).attn.c_proj.biasz)transformer.layers.\1.mixer.out_proj.biasr  ro  rY   rY   rZ   key_mapping_attn  s
   z2remap_state_dict_hf_gpt2.<locals>.key_mapping_attnc                 3   r  r   rY   r  r  rY   rZ   r@    rA  )r   itemsr,  rJ   r   r   r   rz   padr  r   r   t)
r   rR   r  r   r   dW1W2WqkvWoutrY   )r  r  r  r  rZ   r     s6   
r   c           	   	      s  dd t fdd|  D } dd t fdd|  D } | d}t|d	d
}t|jd | | }t|ddd||jd  f| d< | d | d< dd t fdd|  D } dd t fdd|  D } dd  t  fdd|  D } |j	|j
 }t|jD ]0}| d| d}t|dd|d| d| d< | d| d}t|dd|d| d| d< q| S )Nc                 S   s    t dd| } t dd| } | S )Nz^language_model.encoder.ztransformer.z^language_model.r  ro  rY   rY   rZ   key_mapping_transformer  r  z:remap_state_dict_megatron.<locals>.key_mapping_transformerc                 3   r  r   rY   r  )r  rY   rZ   r@    rA  z,remap_state_dict_megatron.<locals>.<genexpr>c                 S   r  r  r  ro  rY   rY   rZ   r    r  z6remap_state_dict_megatron.<locals>.key_mapping_pos_embc                 3   r  r   rY   r  r  rY   rZ   r@    rA  z,transformer.embedding.word_embeddings.weightr   r0   r   ra  rb  c                 S   .   t dd| } t dd| } t dd| } | S )Nz*^transformer.final_layernorm.(weight|bias)r  z7^transformer.layers.(\d+).input_layernorm.(weight|bias)ztransformer.layers.\1.norm1.\2z@^transformer.layers.(\d+).post_attention_layernorm.(weight|bias)ztransformer.layers.\1.norm2.\2r  ro  rY   rY   rZ   r    s   z1remap_state_dict_megatron.<locals>.key_mapping_lnc                 3   r  r   rY   r  r  rY   rZ   r@    rA  c                 S   r  )Nz9^transformer.layers.(\d+).mlp.dense_h_to_4h.(weight|bias)z transformer.layers.\1.mlp.fc1.\2z9^transformer.layers.(\d+).mlp.dense_4h_to_h.(weight|bias)z transformer.layers.\1.mlp.fc2.\2r  ro  rY   rY   rZ   r    s   z2remap_state_dict_megatron.<locals>.key_mapping_mlpc                 3   r  r   rY   r  r  rY   rZ   r@    rA  c                 S   r  )Nz<^transformer.layers.(\d+).self_attention.rotary_emb.inv_freqz/transformer.layers.\1.mixer.rotary_emb.inv_freqzF^transformer.layers.(\d+).self_attention.query_key_value.(weight|bias)z#transformer.layers.\1.mixer.Wqkv.\2z<^transformer.layers.(\d+).self_attention.dense.(weight|bias)z'transformer.layers.\1.mixer.out_proj.\2r  ro  rY   rY   rZ   r    s    z3remap_state_dict_megatron.<locals>.key_mapping_attnc                 3   r  r   rY   r  r  rY   rZ   r@  '  rA  r*  rd  z8(nheads three headdim) ... -> (three nheads headdim) ...r  )rQ  ru  re  z0(nheads three headdim) -> (three nheads headdim))r   r  r,  rJ   r   r   r  rz   r  rK   rL   r   r   r	   )	r   rR   r  r   r   ru  r  r  bqkvrY   )r  r  r  r  r  rZ   remap_state_dict_megatron  s@   
r  )NNNN)r   r.   T)Wloggingr   r  collectionsr   r   collections.abcr   	functoolsr   typingr   r   rW  torch.nnr   torch.nn.functional
functionalrz   einopsr	   transformersr
   flash_attn.models.bigcoder   flash_attn.models.falconr   flash_attn.models.gpt_neoxr   flash_attn.models.gptjr   flash_attn.models.llamar   flash_attn.models.optr   flash_attn.modules.blockr   r   flash_attn.modules.embeddingr   r   flash_attn.modules.mhar   r   flash_attn.modules.mlpr   r   r   r   r   r   flash_attn.ops.activationsr   flash_attn.utils.distributedr   r   r    r!   flash_attn.utils.generationr"   flash_attn.utils.pretrainedr#   flash_attn.ops.fused_denser$   r   flash_attn.ops.triton.mlpr%    flash_attn.ops.triton.layer_normr&   r'   	getLoggerr   r   r[   r   r   Moduler   r   r   r  r   strTensorr  r   r  rY   rY   rY   rZ   <module>   sp    


= 
1F
 )y tt9