import math
import re
from collections import OrderedDict

import torch
import torch.nn.functional as F
from transformers import GPT2Config, GPTBigCodeConfig, PretrainedConfig


def remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig):
    """
    Map the state_dict of a Huggingface BigCode model to be flash_attn compatible.
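
    GPTBigCode checkpoints use multi-query attention: ``attn.c_attn`` packs the
    queries of all heads together with a single shared key/value head, so its
    weight has shape ``(n_embd + 2 * head_dim, n_embd)``. The flash_attn
    ``Wqkv`` layout expects one key/value head per query head, so the shared
    k/v rows are tiled ``n_head`` times below, giving ``(3 * n_embd, n_embd)``.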
    """

    # Word embedding and position embedding.
    def key_mapping_pos_emb(key):
        return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key)

    state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
    word_embeddings = state_dict.pop("transformer.wte.weight")
    # The vocab size may be padded to a multiple (e.g. of 8) for efficiency.
    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
    vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
    state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
        word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
    )
    state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"]

    # LayerNorm.
    def key_mapping_ln(key):
        key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
        key = re.sub(
            r"^transformer.h.(\d+).ln_(1|2).(weight|bias)", r"transformer.layers.\1.norm\2.\3", key
        )
        return key

    state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())

    # MLP.
    def key_mapping_mlp(key):
        key = re.sub(r"^transformer.h.(\d+).mlp.c_fc.weight", r"transformer.layers.\1.mlp.fc1.weight", key)
        key = re.sub(r"^transformer.h.(\d+).mlp.c_proj.weight", r"transformer.layers.\1.mlp.fc2.weight", key)
        key = re.sub(r"^transformer.h.(\d+).mlp.c_fc.bias", r"transformer.layers.\1.mlp.fc1.bias", key)
        key = re.sub(r"^transformer.h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", key)
        return key

    state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())

    # Attention: expand the single shared key/value head of multi-query
    # attention into one key/value copy per query head, the layout Wqkv expects.
    assert config.multi_query, "Only multi-query attention is supported"
    for d in range(config.num_hidden_layers):
        embed_dim = config.n_embd
        head_dim = embed_dim // config.n_head
        c_attn_weight = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight")
        q, k, v = torch.split(c_attn_weight, [embed_dim, head_dim, head_dim], dim=0)
        k = torch.tile(k, (config.n_head, 1))
        v = torch.tile(v, (config.n_head, 1))
        state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = torch.cat((q, k, v), dim=0)
        c_attn_bias = state_dict.pop(f"transformer.h.{d}.attn.c_attn.bias")
        q, k, v = torch.split(c_attn_bias, [embed_dim, head_dim, head_dim], dim=0)
        k = torch.tile(k, (config.n_head,))
        v = torch.tile(v, (config.n_head,))
        state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = torch.cat((q, k, v), dim=0)

    def key_mapping_attn(key):
        key = re.sub(
            r"^transformer.h.(\d+).attn.c_proj.weight", r"transformer.layers.\1.mixer.out_proj.weight", key
        )
        key = re.sub(
            r"^transformer.h.(\d+).attn.c_proj.bias", r"transformer.layers.\1.mixer.out_proj.bias", key
        )
        return key

    state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())

    return state_dict


def inv_remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig):
    """
    Map the state_dict of a flash_attn model to be Huggingface BigCode compatible.

    This function is meant to be the inverse of remap_state_dict_hf_bigcode.
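
    Because remap_state_dict_hf_bigcode tiles identical key/value copies,
    keeping only the first ``head_dim`` rows of the expanded key/value blocks
    recovers the shared multi-query head exactly, so the two functions
    round-trip without loss.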
    """

    # Word embedding and position embedding.
    def inv_key_mapping_pos_emb(key):
        return re.sub(r"^transformer.embeddings.position_embeddings.", "transformer.wpe.", key)

    state_dict = OrderedDict((inv_key_mapping_pos_emb(k), v) for k, v in state_dict.items())
    word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight")
    # Strip any padding rows that were added to round the vocab size up.
    word_embeddings = word_embeddings[: config.vocab_size, :]
    state_dict["transformer.wte.weight"] = word_embeddings
    state_dict["lm_head.weight"] = word_embeddings

    # LayerNorm.
    def inv_key_mapping_ln(key):
        key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
        key = re.sub(
            r"^transformer.layers.(\d+).norm(1|2).(weight|bias)", r"transformer.h.\1.ln_\2.\3", key
        )
        return key

    state_dict = OrderedDict((inv_key_mapping_ln(k), v) for k, v in state_dict.items())

    # MLP.
    def inv_key_mapping_mlp(key):
        key = re.sub(r"^transformer.layers.(\d+).mlp.fc1.weight", r"transformer.h.\1.mlp.c_fc.weight", key)
        key = re.sub(r"^transformer.layers.(\d+).mlp.fc2.weight", r"transformer.h.\1.mlp.c_proj.weight", key)
        key = re.sub(r"^transformer.layers.(\d+).mlp.fc1.bias", r"transformer.h.\1.mlp.c_fc.bias", key)
        key = re.sub(r"^transformer.layers.(\d+).mlp.fc2.bias", r"transformer.h.\1.mlp.c_proj.bias", key)
        return key

    state_dict = OrderedDict((inv_key_mapping_mlp(k), v) for k, v in state_dict.items())

    # Attention: collapse the per-head key/value copies back into the single
    # shared head of multi-query attention.
    for d in range(config.num_hidden_layers):
        embed_dim = config.n_embd
        head_dim = embed_dim // config.n_head
        Wqkv_weight = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight")
        q, k, v = torch.split(
            Wqkv_weight, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0
        )
        c_attn_weight = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0)
        state_dict[f"transformer.h.{d}.attn.c_attn.weight"] = c_attn_weight
        Wqkv_bias = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias")
        q, k, v = torch.split(
            Wqkv_bias, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0
        )
        c_attn_bias = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0)
        state_dict[f"transformer.h.{d}.attn.c_attn.bias"] = c_attn_bias

    def inv_key_mapping_attn(key):
        key = re.sub(
            r"^transformer.layers.(\d+).mixer.out_proj.weight", r"transformer.h.\1.attn.c_proj.weight", key
        )
        key = re.sub(
            r"^transformer.layers.(\d+).mixer.out_proj.bias", r"transformer.h.\1.attn.c_proj.bias", key
        )
        return key

    state_dict = OrderedDict((inv_key_mapping_attn(k), v) for k, v in state_dict.items())

    return state_dict


def bigcode_config_to_gpt2_config(bigcode_config: GPTBigCodeConfig) -> GPT2Config:
    return GPT2Config(
        activation_function=bigcode_config.activation_function,
        attn_pdrop=bigcode_config.attn_pdrop,
        bos_token_id=bigcode_config.bos_token_id,
        embd_pdrop=bigcode_config.embd_pdrop,
        eos_token_id=bigcode_config.eos_token_id,
        initializer_range=bigcode_config.initializer_range,
        layer_norm_epsilon=bigcode_config.layer_norm_epsilon,
        max_batch_size=bigcode_config.max_batch_size,
        max_sequence_length=bigcode_config.max_sequence_length,
        model_type=bigcode_config.model_type,
        multi_query=bigcode_config.multi_query,
        n_embd=bigcode_config.n_embd,
        n_head=bigcode_config.n_head,
        n_inner=bigcode_config.n_inner,
        n_layer=bigcode_config.n_layer,
        n_positions=bigcode_config.n_positions,
        resid_pdrop=bigcode_config.resid_pdrop,
        scale_attn_weights=bigcode_config.scale_attn_weights,
        summary_activation=bigcode_config.summary_activation,
        summary_first_dropout=bigcode_config.summary_first_dropout,
        summary_proj_to_labels=bigcode_config.summary_proj_to_labels,
        summary_type=bigcode_config.summary_type,
        summary_use_proj=bigcode_config.summary_use_proj,
        use_cache=bigcode_config.use_cache,
        vocab_size=bigcode_config.vocab_size,
    )
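

# ---------------------------------------------------------------------------
# Illustrative smoke test: a sketch added for documentation, not part of the
# library API. It fabricates a tiny random HF-style checkpoint for a
# hypothetical config, checks that remap -> inv_remap round-trips, and then
# converts the config. The summary_*/max_* attributes set at the end mimic
# fields that hub config.json files (e.g. bigcode/starcoder) carry but that a
# bare GPTBigCodeConfig() does not define; their values here are assumptions.
if __name__ == "__main__":
    cfg = GPTBigCodeConfig(vocab_size=32, n_positions=16, n_embd=8, n_layer=1, n_head=2)
    embed_dim, head_dim = cfg.n_embd, cfg.n_embd // cfg.n_head
    sd = {
        "transformer.wte.weight": torch.randn(cfg.vocab_size, embed_dim),
        "transformer.wpe.weight": torch.randn(cfg.n_positions, embed_dim),
        "transformer.ln_f.weight": torch.ones(embed_dim),
        "transformer.ln_f.bias": torch.zeros(embed_dim),
    }
    for d in range(cfg.n_layer):
        sd.update(
            {
                f"transformer.h.{d}.ln_1.weight": torch.ones(embed_dim),
                f"transformer.h.{d}.ln_1.bias": torch.zeros(embed_dim),
                f"transformer.h.{d}.ln_2.weight": torch.ones(embed_dim),
                f"transformer.h.{d}.ln_2.bias": torch.zeros(embed_dim),
                # Multi-query attention: all query heads, one shared k/v head.
                f"transformer.h.{d}.attn.c_attn.weight": torch.randn(embed_dim + 2 * head_dim, embed_dim),
                f"transformer.h.{d}.attn.c_attn.bias": torch.randn(embed_dim + 2 * head_dim),
                f"transformer.h.{d}.attn.c_proj.weight": torch.randn(embed_dim, embed_dim),
                f"transformer.h.{d}.attn.c_proj.bias": torch.randn(embed_dim),
                f"transformer.h.{d}.mlp.c_fc.weight": torch.randn(4 * embed_dim, embed_dim),
                f"transformer.h.{d}.mlp.c_fc.bias": torch.randn(4 * embed_dim),
                f"transformer.h.{d}.mlp.c_proj.weight": torch.randn(embed_dim, 4 * embed_dim),
                f"transformer.h.{d}.mlp.c_proj.bias": torch.randn(embed_dim),
            }
        )
    remapped = remap_state_dict_hf_bigcode(dict(sd), cfg)
    # The shared k/v head was tiled out to one copy per query head.
    assert remapped["transformer.layers.0.mixer.Wqkv.weight"].shape == (3 * embed_dim, embed_dim)
    recovered = inv_remap_state_dict_hf_bigcode(remapped, cfg)
    assert all(torch.equal(v, recovered[k]) for k, v in sd.items())

    for name, value in {
        "max_batch_size": None,
        "max_sequence_length": None,
        "summary_activation": None,
        "summary_first_dropout": 0.1,
        "summary_proj_to_labels": True,
        "summary_type": "cls_index",
        "summary_use_proj": True,
    }.items():
        setattr(cfg, name, value)
    print(bigcode_config_to_gpt2_config(cfg))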