o
    ̳iL                     @   s   d dl mZmZ d dlZd dlmZ dddddd	d
ddd	Zdeeejf dee	 dee	 dee	 deeejf f
ddZ
deeejf deeejf fddZdS )    )DictOptionalN)get_mapped_keyztok_embeddings.weightzlayers.{}.attn.q_proj.weightz!layers.{}.attn.output_proj.weightzlayers.{}.mlp.w1.weightzlayers.{}.mlp.w2.weightzlayers.{}.sa_norm.scalezlayers.{}.mlp_norm.scalez
norm.scalezoutput.weight)	zmodel.embed_tokens.weightz)model.layers.{}.self_attn.qkv_proj.weightz'model.layers.{}.self_attn.o_proj.weightz'model.layers.{}.mlp.gate_up_proj.weightz$model.layers.{}.mlp.down_proj.weightz&model.layers.{}.input_layernorm.weightz/model.layers.{}.post_attention_layernorm.weightzmodel.norm.weightzlm_head.weight
state_dict	num_headsnum_kv_headsdimreturnc                 C   s  i }|dur!|du s|du rt d|}|| | }|| | }nd\}}}|  D ]]\}}	t|t}
d|v ri|durJtj|	|||gdd\}}}n
|	jddd\}}}|||
< |||
dd	< |||
dd
< q*d|v r|	jddd\}}|||
< |||
dd< q*|	||
< q*|S )z
    Convertor from HF state dict to torchtune state dict. This handles:
    - Splitting the fused q,k and v matrix
    - Splitting the fused gate and up projection matrix
    NzKPhi models with GQA require dim, num_heads and num_kv_heads to be specified)NNNqkvr   r      q_projk_projv_projgate   w1w3)
ValueErroritemsr   
_PHI3_MINItorchsplitchunkreplace)r   r   r   r   converted_state_dictq_dimk_dimv_dimkeyvaluenew_keyqkvr   r    r%   Z/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/models/phi3/_convert_weights.pyphi3_hf_to_tune   s:   


r'   c                 C   s   i }dd t  D }|  D ][\}}d|v sd|v sd|v r qt||}d|v rJ|}| |dd }| |dd }tj|||gdd}	|	||< qd	|v rf|}
| |d	d }tj|
|gdd}|||< q|||< q|S )
z
    Convertor from torchtune state dict to HF state dict. This handles:
    - Fusing q,k and v matrix
    - Fusing gate and up projection matrix
    c                 S   s   i | ]\}}||qS r%   r%   ).0r#   r$   r%   r%   r&   
<dictcomp>P   s    z#phi3_tune_to_hf.<locals>.<dictcomp>r   r   r   r   r   r   r   )r   r   r   r   r   cat)r   r   inverted_mapping_dictr   r    r!   r"   r#   r$   r
   	gate_projup_projgate_up_projr%   r%   r&   phi3_tune_to_hfI   s&   



r/   )typingr   r   r    torchtune.models.convert_weightsr   r   strTensorintr'   r/   r%   r%   r%   r&   <module>   s2   
*.