o
    }oi-                     @   sv  d dl mZ d dlmZ d dlmZmZmZ d dlZd dl	m
  mZ d dlm
Z
 d dlmZmZ d dlmZmZmZ d dlmZ erUd d	lmZ d d
lmZ d dlmZ eG dd deZeG dd deZG dd deZeedG dd dejdef Z e!edG dd dejedf Z"ej#ddddej$fddZ%ej#ddddej$fddZ&g d Z'dS )!    )	dataclass)Path)TYPE_CHECKINGCallableOptionalN)nn)	GPTConfigGPTModel)OptimizerModuleioteardown)dtype_from_hf
Phi3ConfigPhi3ForCausalLM)TokenizerSpecc                   @   s|   e Zd ZU dZeed< ejZe	ed< dZ
eed< dZeed< dZeed	< d
Zeed< dZeed< dZeed< dZeed< dS )r   RMSNormnormalizationactivation_funcTgated_linear_unitropeposition_embedding_typeFadd_bias_lineari   
seq_lengthg        attention_dropouthidden_dropout#share_embeddings_and_output_weightsN)__name__
__module____qualname__r   str__annotations__Fsilur   r   r   boolr   r   r   intr   floatr   r    r(   r(   [/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/phi3mini.pyr   "   s   
 r   c                   @   sb   e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	eed< d	Z
eed
< dZeed< dS )Phi3ConfigMini    
num_layersi   hidden_sizei    ffn_hidden_sizenum_attention_headsnum_query_groupsg     @rotary_basei@}  
vocab_sizeN)r   r   r    r,   r&   r"   r-   r.   r/   r0   r1   r'   r2   r(   r(   r(   r)   r*   0   s   
 r*   c                       sV   e Zd Z				d	dee dee ded deeejgejf  f fddZ	  Z
S )
	Phi3ModelNconfigoptim	tokenizerr   model_transformc                    s   t  j|pt |||d d S )N)r5   r6   r7   )super__init__r   )selfr4   r5   r6   r7   	__class__r(   r)   r9   >   s   zPhi3Model.__init__)NNNN)r   r   r    r   r   r
   r   r   Moduler9   __classcell__r(   r(   r;   r)   r3   <   s    r3   hfc                   @   sR   e Zd ZdefddZdedefddZdd Zed	d
 Z	ede
fddZdS )HFPhi3Importerreturnc                 C   s   t | j| jdS )N)r6   )r3   r4   r6   r:   r(   r(   r)   initK   s   zHFPhi3Importer.initoutput_pathc              
   C   s   ddl m} z|jt| dd}W n ty' } z
td|  d| d }~ww |  }| |}| || | 	|| t
d| d|j d	 t|| ~~|S )
Nr   r   auto)torch_dtypez&Failed to load the model from source 'z': z-Converted Phi3 model to Nemo, model saved to z in .)transformersr   from_pretrainedr!   	Exception
ValueErrorrC   
nemo_setupconvert_state	nemo_saveprintdtyper   )r:   rD   r   sourceetargettrainerr(   r(   r)   applyN   s   

zHFPhi3Importer.applyc              
   C   s,   ddddddddd	d
	}t j|||tgdS )N embedding.word_embeddings.weight2decoder.layers.*.self_attention.linear_proj.weight1decoder.layers.*.self_attention.linear_qkv.weight&decoder.layers.*.mlp.linear_fc1.weight&decoder.layers.*.mlp.linear_fc2.weight<decoder.layers.*.self_attention.linear_qkv.layer_norm_weight1decoder.layers.*.mlp.linear_fc1.layer_norm_weightdecoder.final_layernorm.weightoutput_layer.weight)	model.embed_tokens.weight&model.layers.*.self_attn.o_proj.weight(model.layers.*.self_attn.qkv_proj.weight&model.layers.*.mlp.gate_up_proj.weight#model.layers.*.mlp.down_proj.weight%model.layers.*.input_layernorm.weight.model.layers.*.post_attention_layernorm.weightmodel.norm.weightlm_head.weightmapping
transforms)r   apply_transforms_import_qkvr:   rQ   rS   ri   r(   r(   r)   rM   c   s   zHFPhi3Importer.convert_statec                 C   s   ddl m} || t| S )Nr   )AutoTokenizer)=nemo.collections.common.tokenizers.huggingface.auto_tokenizerrn   save_hf_tokenizer_assetsr!   )r:   rn   r(   r(   r)   r6   t   s   zHFPhi3Importer.tokenizerc                 C   s~   ddl m} |t| }dd }t|j|j|j|j|j|j	|j
d||jdt|tjkt|tjkt|d}td| |S )	Nr   r   c                 S   s(   d}| | dkr|d }| | dks|S )N   r      r(   )r2   baser(   r(   r)   make_vocab_size_divisible_by   s
   z;HFPhi3Importer.config.<locals>.make_vocab_size_divisible_byTF)r,   r-   r.   r/   init_method_stdlayernorm_epsilonr1   r   rt   r   fp16bf16params_dtypezoutput:)rH   r   rI   r!   num_hidden_layersr-   intermediate_sizer/   initializer_rangerms_norm_eps
rope_thetar2   r   torchfloat16bfloat16rO   )r:   HFPhi3ConfigrQ   rt   outputr(   r(   r)   r4   {   s(   
zHFPhi3Importer.configN)r   r   r    r3   rC   r   rU   rM   propertyr6   r   r4   r(   r(   r(   r)   r@   H   s    
r@   r   c                   @   sJ   e Zd ZdddZdedefddZdd	 Zed
d ZedddZ	dS )HFPhi3ExporterrA   r   c                 C   s   ddl m} || jS )Nr   )AutoModelForCausalLM)rH   r   from_configr4   )r:   r   r(   r(   r)   rC      s   zHFPhi3Exporter.initrD   c                 C   sD   |   }| t| \}}| ||}| | | j| |S N)rC   	nemo_loadr!   rM   cpusave_pretrainedr6   )r:   rD   rS   rQ   _r(   r(   r)   rU      s   zHFPhi3Exporter.applyc              	   C   s*   ddddddddd	}t j|||tgd
S )Nr_   r`   rb   rc   rd   re   rf   rg   )rV   rW   rY   rZ   r[   r\   r]   r^   rh   )r   rk   _export_qkvrm   r(   r(   r)   rM      s   zHFPhi3Exporter.convert_statec                 C   s   t t| jjjS r   )r   load_contextr!   modelr6   rB   r(   r(   r)   r6      s   zHFPhi3Exporter.tokenizerr   c                 C   sV   t jt| dd}ddlm} |dg|j|j|j|j|j	dd|j
|j| jj| jjdS )	Nzmodel.config)subpathr   r   r   g{Gz?gh㈵>)architecturesrz   r-   r{   r/   max_position_embeddingsr|   r}   num_key_value_headsr~   r2   pad_token_id)r   r   r!   rH   r   r,   r-   r.   r/   r   r0   r1   r6   r2   r   )r:   rQ   r   r(   r(   r)   r4      s    zHFPhi3Exporter.configN)rA   r   )rA   r   )
r   r   r    rC   r   rU   rM   r   r6   r4   r(   r(   r(   r)   r      s    


r   ra   rX   )
source_key
target_keyctxc              	   C   s  | j j}|j}|j}|| }|j}|j}| }|||d f}	|||d f}
|j|| || || gdd\}}}|j|	 }|j|
 }|j|
 }t	
d||d f|}t|D ]B}t	|||| |d | d d d d f f}t	||||d d d d d f f}t	||||d d d d d f f}qY|jdksJ |j|jd |d | ksJ |j|jd |ksJ |j|jd |d ksJ |j|||d|   |g}|S )N   r   dim   rr   )rS   r4   r/   r0   r-   kv_channelssizesplitviewr   emptytype_asrangecatndimshapereshape)r   
qkv_weightmegatron_confighead_numr0   heads_per_groupr-   	head_sizeold_tensor_shapenew_q_tensor_shapenew_kv_tensor_shapeqkvqkv_weightsir(   r(   r)   rl      s4   


0(* rl   c                    s   | j j}|j}|j}||  |j}|j}|d|  }||||g}t fddt	|D }t
 | d }	t
 d | d }
|| d| }||	 d| }||
 d| }tj|||gddS )a  Transform function to convert fused QKV weights to separate Q,K,V format.

    Converts NeMo's fused QKV projection weights to HF's separate Q, K, V format,
    handling grouped query attention (GQA) appropriately.

    Args:
        ctx: Transform context
        linear_qkv: Fused QKV projection weights

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Separate Q, K, V projection weights
    rr   c                    s,   g | ]}t  d  |  d  |   qS )rr   )r   arange).0r   r   r(   r)   
<listcomp>  s    z_export_qkv.<locals>.<listcomp>r   r   r   )rQ   r4   r/   r0   r-   r   r   r   r   r   r   r   )r   
linear_qkvr   r   r0   r-   r   qkv_total_dimq_slicek_slicev_sliceq_projk_projv_projr(   r   r)   r      s&   
r   )r   r*   r3   )(dataclassesr   pathlibr   typingr   r   r   r   torch.nn.functionalr   
functionalr#   #nemo.collections.llm.gpt.model.baser   r	   nemo.lightningr
   r   r   nemo.lightning.pytorch.utilsr   rH   r   r   r   1nemo.collections.common.tokenizers.tokenizer_specr   r*   r3   model_importerModelConnectorr@   model_exporterr   state_transformTransformCTXrl   r   __all__r(   r(   r(   r)   <module>   sB   

R<"'