o
    wi.                     @   s  d dl mZmZ d dlmZ d dlmZmZmZm	Z	 d dl
Z
d dlm  mZ d dl
mZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ ersd dl m!Z!m"Z" d dl#m$Z$ d dl%m&Z& eG dd deZ'eG dd de'Z(eG dd de'Z)G dd deZ*e+e*dG dd dej,de*f Z-e.e*dG dd dej,e*df Z/ddgZ0dS )    )	dataclassfield)Path)TYPE_CHECKINGCallableListOptionalN)nn)	Annotated)	GPTConfigGPTModeltorch_dtype_from_mcore_config)Config)ioteardown)TransformFns)OptimizerModule)dtype_from_hf)MistralConfigMistralForCausalLMAutoTokenizer)TokenizerSpecc                   @   s
  e Zd ZU dZdZeed< ejZ	e
ed< dZeed< dZeed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< edd d Zee ed!< d"Zeed#< e j!Z"e j#ed$< d%S )&MistralConfig7Bz
    Mistral 7B config.
    RMSNormnormalizationactivation_funcropeposition_embedding_typeFadd_bias_linearTgated_linear_unit    
num_layers   hidden_sizenum_attention_heads   num_query_groupsi 8  ffn_hidden_sizei   
seq_lengthg        attention_dropouthidden_dropout#share_embeddings_and_output_weightsg{Gz?init_method_stdgh㈵>layernorm_epsilonc                   C   s   ddgS )Nr#   r    r/   r/   r/   c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/mistral.py<lambda>>   s    zMistralConfig7B.<lambda>)default_factorywindow_sizea2acp_comm_typeparams_dtypeN)$__name__
__module____qualname____doc__r   str__annotations__Fsilur   r   r   r   boolr    r"   intr$   r%   r'   r(   r)   r*   floatr+   r,   r-   r.   r   r3   r   r5   torchbfloat16r6   dtyper/   r/   r/   r0   r   &   s*   
 r   c                   @   s   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
ee ed< d
Zeed< dZeed< dZeed< ejZejed< d
S )MistralNeMoConfig12Bz/
    https://mistral.ai/news/mistral-nemo/
    (   r"   i   r$      kv_channelsr#   r)   Nr3   r5         ?rotary_percent    .Arotary_baser6   )r7   r8   r9   r:   r"   r@   r<   r$   rH   r)   r3   r   r5   r;   rJ   rA   rL   rB   rC   r6   rD   r/   r/   r/   r0   rE   C   s   
 rE   c                   @   s   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZee ed< dZeed< dZeed< dZeed< ejZejed< dS )MistralNeMoConfig123Bz5
    https://mistral.ai/news/mistral-large-2407/
    X   r"   i 0  r$   i p  r(   `   r%   rG   rH   r#   r)   Nr3   r5   rI   rJ   rK   rL   r6   )r7   r8   r9   r:   r"   r@   r<   r$   r(   r%   rH   r)   r3   r   r5   r;   rJ   rA   rL   rB   rC   r6   rD   r/   r/   r/   r0   rM   U   s   
 rM   c                       sf   e Zd ZdZ				d
deee ee f dee ded dee	e
jge
jf  f fdd	Z  ZS )MistralModel Nconfigoptim	tokenizerr   model_transformc                    s   t  j|pt |||d d S )N)rS   rT   rU   )super__init__r   )selfrR   rS   rT   rU   	__class__r/   r0   rW   l   s   
zMistralModel.__init__)NNNN)r7   r8   r9   r:   r
   r   r   r   r   r   r	   ModulerW   __classcell__r/   r/   rY   r0   rP   i   s    rP   hfc                   @   sX   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
e	defddZdS )HFMistralImporterrQ   returnc                 C   s   t | j| jdS )N)rT   )rP   rR   rT   rX   r/   r/   r0   init|   s   zHFMistralImporter.initoutput_pathc                 C   sh   ddl m} |jt| dd}|  }| |}| || | || td|  t	|| ~~|S )Nr   )r   autotorch_dtypez3Converted Mistral 7B model to Nemo, model saved to )
transformersr   from_pretrainedr;   ra   
nemo_setupconvert_state	nemo_saveprintr   )rX   rb   r   sourcetargettrainerr/   r/   r0   apply   s   

zHFMistralImporter.applyc                 C   sJ   dddddddd}t jd	d
tjdt jddtjdg}t j||||dS )rQ    embedding.word_embeddings.weight2decoder.layers.*.self_attention.linear_proj.weight&decoder.layers.*.mlp.linear_fc2.weight<decoder.layers.*.self_attention.linear_qkv.layer_norm_weight1decoder.layers.*.mlp.linear_fc1.layer_norm_weightdecoder.final_layernorm.weightoutput_layer.weight)model.embed_tokens.weight&model.layers.*.self_attn.o_proj.weight#model.layers.*.mlp.down_proj.weight%model.layers.*.input_layernorm.weight.model.layers.*.post_attention_layernorm.weightmodel.norm.weightlm_head.weightz&model.layers.*.self_attn.q_proj.weightz&model.layers.*.self_attn.k_proj.weightz&model.layers.*.self_attn.v_proj.weight1decoder.layers.*.self_attention.linear_qkv.weight
source_key
target_keyfnz#model.layers.*.mlp.gate_proj.weightz!model.layers.*.mlp.up_proj.weight&decoder.layers.*.mlp.linear_fc1.weightmapping
transforms)r   state_transformr   	merge_qkv	merge_fc1apply_transformsrX   rl   rm   r   r   r/   r/   r0   ri      s(   	zHFMistralImporter.convert_stater   c                 C   s   ddl m} || t| S )rQ   r   r   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   save_hf_tokenizer_assetsr;   )rX   r   r/   r/   r0   rT      s   zHFMistralImporter.tokenizerc           	   	   C   s  ddl m}m} |t| }|t| }dd }d\}}t|dddur-|jdg}d}tdi d	|jd
|jd|j	d|j
dt|d|j	|j d|jd|jd|jd|jd|jddd||jd|d|dddt|tjkdt|tjkdt|d|}|S ) rQ   r   )GenerationConfigr   c                 S   s(   d}| | dkr|d }| | dks|S )NrG   r      r/   )mistral_vocab_sizebaser/   r/   r0   make_vocab_size_divisible_by   s
   z>HFMistralImporter.config.<locals>.make_vocab_size_divisible_by)NNsliding_windowNr4   r)   r"   r$   r(   rH   head_dimr%   r-   r.   r'   rL   r    Tr   r3   r5   r,   Ffp16bf16r6   generation_configr/   )rf   r   r   rg   r;   getattrr   r   num_hidden_layersr$   intermediate_sizer%   initializer_rangerms_norm_epsnum_key_value_heads
rope_theta
vocab_sizer   rB   float16rC   )	rX   r   r   rl   r   r   r3   r5   outputr/   r/   r0   rR      s`   
	

zHFMistralImporter.configN)r_   r   )r7   r8   r9   r:   rP   ra   r   ro   ri   propertyrT   r   rR   r/   r/   r/   r0   r^   x   s    r^   r   c                   @   sT   e Zd ZdZejfdddZdedefddZd	d
 Z	e
dd Ze
dddZdS )HFMistralExporterrQ   r_   r   c                 C   sR   ddl m} ddlm} |  |j| j|dW  d    S 1 s"w   Y  d S )Nr   )AutoModelForCausalLM)no_init_weightsrd   )rf   r   transformers.modeling_utilsr   from_configrR   )rX   rD   r   r   r/   r/   r0   ra      s
   $zHFMistralExporter.initrb   c                 C   sP   |  t| \}}| t|j}| ||}| }|| | j| |S )N)		nemo_loadr;   ra   r   rR   ri   cpusave_pretrainedrT   )rX   rb   rl   _rm   r/   r/   r0   ro      s   
zHFMistralExporter.applyc                 C   sf   dddddd}t jddtjd	t jd
dtjd	t jddtjd	t jddtjd	g}t j||||dS )rQ   rx   ry   rz   r{   r|   )rq   rr   rs   rt   ru   rp   rw   r   rv   r}   r   r~   r   r   r   )r   r   r   prune_padding	split_qkv	split_fc1r   r   r/   r/   r0   ri      sB   		zHFMistralExporter.convert_statec                 C   s   t t| jjjS )rQ   )r   load_contextr;   modelrT   r`   r/   r/   r0   rT   %  s   zHFMistralExporter.tokenizerr   c                 C   sn   t jt| dd}ddlm} |dg|jdur|jd nd|j|j|j|j	|j
|j|j|j|j| jj|jdS )rQ   zmodel.config)subpathr   )r   r   N)architecturesr   r   r$   r   r%   max_position_embeddingsr   r   r   r   r   r   )r   r   r;   rf   r   r3   r"   r$   r(   r%   r)   r-   r.   r'   rL   rT   r   rH   )rX   rl   HfMistralConfigr/   r/   r0   rR   *  s"   zHFMistralExporter.configN)r_   r   )r_   r   )r7   r8   r9   r:   rB   rC   ra   r   ro   ri   r   rT   rR   r/   r/   r/   r0   r      s    +
r   )1dataclassesr   r   pathlibr   typingr   r   r   r   rB   torch.nn.functionalr	   
functionalr=   typing_extensionsr
   #nemo.collections.llm.gpt.model.baser   r   r   nemo.collections.llm.utilsr   nemo.lightningr   r   nemo.lightning.io.stater   nemo.lightning.pytorch.optimr   nemo.lightning.pytorch.utilsr   rf   r   r   r   r   1nemo.collections.common.tokenizers.tokenizer_specr   r   rE   rM   rP   model_importerModelConnectorr^   model_exporterr   __all__r/   r/   r/   r0   <module>   s>   

gb