o
    wi?                     @   s  d dl mZ d dlmZ d dlmZmZmZmZm	Z	 d dl
Z
d dlm  mZ d dl
mZ d dlmZmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ erkd dlmZ d dlmZ d dl m!Z! d dl"m#Z# eG dd deZeG dd deZ$eG dd deZ%eG dd deZ&G dd deZ'e(e'dG dd dej)de'f Z*e+e'dG dd dej)e'df Z,g dZ-dS )     )	dataclass)Path)TYPE_CHECKING	AnnotatedCallableListOptionalN)nn)	GPTConfigGPTModel)Config)OptimizerModuleioteardown)TransformFns)dtype_from_hfStarcoder2ConfigStarcoder2ForCausalLMAutoTokenizer)TokenizerSpecc                   @   s   e Zd ZU dZdZeed< ejZ	e
ed< dZeed< dZeed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeee  ed< dZeed< dZeed< dZeed< dZeed< dS )r   zS
    Configuration class for the Starcoder2 Config, inheriting from GPTConfig.
    	LayerNormnormalizationactivation_funcTadd_bias_lineari @  
seq_lengthropeposition_embedding_typeg      ?rotary_percentg        hidden_dropoutattention_dropoutg{Gz?init_method_stdF#share_embeddings_and_output_weightsNkv_channelsnum_query_groupswindow_sizeattention_softmax_in_fp32bias_activation_fusionbias_dropout_fusiongh㈵>layernorm_epsilon)__name__
__module____qualname____doc__r   str__annotations__Fgelur   r   r   boolr   intr   r    floatr!   r"   r#   r$   r%   r&   r'   r   r   r(   r)   r*   r+    r7   r7   f/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/starcoder2.pyr   %   s&   
 r   c                   @   f   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZeed< dS )Starcoder2Config3Bz]
    Configuration class for the Starcoder2 3B Config, inheriting from Starcoder2Config.
       
num_layersi   hidden_sizei 0  ffn_hidden_size   r&      num_attention_headsVy?r#   gR~.Arotary_baseNr,   r-   r.   r/   r<   r5   r1   r=   r>   r&   rA   r#   r6   rC   r7   r7   r7   r8   r:   ?      
 r:   c                   @   r9   )Starcoder2Config7Bz]
    Configuration class for the Starcoder2 7B Config, inheriting from Starcoder2Config.
        r<   i   r=   i H  r>      r&   $   rA   rB   r#   i@B rC   NrD   r7   r7   r7   r8   rF   N   rE   rF   c                   @   r9   )Starcoder2Config15Bz^
    Configuration class for the Starcoder2 15B Config, inheriting from Starcoder2Config.
    (   r<   i   r=   i `  r>   rH   r&   0   rA   g&1?r#   i rC   NrD   r7   r7   r7   r8   rJ   ]   rE   rJ   c                       sf   e Zd ZdZ				d
deee ee f dee ded dee	e
jge
jf  f fdd	Z  ZS )Starcoder2Modelz
    Starcoder2 model implementation based on the GPT model architecture.

    This class provides a high-level interface for Starcoder2 models,
    implementing the specific architecture and settings needed for Starcoder2 models.
    Nconfigoptim	tokenizerr   model_transformc                    s   t  j|pt |||d d S )N)rO   rP   rQ   )super__init__r   )selfrN   rO   rP   rQ   	__class__r7   r8   rS   t   s   
zStarcoder2Model.__init__)NNNN)r,   r-   r.   r/   r   r   r   r   r   r   r	   ModulerS   __classcell__r7   r7   rU   r8   rM   l   s    	rM   hfc                   @   sX   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
e	defddZdS )HFStarcoder2Importerz
    Importer for converting Hugging Face Starcoder2 models to NeMo format.

    This class handles the conversion of Hugging Face's Starcoder2ForCausalLM models
    to NeMo's Starcoder2 format, including weight mapping and configuration translation.
    returnc                 C   s   t | j| jdS )z
        Initialize a NeMo Starcoder2Model instance.

        Returns:
            Starcoder2Model: Initialized NeMo Starcoder2 model with the appropriate configuration
                        and tokenizer.
        )rP   )rM   rN   rP   rT   r7   r7   r8   init   s   zHFStarcoder2Importer.initoutput_pathc                 C   sh   ddl m} |jt| dd}|  }| |}| || | || td|  t	|| ~~|S )z
        Apply the conversion from HF to NeMo format.

        Args:
            output_path: Path where the converted model will be saved

        Returns:
            Path: Path to the saved NeMo model
        r   r   auto)torch_dtypez3Converted Starcoder2 model to Nemo, model saved to )
transformersr   from_pretrainedr0   r]   
nemo_setupconvert_state	nemo_saveprintr   )rT   r^   r   sourcetargettrainerr7   r7   r8   apply   s   


zHFStarcoder2Importer.applyc                 C   sX   ddddddddd	d
ddddd}t jddtjdt jddtjdg}t j||||dS )aS  
        Convert state dict from HF format to NeMo format.

        Maps the weights from the HF model to the NeMo model according to
        the appropriate mapping scheme.

        Args:
            source: Source HF model
            target: Target NeMo model

        Returns:
            The result of applying the transforms
         embedding.word_embeddings.weight2decoder.layers.*.self_attention.linear_proj.weight0decoder.layers.*.self_attention.linear_proj.bias&decoder.layers.*.mlp.linear_fc1.weight$decoder.layers.*.mlp.linear_fc1.bias&decoder.layers.*.mlp.linear_fc2.weight$decoder.layers.*.mlp.linear_fc2.bias<decoder.layers.*.self_attention.linear_qkv.layer_norm_weight:decoder.layers.*.self_attention.linear_qkv.layer_norm_bias1decoder.layers.*.mlp.linear_fc1.layer_norm_weight/decoder.layers.*.mlp.linear_fc1.layer_norm_biasdecoder.final_layernorm.weightdecoder.final_layernorm.biasoutput_layer.weight)model.embed_tokens.weight&model.layers.*.self_attn.o_proj.weight$model.layers.*.self_attn.o_proj.biasmodel.layers.*.mlp.c_fc.weightmodel.layers.*.mlp.c_fc.bias model.layers.*.mlp.c_proj.weightmodel.layers.*.mlp.c_proj.bias%model.layers.*.input_layernorm.weight#model.layers.*.input_layernorm.bias.model.layers.*.post_attention_layernorm.weight,model.layers.*.post_attention_layernorm.biasmodel.norm.weightmodel.norm.biaslm_head.weightz&model.layers.*.self_attn.q_proj.weightz&model.layers.*.self_attn.k_proj.weightz&model.layers.*.self_attn.v_proj.weight1decoder.layers.*.self_attention.linear_qkv.weight
source_key
target_keyfnz$model.layers.*.self_attn.q_proj.biasz$model.layers.*.self_attn.k_proj.biasz$model.layers.*.self_attn.v_proj.bias/decoder.layers.*.self_attention.linear_qkv.biasmapping
transforms)r   state_transformr   	merge_qkvmerge_qkv_biasapply_transformsrT   rg   rh   r   r   r7   r7   r8   rd      s6   	z"HFStarcoder2Importer.convert_stater   c                 C   s   ddl m} || t| S )z
        Get the tokenizer for the HF model.

        Returns:
            AutoTokenizer: Tokenizer instance initialized from the HF model's tokenizer
        r   r   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   save_hf_tokenizer_assetsr0   )rT   r   r7   r7   r8   rP      s   zHFStarcoder2Importer.tokenizerc                 C   sz   ddl m} |t| }dd }t|j|j|j|j|j|j	|j
|j|j||jdt|tjkt|tjkt|d}|S )a  
        Create a NeMo Starcoder2Config from the HF model config.

        Translates the HF configuration parameters to the equivalent NeMo
        configuration.

        Returns:
            Starcoder2Config: NeMo configuration for Starcoder2 models
        r   r   c                 S   s(   d}| | dkr|d }| | dks|S )N   r   r?   r7   )
vocab_sizebaser7   r7   r8   make_vocab_size_divisible_by   s
   zAHFStarcoder2Importer.config.<locals>.make_vocab_size_divisible_byF)r<   r=   r>   rA   r#   r   r+   r&   rC   r   r$   fp16bf16params_dtype)ra   r   rb   r0   num_hidden_layersr=   intermediate_sizerA   initializer_rangemax_position_embeddingsnorm_epsilonnum_key_value_heads
rope_thetar   r   torchfloat16bfloat16)rT   HFStarcoder2Configrg   r   outputr7   r7   r8   rN      s(   zHFStarcoder2Importer.configN)r[   r   )r,   r-   r.   r/   rM   r]   r   rj   rd   propertyrP   r   rN   r7   r7   r7   r8   rZ      s    
5rZ   r   c                   @   sN   e Zd ZdZdddZdedefddZd	d
 Zedd Z	edddZ
dS )HFStarcoder2Exporterz
    Exporter for converting NeMo Starcoder2Model to Hugging Face format.

    This class handles the conversion of NeMo's Starcoder2Model to Hugging Face's
    Starcoder2ForCausalLM format, including weight mapping and configuration translation.
    r[   r   c                 C   sN   ddl m} ddlm} |  || jW  d   S 1 s w   Y  dS )z
        Initialize a HF Starcoder2ForCausalLM instance.

        Args:
            dtype: Data type for model parameters

        Returns:
            Starcoder2ForCausalLM: Initialized HF Starcoder2 model
        r   r   )no_init_weightsN)ra   r   transformers.modeling_utilsr   _from_configrN   )rT   r   r   r7   r7   r8   r]     s
   

$zHFStarcoder2Exporter.initr^   c                 C   sH   |   }| t| \}}| ||}| }|| | j| |S )N)r]   	nemo_loadr0   rd   cpusave_pretrainedrP   )rT   r^   rh   rg   _r7   r7   r8   rj   /  s   
zHFStarcoder2Exporter.applyc                 C   st   ddddddddd	d
ddd}t jddtjdt jddtjdt jddtjdt jddtjdg}t j||||dS )ac  
        Convert state dict from NeMo format to HF format.

        Maps the weights from the NeMo model to the HF model according to
        the appropriate mapping scheme.

        Args:
            source: Source NeMo model
            target: Target HF model

        Returns:
            The target model with weights transferred from source
        rz   r{   r|   r}   r~   r   r   r   r   r   r   r   )rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   r   r   r   r   r   rk   ry   rx   r   r   )r   r   r   	split_qkvsplit_qkv_biasprune_paddingr   r   r7   r7   r8   rd   :  sP   		z"HFStarcoder2Exporter.convert_statec                 C   s   t t| jjjS )z
        Get the tokenizer from the NeMo model.

        Returns:
            TokenizerSpec: Tokenizer from the NeMo model
        )r   load_contextr0   modelrP   r\   r7   r7   r8   rP   |  s   zHFStarcoder2Exporter.tokenizerr   c                 C   sv   ddl m} tjt| dd}|dg|j|j|j|j|j	dur"|j	n|j|j |j
|j|j|j|j|j|j| jjdS )zCreate a HF HFStarcoder2Config from the NeMo model config.

        Translates the NeMo configuration parameters to the equivalent HF
        configuration.

        Returns:
            HFStarcoder2Config: HF configuration for Starcoder2 models
        r   r   zmodel.config)subpathr   N)architecturesr   r=   r   rA   head_dimtie_word_embeddingsr   r   norm_epsr   r   partial_rotary_factorr   )ra   r   r   r   r0   r<   r=   r>   rA   r%   r$   r   r#   r+   r&   rC   r    rP   r   )rT   r   rg   r7   r7   r8   rN     s(   


zHFStarcoder2Exporter.configN)r[   r   )r[   r   )r,   r-   r.   r/   r]   r   rj   rd   r   rP   rN   r7   r7   r7   r8   r     s    
B
	r   )r   r:   rF   rJ   rM   ).dataclassesr   pathlibr   typingr   r   r   r   r   r   torch.nn.functionalr	   
functionalr2   #nemo.collections.llm.gpt.model.baser
   r   nemo.collections.llm.utilsr   nemo.lightningr   r   r   nemo.lightning.io.stater   nemo.lightning.pytorch.utilsr   ra   r   r   r   r   r   1nemo.collections.common.tokenizers.tokenizer_specr   r:   rF   rJ   rM   model_importerModelConnectorrZ   model_exporterr   __all__r7   r7   r7   r8   <module>   s@   
 
 