o
    }oiJ                     @   s   d dl m  mZ d dlmZmZmZmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZ dd Zdd Zdd Zd	d
 Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Z dd Z!dd  Z"dS )!    N)CodeLlamaConfig7BCodeLlamaConfig13BCodeLlamaConfig34BCodeLlamaConfig70BLlama2Config7BLlama2Config13BLlama2Config70BLlama3ConfigLlama3Config8BLlama3Config70BLlama31ConfigLlama31Config8BLlama31Config70BLlama31Config405BLlamaConfigc                  C   s   t dddd} | jdksJ | jtjksJ | jdu sJ | jdks$J | jdu s+J | jdks2J | j	dks9J | j
dks@J | jdu sGJ d S )	N       )num_attention_heads
num_layershidden_sizeRMSNormTropeF        )r   normalizationactivation_funcFsilugated_linear_unitposition_embedding_typeadd_bias_linear
seq_lengthattention_dropouthidden_dropout#share_embeddings_and_output_weightsconfig r&   ^/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/llm/gpt/model/test_llama.pytest_llama_config$   s   r(   c                  C   s   t dddddd} | jdksJ | jdksJ | jdksJ | jdks%J | jd	ks,J | jd
ks3J | jdu s:J | jdu sAJ | j	du sHJ | j
du sOJ | jdu sVJ | jdu s]J | jdu sdJ | jdkskJ | jdksrJ d S )NP   i         r   )r   r   r   num_query_groupsffn_hidden_sizer   r   g{Gz?gh㈵>FTr   g      ?)r	   r,   r"   r!   r   init_method_stdlayernorm_epsilonr   bias_activation_fusionmasked_softmax_fusionpersist_layer_normbias_dropout_fusionapply_rope_fusionr#   r   rotary_percentr$   r&   r&   r'   test_llama3_config1   s$   
r6   c                  C   s^   t  } | jdks
J | jdksJ | jdksJ | jdksJ | jdks&J | jdks-J d S )Nr   r   i +  r   )r   r   r   r   r,   r-   r   r$   r&   r&   r'   test_llama2_config_7bI      r7   c                  C   sP   t  } | jdks
J | jdksJ | jdksJ | jdksJ | jdks&J d S )N(   i   i 6  )r   r   r   r   r,   r-   r$   r&   r&   r'   test_llama2_config_13bS      r:   c                  C   P   t  } | jdks
J | jdksJ | jdksJ | jdksJ | jdks&J d S )Nr)       @       p  )r   r   r   r   r,   r-   r$   r&   r&   r'   test_llama2_config_70b\   r;   rA   c                  C   ^   t  } | jdks
J | jdksJ | jdksJ | jdksJ | jdks&J | jdks-J d S )N  r=   r   r    8  )r
   rotary_baser    r   r   r-   r   r$   r&   r&   r'   test_llama3_config_8be   r8   rF   c                  C   sz   t  } | jdks
J | jdksJ | jdksJ | jdksJ | jdks&J | jdks-J | jdks4J | jdks;J d S )NrC   r=   r)   r@   r>   gܠ[;Q?   )	r   rE   r    r   r   r-   r   r.   make_vocab_size_divisible_byr$   r&   r&   r'   test_llama3_config_70bo   s   rI   c                  C   sX   t dddd} | jdksJ | jdksJ | jdksJ | jdks#J | jdks*J d S )	Nr   r   )r   r   r   r?      r+   r=   g{Gz?)r   scale_factorlow_freq_factorhigh_freq_factorold_context_lenr.   r$   r&   r&   r'   test_llama31_config{   s   rO   c                  C   rB   )NrC      r   r   rD   )r   rE   r    r   r   r-   r   r$   r&   r&   r'   test_llama31_config_8b   r8   rQ   c                  C   l   t  } | jdks
J | jdksJ | jdksJ | jdksJ | jdks&J | jdks-J | jdks4J d S )NrC   rP   r)   r=   r@   r>   rG   )r   rE   r    r   r   r-   r   rH   r$   r&   r&   r'   test_llama31_config_70b      rS   c                  C   sl   t  } | jdks
J | jdksJ | jdksJ | jdksJ | jdks&J | jdks-J | jdks4J d S )NrC   rP   ~    @  i   rG   )r   rE   r    r   r   r-   r   rH   r$   r&   r&   r'   test_llama31_config_405b   rT   rW   c                  C   &   t  } | jdks
J | jdksJ d S N@B rV   )r   rE   r    r$   r&   r&   r'   test_codellama_config_7b      r[   c                  C   rX   rY   )r   rE   r    r$   r&   r&   r'   test_codellama_config_13b   r\   r]   c                  C   rR   )N0   r=   r>   r?   i V  rZ   rV   )r   r   r   r   r,   r-   rE   r    r$   r&   r&   r'   test_codellama_config_34b   rT   r_   c                  C   r<   )Nr   r)   r=   r@   r>   )r   r    r   r   r-   r   r$   r&   r&   r'   test_codellama_config_70b   r;   r`   )#torch.nn.functionalnn
functionalr   $nemo.collections.llm.gpt.model.llamar   r   r   r   r   r   r   r	   r
   r   r   r   r   r   r   r(   r6   r7   r:   rA   rF   rI   rO   rQ   rS   rW   r[   r]   r_   r`   r&   r&   r&   r'   <module>   s"   D
		
	
