o
    wi~                   	   @   sd  d dl Z d dlZd dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ edeZdedeje fddZdejdeje defddZd,ddZd-dede fddZ!	d-dejdede fddZ"dedeje de#fd d!Z$d"ee de dB fd#d$Z%	%d.d&ed'ed(ed)efd*d+Z&dS )/    N)List)Callback)YamlSerializer)
_serializeAutoTokenizerSquadDataModule)GPTModel)MegatronCommOverlapCallback)DEFAULT_NEMO_CACHE_HOME)logging	NEMO_HOME
model_namereturnc                 C   s6   dt  dddddg}td| tjt| dd	S )
a8  
    HuggingFace tokenizer.

    Args:
        model_name (str): corresponds to HuggingFace-AutoTokenizer's 'pretrained_model_name_or_path' input argument.
                For more details please refer to-
                huggingface.co/docs/transformers/v4.47.1/en/model_doc/auto#transformers.AutoTokenizer
    zE`AutoTokenizer` first searches for tokenizer files locally stored in .zK(from env var `NEMO_HOME`- can be changed using '-nh/--nemo_home' CLI arg).zbIf files are missing locally, `AutoTokenizer` will try downloading from HuggingFace. In this case-zkmake sure env vars 'TRANSFORMERS_OFFLINE':'0' and 'HF_TOKEN':'<token_value>' are set in your sbatch script.zPBoth of these will be set automatically if you provide '-hf/--hf_token' CLI arg. T)pretrained_model_nameuse_fast)DEFAULT_NEMO_HOMEr   warningjoinrunConfigr   )r   log_msg r   V/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/scripts/performance/utils.pyhf_tokenizer"   s   

r   executormodelsourcec                 C   sD   ddl m} ddlm} || }d|_d|_tj|||dd|dfS )a  
    Downloads/Acceses checkpoint to be used for fine-tuning. `import_ckpt` first tries find the nemo checkpoint in
    <NEMO_HOME>/models/. For eg: for llama3 8b, the path will look like- <NEMO_HOME>/models/meta-llama/Meta-Llama-3-8B
    If missing, tries to downloads at the same location from HuggingFace and converts it nemo format.

    Args:
        source (str): HuggingFace URL. For eg- hf://meta-llama/Meta-Llama-3-70B
    r   deepcopy)import_ckpt   F)r   r    	overwriteimport_ckpt_exp)copyr"   nemo.collections.llmr#   ntasks_per_nodenodesr   Partial)r   r   r    r"   r#   import_executorr   r   r   import_ckpt_experiment;   s   	r-   c                 C   sh   | du }dt jv }|r%|r%t jd | kr#td|  dt jd  d | S |r)| S |r0t jd S td)zd
    Get NEMO_HOME path. Checks for both nemo_home argument and NEMO_HOME environment variable.
    Tr   zUsing nemo_home (z) instead of NEMO_HOME ()zJNeither -nh/--nemo_home argument nor NEMO_HOME environment variable is set)osenvironr   r   
ValueError)	nemo_homearg_nemo_setenv_nemo_setr   r   r   get_nemo_homeO   s   

r5      
seq_lengthc                 C   s   ddl m} ddlm} ddlm} ddlm} |t|}|d d }|j	ddd	 || d
}	|||dd||d|	dddd	}
|

  |d | dd }td|  | retdt|d dS td| d)a  Prepare the SQuAD dataset for fine-tuning.

    Args:
        model_name (str): The name of the model
        seq_length (int): The sequence length to use for packing. Defaults to 2048.
        nemo_home: Optional path to NEMO home directory set via args.nemo_home
    r   )Pathr   )PackedSequenceSpecsr   datasetssquadT)parentsexist_ok)r      r$   )packed_sequence_sizeFi  )	dataset_rootr7   global_batch_sizemicro_batch_sizepacked_sequence_specs	tokenizerforce_redownload
delete_rawseedpacked/--zPacked files should be in: zFiles found:*z Packed dataset dir not found at z. Dataset download failedN)pathlibr8   =nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   -nemo.collections.llm.gpt.data.packed_sequencer9   #nemo.collections.llm.gpt.data.squadr	   r5   mkdirprepare_datareplaceprintexistslistglobFileNotFoundError)r   r7   r2   r8   r   r9   r	   nemo_home_pathr@   rD   
datamodule
packed_dirr   r   r   prepare_squad_datasetd   s2   
r[   c                 C   s8   ddl m} || }d|_d|_tjt|||d|dfS )zC
    Downloads and prepares the SQuAD dataset for fine-tuning.
    r   r!   r$   )r   r7   r2   prepare_squad_dataset_exp)r'   r"   r)   r*   r   r+   r[   )r   r   r7   r2   r"   dataset_executorr   r   r    prepare_squad_dataset_experiment   s   r^   hf_model_uridata_configc                 C   sZ   t dt jtd}| dd}|j d}t j|dd||}t j|o,t j|S )z
    This method is used for fine-tuning. It checks if packed train data for a partiular
    sequence length exists locally. This is needed to set data flag (force_redownload=True)
    which avoids experiment crash in case files are missing.
    NEMO_DATASETS_CACHEr:   rI   rJ   z_metadata.jsonlr;   rH   )	r/   getenvpathr   r   rR   r7   rT   isfile)r_   r`   datasets_dir	model_dirmetadata_filenametrain_pack_metadata_filepathr   r   r   isfile_train_pack_metadata   s
   ri   	callbacksc                 C   s,   | rt | D ]\}}|jtkr|  S qdS )a   
    nemo.lightning.Trainer has a list of callbacks defined. This method identifies index of MegatronCommOverlapCallback
    from the list defined in recipes in nemo.collections.llm.recipes. The index is needed to override ddp communication
    params
    N)	enumerate__fn_or_cls__r   )rj   idxcallbackr   r   r   get_comm_overlap_callback_idx   s   
ro   config_diff.txtbase_recipe
new_recipe
output_dir	file_namec                 C   s   t | td}t |td}tj|jdd|jdddddd}d|}tdtj|| t	tj||d	}|
| W d
   d
S 1 sHw   Y  d
S )z4
    Dump the config diff from the base recipe.
    )serializer_clsT)keependsrq   rr    )fromfiletofilelinetermzdumping config diff to wN)r   r   difflibunified_diff
splitlinesr   rS   r/   rc   openwrite)rq   rr   rs   rt   base_recipe_confignew_recipe_configdifffr   r   r   !dump_config_diff_from_base_recipe   s   


"r   )N)r6   N)rp   )'r|   r/   typingr   nemo_runr   $lightning.pytorch.callbacks.callbackr    nemo_run.core.serialization.yamlr   %nemo_run.run.torchx_backend.packagingr   .nemo.collections.common.tokenizers.huggingfacer   rO   r	   nemo.collections.llm.gpt.modelr
   &nemo.collections.llm.recipes.llama3_8br   nemo.lightning.baser   
nemo.utilsr   rb   r   strr   r   SlurmExecutorr-   r5   intr[   r^   boolri   ro   r   r   r   r   r   <module>   sL   
.
