o
    }oiQ                     @   s   d dl Z d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ e d
eZdedeje fddZdejdeje defddZdedeje
 defddZdee dedB fddZdS )    N)List)Callback)AutoTokenizer)SquadDataModule)GPTModel)MegatronCommOverlapCallback)DEFAULT_NEMO_CACHE_HOME)logging	NEMO_HOME
model_namereturnc                 C   s6   dt  dddddg}td| tjt| dd	S )
a8  
    HuggingFace tokenizer.

    Args:
        model_name (str): corresponds to HuggingFace-AutoTokenizer's 'pretrained_model_name_or_path' input argument.
                For more details please refer to-
                huggingface.co/docs/transformers/v4.47.1/en/model_doc/auto#transformers.AutoTokenizer
    zE`AutoTokenizer` first searches for tokenizer files locally stored in .zK(from env var `NEMO_HOME`- can be changed using '-nh/--nemo_home' CLI arg).zbIf files are missing locally, `AutoTokenizer` will try downloading from HuggingFace. In this case-zkmake sure env vars 'TRANSFORMERS_OFFLINE':'0' and 'HF_TOKEN':'<token_value>' are set in your sbatch script.zPBoth of these will be set automatically if you provide '-hf/--hf_token' CLI arg. T)pretrained_model_nameuse_fast)DEFAULT_NEMO_HOMEr	   warningjoinrunConfigr   )r   log_msg r   M/home/ubuntu/.local/lib/python3.10/site-packages/scripts/performance/utils.pyhf_tokenizer   s   

r   executormodelsourcec                 C   sD   ddl m} ddlm} || }d|_d|_tj|||dd|dfS )a  
    Downloads/Acceses checkpoint to be used for fine-tuning. `import_ckpt` first tries find the nemo checkpoint in
    <NEMO_HOME>/models/. For eg: for llama3 8b, the path will look like- <NEMO_HOME>/models/meta-llama/Meta-Llama-3-8B
    If missing, tries to downloads at the same location from HuggingFace and converts it nemo format.

    Args:
        source (str): HuggingFace URL. For eg- hf://meta-llama/Meta-Llama-3-70B
    r   )deepcopy)import_ckpt   F)r   r   	overwriteimport_ckpt_exp)copyr   nemo.collections.llmr   ntasks_per_nodenodesr   Partial)r   r   r   r   r   import_executorr   r   r   import_ckpt_experiment8   s   	r(   hf_model_uridata_configc                 C   sZ   t dt jtd}| dd}|j d}t j|dd||}t j|o,t j|S )z
    This method is used for fine-tuning. It checks if packed train data for a partiular
    sequence length exists locally. This is needed to set data flag (force_redownload=True)
    which avoids experiment crash in case files are missing.
    NEMO_DATASETS_CACHEdatasets/z--z_metadata.jsonlsquadpacked)	osgetenvpathr   r   replace
seq_lengthexistsisfile)r)   r*   datasets_dir	model_dirmetadata_filenametrain_pack_metadata_filepathr   r   r   isfile_train_pack_metadataL   s
   r;   	callbacksc                 C   s,   | rt | D ]\}}|jtkr|  S qdS )a   
    nemo.lightning.Trainer has a list of callbacks defined. This method identifies index of MegatronCommOverlapCallback
    from the list defined in recipes in nemo.collections.llm.recipes. The index is needed to override ddp communication
    params
    N)	enumerate__fn_or_cls__r   )r<   idxcallbackr   r   r   get_comm_overlap_callback_idx[   s   
rA   )r0   typingr   nemo_runr   $lightning.pytorch.callbacks.callbackr   .nemo.collections.common.tokenizers.huggingfacer   #nemo.collections.llm.gpt.data.squadr   nemo.collections.llm.gpt.modelr   &nemo.collections.llm.recipes.llama3_8br   nemo.lightning.baser   
nemo.utilsr	   r1   r   strr   r   SlurmExecutorr(   boolr;   intrA   r   r   r   r   <module>   s   