o
    }oi"                     @   s@  d dl Z d dlmZ d dlmZ d dlZd dlZd dl	m
Z d dlmZ d dlmZ d dlmZ deded	ed
edejf
ddZd
edejfddZG dd dejZG dd dejZG dd dejZG dd dejZdeddfddZ	d(dejdededed eddfd!d"Z d#ej!fd$d%Z"eG d&d' d'ej#Z$dS ))    N)	dataclass)	lightning)llm)SentencePieceTokenizer)logging	data_pathtokenizer_pathindex_mapping_dir
seq_lengthreturnc              
   C   s*   t jt|d}t jtj| ||ddd|dS )z/Single shard dataset tokenized by SentencePiece)
model_path       i  )paths	tokenizerr
   micro_batch_sizeglobal_batch_sizeseedr	   )runConfigr   r   PreTrainingDataModule)r   r   r	   r
   r    r   P/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/llm/common.py
train_data   s   r   c              
   C   s   t jtjd| ddddddS )zSmall 145m modeli        i
     gZd;O?)rotary_baser
   
num_layershidden_sizeffn_hidden_sizenum_attention_headsinit_method_std)r   r   r   Llama3Config8B)r
   r   r   r   small_llama_cfg-   s   r$   c                   @   s8   e Zd ZdZdefddZdejdejddfd	d
Z	dS )StopBeforeEnda4  Preemptively stop training at a given global step. Allows stopping training before reaching
    the max steps. Useful for testing checkpoint save and resume.

    Args:
        stop_on_step (int): Stop training when trainer.global_step reaches this value.
            Checked at the start of every step.
    stop_on_stepc                 C   s
   || _ d S N)r&   )selfr&   r   r   r   __init__D   s   
zStopBeforeEnd.__init__trainer	pl_moduler   Nc                 C   sP   |j | jkr$td|j  d| j d d|_|j |j dkr&d|_d S d S d S )NzGlobal step z >= z, signaling Trainer to stop.Tr   )global_stepr&   r   infoshould_stopval_check_intervallimit_val_batches)r(   r*   r+   outputsbatch	batch_idxr   r   r   on_train_batch_endG   s   
z StopBeforeEnd.on_train_batch_end)
__name__
__module____qualname____doc__intr)   plTrainerLightningModuler4   r   r   r   r   r%   ;   s    r%   c                   @   s*   e Zd ZdZdejdejddfddZdS )4AssertOptimizerParamGroupsHaveAtLeastTwoWeightDecaysa  Sanity test weight decay settings in optimizer param groups.

    Background:
        The Megatron/NeMo optimizer splits parameters into groups by whether or not the parameter
        should have weight decay applied. A typlical rule is that `bias` terms and `layer_norm`
        terms for example should not have weight decay applied. This callback checks for the
        existance of two distinct weight decay settings across optimizers and param groups related
        to a bug adddressed in https://github.com/NVIDIA/NeMo/pull/12123.
    r*   r+   r   Nc           	      C   sn   i }t |jD ]\}}t |jD ]\}}d| d| }|d ||< qqtt| dks5J d| d S )Noptz;pgweight_decay   zFAll weight decays in optimizer param groups should not be equal. Got: )	enumerate
optimizersparam_groupslensetvalues)	r(   r*   r+   weight_decaysoioptimpiparam_groupkeyr   r   r   on_train_start]   s   zCAssertOptimizerParamGroupsHaveAtLeastTwoWeightDecays.on_train_start)r5   r6   r7   r8   r:   r;   r<   rM   r   r   r   r   r=   R   s    
r=   c                       sH   e Zd ZdZdef fddZdd Zdejdej	d	d
fddZ
  ZS )MCoreModelAttributeValidatorzOWalk through submodules and verify user-specified attributes like parallelisms.	attr_dictc                       t    || _d S r'   superr)   rO   r(   rO   	__class__r   r   r)   k      

z%MCoreModelAttributeValidator.__init__c              	   C   sP   | j  D ] \}}t||r%t||}||ks%J d| d| d| dqd S )NzKey z for model (z) does not match z! from provided attribute mapping.)rO   itemshasattrgetattr)r(   targetkv	model_valr   r   r   _check_attrso   s   


z)MCoreModelAttributeValidator._check_attrsr*   r+   r   Nc                    s.   dt jjdt jjf fdd}|j| d S )Nmoduler   c                    s   t | dr | j | S )Nconfig)rX   r^   r`   )r_   r(   r   r   walk_fnx   s   
z<MCoreModelAttributeValidator.on_train_start.<locals>.walk_fn)torchnnModulemodelwalk)r(   r*   r+   rb   r   ra   r   rM   w   s   z+MCoreModelAttributeValidator.on_train_start)r5   r6   r7   r8   dictr)   r^   r:   r;   r<   rM   __classcell__r   r   rT   r   rN   h   s
    "rN   c                       sJ   e Zd ZdZdef fddZdejdejddfd	d
Z	dddZ
  ZS )MiscAttributeValidatorzCPlace for any miscellaneous attribute assertions. Extend as needed.rO   c                    rP   r'   rQ   rS   rT   r   r   r)      rV   zMiscAttributeValidator.__init__r*   r+   r   Nc                 C   s|   d| j v r:|jjjjd d j}|j| j d ks&J d|j d| j d  || j d ks<J d| d| j d  d S d S )N	max_stepslr_scheduler	schedulerzTrainer max_steps z did not match provided zScheduler max_steps )rO   rf   rI   rl   
_schedulerrk   )r(   r*   r+   	sched_maxr   r   r   rM      s   
z%MiscAttributeValidator.on_train_start
pl.Trainerpl.LightningModulec                 C   s2   d| j v r|jjjjj}|| j d ksJ d S d S )Nr&   )rO   fit_loop
epoch_loopbatch_progresstotal	completed)r(   r*   r+   total_stepsr   r   r   on_train_end   s
   
z#MiscAttributeValidator.on_train_end)r*   rp   r+   rq   r   N)r5   r6   r7   r8   rh   r)   r:   r;   r<   rM   rx   ri   r   r   rT   r   rj      s
    
rj   	ckpt_pathc                 C   s   t j| }t j| d}t j|sJ d| t jt j|ds,J d| t jt j|ds>J d| dd t |D }tj	 }t
|d	| kscJ d
d	|  dt
| d S )Nweightsz Weights not found in checkpoint z	common.ptz"No 'common.pt' file in checkpoint zmetadata.jsonz&No 'metadata.json' file in checkpoint c                 S   s   g | ]	}| d r|qS )z.distcpendswith).0shardr   r   r   
<listcomp>   s    z%verify_distcp_dir.<locals>.<listcomp>   z)Wrong number of .distcp files, Expected: z Found: )ospathbasenamejoinisdirisfilelistdirrc   distributedget_world_sizerD   )ry   	ckpt_nameweights_dirshards
world_sizer   r   r   verify_distcp_dir   s   $
r   T
model_ckptrk   r/   exp_dir
dist_ckptsc           
      C   sH  t j|d}t |}| jrtdd |D sJ d|| | j }| jdkrEt||ksDt|| j| j ksDJ d| d| j dnt||ksSJ d| d	|D ]L}t j||}	d
|vsiJ d| |drd| j	v rd|d  |v sJ d| d| |rt j
|	sJ dt|	 qUt j|	sJ dqUdS )zEnsures that the provided checkpoint directory has
    - correct number of checkpoints
    - no more than top-k checkpoints
    - no unfinished checkpoints
    - a checkpoint for the last step
    - all checkpoints in the correct format
    checkpointsc                 S   s   g | ]}| d qS )-lastr{   )r}   cr   r   r   r      s    z#verify_ckpt_dir.<locals>.<listcomp>z(No -last checkpoint found after trainingr   z	Expected z checkpoints or at most top z checkpoints besides '-last'z checkpointsz-unfinishedzIUnfinished checkpoint found. Something went wrong with saving checkpoint r   stepzstep=r@   zLast checkpoint z not for final step zCheckpoint is not correct typeN)r   r   r   r   	save_lastany
save_top_krD   r|   filenamer   r   r   )
r   rk   r/   r   r   ckpt_dirckptsexpected_countr   ry   r   r   r   verify_ckpt_dir   s.   

"
&
r   	precisionc                    s   dt jdd f fdd}|S )Ntensorr   c                    s   | j  ksJ d S r'   )dtype)r   r   r   r   verify_precision   s   z1create_verify_precision.<locals>.verify_precision)rc   Tensor)r   r   r   r   r   create_verify_precision   s   r   c                   @   sV   e Zd ZU dZeed< dZeed< dZeed< dZeed< d	Z	eed
< dZ
eed< dS )Llama3ConfigCIi   r
   r   r   r   r   i   r       r!   i  
vocab_sizeN)r5   r6   r7   r
   r9   __annotations__r   r   r    r!   r   r   r   r   r   r      s   
 r   )T)%r   dataclassesr   lightning.pytorchpytorchr:   nemo_runr   rc   nemor   nlnemo.collectionsr   "nemo.collections.common.tokenizersr   
nemo.utilsr   strr9   r   r   	GPTConfigr$   Callbackr%   r=   rN   rj   r   ModelCheckpointboolr   r   r   r#   r   r   r   r   r   <module>   sV   

*