o
    wi|)                     @   s   d dl mZmZmZmZmZmZ d dlmZ	 d dl
Z
d dlmZ d dlmZ d dlmZ de
jdeeef fdd	Zde
jdeeef fd
dZG dd deZG dd deZdS )    )AnyCallableDictListOptionalUnionN)Callback)MegatronOptimizerModule)loggingtensorreturnc                 C   s"   t | tjrdt| jiS ddiS )zReturns tensor's precision	Precisionnot-a-tensor)
isinstancetorchTensorstrdtyper    r   g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/lightning/pytorch/callbacks/debugging.pycollect_precision   s   r   c                 C   s,   t | tjrt| jt| jdS dddS )z"Returns tensor's shape & precision)Shaper   r   )r   r   r   r   shaper   r   r   r   r   collect_precision_and_shape!   s   
r   c                   @   s   e Zd ZdZeedfdeeej	gee
eef  f  deeej	gee
eef  f  deee ef fddZdejd	ejd
dfddZdS )ParameterDebuggera  
    Debugging tool to help inspect parameters and gradients at any callback event.

    This callback handles the boilerplate needed to iterate over the model parameters and gradients,
    and applies user specified functions to them. These functions can be used to log attributes or
    apply asserts on the param and grad tensors. Attributes are logged in a table, with a row for each parameter name.
    Default behavior is to log the precision and shapes of each parameter and its gradient.

    Args:
        param_fn: Function to apply to model parameters. Can be used to apply assertions on the tensor,
            or return a mapping of labels and values to log for each parameter.
        grad_fn: Function to apply to model gradients. Can be used to apply assertions on the tensor,
            or return a mapping of labels and values to log for each gradient.
        log_on_hooks: PTL callback hook name or list of hook names on which to apply param_fn and grad_fn.
            See `PTL docs <https://lightning.ai/docs/pytorch/stable/extensions/callbacks.html#hooks>`_ for more info
            on callback hooks. Note that some hooks that occur before the model is constructed are invalid.

    Example:
        >>> fn = lambda x: {"Norm": str(x.norm(2).item())}
        >>> callback = ParameterDebugger(param_fn=fn, log_on_hooks=["on_train_start", "on_train_end"])
        >>> trainer = Trainer(callbacks=[callback])
    on_train_startparam_fngrad_fnlog_on_hooksc                 C   s\   || _ || _tg d}t|tr|g}|D ]}||v s$J d||t| || j qd S )N)#teardown
on_fit_endon_sanity_check_starton_sanity_check_endon_train_batch_starton_train_batch_endon_train_epoch_starton_train_epoch_endon_validation_epoch_starton_validation_epoch_endon_test_epoch_starton_test_epoch_endon_predict_epoch_starton_predict_epoch_endon_validation_batch_starton_validation_batch_endon_test_batch_starton_test_batch_endon_predict_batch_starton_predict_batch_endr   on_train_endon_validation_starton_validation_endon_test_starton_test_endon_predict_starton_predict_endon_exceptionon_save_checkpointon_load_checkpointon_before_backwardon_after_backwardon_before_optimizer_stepon_before_zero_gradzTHook {} supplied to log_on_hooks is not valid or can not be used. Valid hooks are {})r   r   setr   r   formatsetattr_apply_user_funcs)selfr   r   r   valid_hooks	hook_namer   r   r   __init__A   s   
(
zParameterDebugger.__init__trainer	pl_moduler   Nc              	      s  dt jdtt j f fdd}g g g }}}  D ]@\}	}
||
}|	dddd}|| t|
|g| j| jg||gD ]\}}}|durZ|durU||| q@|i  q@qt	g t	g }}|D ]}|durt|
|  qg|D ]}|dur|
|  qwt|st|rd	d
lm} | }|d| tddg||g||gD ]0\}}}|D ](}g }|D ]}|dur|||d q|d q|g kr||| | qqd|_td|   dS dS )z
        Iterate over model parameters, find gradient tensor, apply and collect outputs of
        param_fn and grad_fn, and log outputs in a table.
        paramr   c                    sB   t t ddts| jS  jD ]}| |jv r|j|  j  S qdS )zJIf using MCore optimizer, search the grad buckets for param's grad tensor.optimN)r   getattrr	   gradbuffersparam_to_bucket	grad_data)rL   bufrK   r   r   find_grad_tensor   s   

z=ParameterDebugger._apply_user_funcs.<locals>.find_grad_tensorzmodule. z.weightNr   )PrettyTable	ParameterzParam zGrad l
)r   r   r   named_parametersreplaceappendzipr   r   rB   updatekeysanyprettytablerW   
add_columngetalignr
   info
get_string)rF   rJ   rK   argskwargsrU   	names_colparams_outputgrads_output
param_nameparam_tensorgrad_tensor
short_namer   fnout_col
param_keys	grad_keysoutputrW   debug_tableprefixr`   output_listk
col_to_logr   rT   r   rE   z   sX   



z#ParameterDebugger._apply_user_funcs)__name__
__module____qualname____doc__r   r   r   r   r   r   r   r   r   r   rI   plTrainerLightningModulerE   r   r   r   r   r   )   s    
9r   c                
   @   s\   e Zd ZdZddedefddZdd	d
eeef fddZ	dd	ddde
ded
df
ddZdS )ModelTrainingStateCallbacka  
    Callback to detect model training state corruption after validation loop.

    This callback monitors whether all model components maintain their training
    state consistently before and after validation. Designed to catch issues
    where some modules are left in eval() mode after PTL validation loop.

    Args:
        val_check_interval: Interval at which validation occurs. Default: 10.
        strict: If True, raises an exception when corruption is detected. Default: False.
    
   Fval_check_intervalstrictc                 C   s   || _ || _d | _d| _d S )NF)r   r   _pre_validation_state_expecting_check)rF   r   r   r   r   r   rI      s   
z#ModelTrainingStateCallback.__init__rJ   z
pl.Trainerr   c                    s|   ddl m} ddlm  |j}t||r"|j}t|tr|n|g}n
t|tr)|n|g}t fdd|D }|t	|dS )z2Get training/eval module counts from model chunks.r   )MegatronParallelunwrap_modelc                 3   s*    | ]}t d d  | D V  qdS )c                 s   s    | ]}|j rd V  qdS )   N)training).0mr   r   r   	<genexpr>   s    zKModelTrainingStateCallback._get_training_state.<locals>.<genexpr>.<genexpr>N)summodules)r   chunkr   r   r   r      s   ( zAModelTrainingStateCallback._get_training_state.<locals>.<genexpr>)training_modules
num_chunks)
 nemo.lightning.megatron_parallelr   nemo.utils.model_utilsr   modelr   pipelinelistr   len)rF   rJ   r   r   chunksmodel_chunkstraining_countr   r   r   _get_training_state   s   
z.ModelTrainingStateCallback._get_training_staterK   zpl.LightningModulebatch	batch_idxNc           	      C   s   |j }| jr7| jr7| jd }| |d }||kr1d| d| d| d}| jr,t|t| d| _d| _|d | j d	krK| || _d
| _dS dS )z/Monitor training state before/after validation.r   z=Model training state corruption detected! Before validation: z% training modules, after validation: z training modules (step )NFr   r   T)	global_stepr   r   r   r   RuntimeErrorr
   warningr   )	rF   rJ   rK   r   r   step	pre_count
post_countmsgr   r   r   r$      s,   


z/ModelTrainingStateCallback.on_train_batch_start)r   F)r{   r|   r}   r~   intboolrI   r   r   r   r   r$   r   r   r   r   r      s    r   )typingr   r   r   r   r   r   lightning.pytorchpytorchr   r   lightning.pytorch.callbacksr   %nemo.lightning.pytorch.optim.megatronr	   
nemo.utilsr
   r   r   r   r   r   r   r   r   r   r   <module>   s     