o
    }oi+                     @   s   d dl mZ d dlmZmZ d dlmZmZmZm	Z	 d dl
Z
d dlmZ d dlmZ d dlmZ d dlmZ ed	Zd
efddZeG dd dZG dd deZdd ZdgZdS )    )contextmanager)	dataclassfields)	GeneratorLiteralTypeVarUnionN)	Precision)Module)	Optimizer)loggingAnyT	optimizerc                 c   sZ    dd }z ddl m} t| j|r| jj}n| jg}t||E dH  W dS    td)zExtract optimizer configurations from a Megatron optimizer.

    Args:
        optimizer: A torch.optim.Optimizer instance

    Yields:
        Optimizer configurations
    c                 S   s   | j S N)config)x r   b/home/ubuntu/.local/lib/python3.10/site-packages/nemo/lightning/pytorch/plugins/mixed_precision.py<lambda>)   s    z"get_optim_config.<locals>.<lambda>r   )ChainedOptimizerNz/Failed to extract optimizer config from module.)megatron.core.optimizerr   
isinstancemcore_optimizerchained_optimizersmap
ValueError)r   extract_configr   optsr   r   r   get_optim_config    s   	
r   c                   @   sP  e Zd ZU dZdZeed< dZeed< dZeed< dZ	e
jed< dZe
jed< dZe
jed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZ eed< dZ!eed < dZ"eed!< dZ#eed"< dZ$eed#< dS )$DtypeConfigzConfiguration class for mixed precision training settings.

    Contains settings for FP32/FP16/BF16 training, FP8 training.
    Ffp32fp16bf16Nparams_dtypepipeline_dtypeautocast_dtypeautocast_enabledTgrad_reduce_in_fp32fp8delayed
fp8_recipefirst_last_layers_bf16r   
fp8_margin   fp8_amax_history_lenmost_recentfp8_amax_compute_algo	fp8_wgradfp8_dot_product_attentionfp8_multi_head_attention	fp8_paramfp8_param_gatherr   
loss_scaleinitial_loss_scalemin_loss_scaleloss_scale_window
hysteresisnum_layers_at_start_in_bf16num_layers_at_end_in_bf16)%__name__
__module____qualname____doc__r    bool__annotations__r!   r"   r#   torchdtyper$   r%   r&   r'   r(   strr*   r+   r,   intr.   r0   r1   r2   r3   r4   r5   r6   floatr7   r8   r9   r:   r;   r<   r   r   r   r   r   6   s8   
 r   c                3       s~  e Zd ZdZ																				
				d?ded dejdejdejdededededede	de	dedededededede
d e
d!e
d"e	d#e	d$e	d%e	d&df2 fd'd(Zd)ed&efd*d+Zd,ed&efd-d.Zd/ed&efd0d1Zd/ed&efd2d3Zed&ed4 fd5d6Z	7	d@d,ed8ee	e
f d&dfd9d:Zd,ed8ee	e
f d&dfd;d<Zd,ed8ee	e
f d&dfd=d>Z  ZS )AMegatronMixedPrecisionzPlugin for mixed precision training with Megatron models.

    Handles conversion of model parameters and inputs/outputs between different precisions,
    and manages mixed precision training settings.
    NFTr)   r   r-   r/                 ?     	precision)16-mixed
bf16-mixed32r#   r$   r%   r&   r'   r(   r*   r+   r,   r.   r0   r1   r2   r3   
fp8_paramsr5   fp16_loss_scalefp16_initial_loss_scalefp16_min_loss_scalefp16_loss_scale_windowfp16_hysteresisr;   r<   returnc                    s\  |d urt d |d ur||krtd|}n|d u rd}t|tr't|}|dv r.tjntj}t	d$i d|dv d|dv d	|dv d
|pHtjd|pM|d|pR|d|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|d|d|d|d|d || _
t   | j
jrd!| _d S | j
jrd"| _d S d#| _d S )%Nz^fp8_params is deprecated and will be removed in a future release, use fp8_param_gather insteadzaGetting conflicting values for fp8_params and fp8_param_gather. Please only set fp8_param_gather.F)r"   rO   r    )r    rP   r!   )r!   z
fp16-mixed16rN   r"   r#   r$   r%   r&   r'   r(   r*   r+   r,   r.   r0   r1   r2   r3   r4   r5   r;   r<   r6   r7   r8   r9   r:   rN   rO   z32-truer   )r   warningr   r   rF   rE   rC   bfloat16float32r   dtype_configsuper__init__r!   rM   r"   )selfrM   r#   r$   r%   r&   r'   r(   r*   r+   r,   r.   r0   r1   r2   r3   rQ   r5   rR   rS   rT   rU   rV   r;   r<   rD   	__class__r   r   r^   b   s   
	




zMegatronMixedPrecision.__init__modulec                 C   s   ddl m} ddlm} | jjs| jjrA||j}| jj|_| jj|_t|dr7t	|j|s5|||j|_|S t	||sA|||}|S )zConvert the module parameters to the precision type this plugin handles.

        This is optional and depends on the precision limitations during optimization.

        r   )Float16Module)get_model_configrb   )
 megatron.core.transformer.modulerc   megatron.core.utilsrd   r\   r!   r"   rb   hasattrr   )r_   rb   rc   rd   r   r   r   r   convert_module   s   





z%MegatronMixedPrecision.convert_moduler   c                 C   s>   t |D ]}|j| jjksJ d|j| jjksJ dq|S )zConvert the optimizer parameters to the precision type this plugin handles.

        This is optional and depends on the precision limitations during optimization.

        z BF16 model/optim config mismatchz FP16 model/optim config mismatch)r   r"   r\   r!   )r_   r   optim_configr   r   r   convert_optimizer   s   z(MegatronMixedPrecision.convert_optimizerdatac                 C      |S )zConvert model inputs (forward) to the floating point precision type of this plugin.

        Note: MegatronStrategy will take care of only doing this when:
            parallel_state.is_pipeline_first_stage()

        r   r_   rk   r   r   r   convert_input      z$MegatronMixedPrecision.convert_inputc                 C   rl   )zConvert outputs to the floating point precision type expected after model's forward.

        Note: MegatronStrategy will take care of only doing this when:
            parallel_state.is_pipeline_last_stage()

        r   rm   r   r   r   convert_output   ro   z%MegatronMixedPrecision.convert_output)NNNc                 c   s    zdV  W dS w )zINo explicit precision casting. Inputs are supposed to be manually casted.Nr   )r_   r   r   r   forward_context   s   z&MegatronMixedPrecision.forward_context        clip_valc                 C   s   |dkrt ddS )a  Clip gradients. Raises error if clip_val > 0, otherwise it is a no-op.

        Args:
            optimizer: The optimizer to clip gradients for
            clip_val: The value to clip gradients to
            gradient_clip_algorithm: The algorithm to use for clipping

        Raises:
            ValueError: If clip_val > 0 since gradient clipping is handled by Mcore's optimizer
        rr   zbGradient clipping is handled in Mcore's optimizer. Use the clip_grad attribute in OptimizerConfig.N)r   )r_   r   rs   gradient_clip_algorithmr   r   r   clip_gradients   s
   z%MegatronMixedPrecision.clip_gradientsc                 C      dS )zClip gradients by value - it is a no-op.

        Args:
            optimizer: The optimizer to clip gradients for
            clip_val: The value to clip gradients to
        Nr   r_   r   rs   r   r   r   clip_grad_by_value  ro   z)MegatronMixedPrecision.clip_grad_by_valuec                 C   rv   )zClip gradients by norm - it is a no-op.

        Args:
            optimizer: The optimizer to clip gradients for
            clip_val: The value to clip gradients to
        Nr   rw   r   r   r   clip_grad_by_norm  ro   z(MegatronMixedPrecision.clip_grad_by_norm)NNNFTNr)   Fr   r-   r/   TFFNNNrI   rJ   rK   rL   r   r   )rr   N)r=   r>   r?   r@   r   rC   rD   rA   rE   rF   rG   r^   r
   rh   r   rj   r   rn   rp   r   r   rq   r   ru   rx   ry   __classcell__r   r   r`   r   rH   [   s    		
Q		


&	rH   c                 C   s   t |drt| |j|_t| D ]5}t ||jsqt||j}t| |j}||krEt||j| tdt	|j
 d|j d| d|  q|S )zUpdate a config object with dtype settings from dtype_config.

    Args:
        dtype_config: Source of dtype settings
        config: Config object to update

    Returns:
        Updated config object
    __io__z
Overwrote .z  z -> )rg   "update_config_with_dtype_overridesr{   r   namegetattrsetattrr   debugtyper=   )r\   r   fieldold_valnew_valr   r   r   r}     s   

*r}   )
contextlibr   dataclassesr   r   typingr   r   r   r   rC   #lightning.pytorch.plugins.precisionr	   torch.nnr
   torch.optimr   
nemo.utilsr   r   r   r   rH   r}   __all__r   r   r   r   <module>   s    $ =
