o
    }oic                     @   s   d dl mZ d dlmZmZmZmZmZ d dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZ erJd d
lmZ edZedddZG dd de
ZeededefddZdS )    )contextmanager)TYPE_CHECKING	GeneratorLiteralOptionalTypeVarN)MixedPrecision)nn)	Optimizer)	to_fabric)DtypeConfigMegatronMixedPrecisionget_optim_config"update_config_with_dtype_overrides)logging)ModelParallelConfigAnyTConfigTr   )boundc                3   @   s0  e Zd ZdZ																							
	d9ded dejdejdejdededede	e dede
de
de
de
dedededededed ed!ed"ed#e
d$e
d%df2d&d'Zd(ed%efd)d*Zd(ed%efd+d,Zd-ed%efd.d/Zd0ejd%ejfd1d2Zd3ed%efd4d5Zed%ed6 fd7d8ZdS ):FabricMegatronMixedPrecisionzFabric plugin for mixed precision training with Megatron models.

    Handles precision conversions and mixed precision training settings
    in the Fabric training framework.
    NFTr      most_recent              ?     	precision)16-mixed
bf16-mixed32params_dtypepipeline_dtypeautocast_dtypeautocast_enabledgrad_reduce_in_fp32fp8
fp8_recipefirst_last_layers_bf16num_layers_at_start_in_bf16num_layers_at_end_in_bf16
fp8_marginfp8_amax_history_lenfp8_amax_compute_algo	fp8_wgradfp8_dot_product_attentionfp8_multi_head_attention
fp8_paramsfp8_param_gatherfp16_loss_scalefp16_initial_loss_scalefp16_min_loss_scalefp16_loss_scale_windowfp16_hysteresisreturnc                 C   sT  |d urt d |d ur||krtd|}n|d u rd}t|tr't|}|dv r.tjntj}t	d$i d|dv d|dv d	|dv d
|pHtjd|pM|d|pR|d|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|d|d|d|d|d || _
| j
jrd!| _n| j
jrd"| _nd#| _d | _d S )%Nz^fp8_params is deprecated and will be removed in a future release, use fp8_param_gather insteadzaGetting conflicting values for fp8_params and fp8_param_gather. Please only set fp8_param_gather.F)bf16r   fp32)r9   r   fp16)r:   z
fp16-mixed16r   r8   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   	fp8_paramr1   
loss_scaleinitial_loss_scalemin_loss_scaleloss_scale_window
hysteresisr   r   z32-true )r   warning
ValueError
isinstanceintstrtorchbfloat16float32r   dtype_configr:   r   r8   scaler)selfr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   dtyperB   rB   Q/home/ubuntu/.local/lib/python3.10/site-packages/nemo/lightning/fabric/plugins.py__init__1   s   
	

z%FabricMegatronMixedPrecision.__init__datac                 C      |S )zConvert model inputs (forward) to the floating point precision type of this plugin.

        Note: MegatronStrategy will take care of only doing this when:
            mpu.is_pipeline_first_stage()

        rB   rM   rQ   rB   rB   rO   convert_input      z*FabricMegatronMixedPrecision.convert_inputc                 C   rR   )zConvert outputs to the floating point precision type expected after model's forward.

        Note: MegatronStrategy will take care of only doing this when:
            mpu.is_pipeline_first_stage()

        rB   rS   rB   rB   rO   convert_output   rU   z+FabricMegatronMixedPrecision.convert_outputconfigc                 C   s   t | j|S )zConvert the config to the precision type this plugin handles.

        This is optional and depends on the precision limitations during optimization.
        )r   rK   )rM   rW   rB   rB   rO   convert_config   s   z+FabricMegatronMixedPrecision.convert_configmodulec                 C   s   t |ds|S ddlm} ddlm} | jjs| jjrH||j}| jj|_| jj|_t |dr>t	|j|s<|||j|_|S t	||sH|||}|S )zConvert the module parameters to the precision type this plugin handles.

        This is optional and depends on the precision limitations during optimization.

        rY   r   )Float16Module)get_model_config)
hasattr megatron.core.transformer.modulerZ   megatron.core.utilsr[   rK   r:   r8   rY   rE   )rM   rY   rZ   r[   rW   rB   rB   rO   convert_module   s   






z+FabricMegatronMixedPrecision.convert_module	optimizerc                 C   s>   t |D ]}|j| jjksJ d|j| jjksJ dq|S )zConvert the optimizer parameters to the precision type this plugin handles.

        This is optional and depends on the precision limitations during optimization.
        z BF16 model/optim config mismatchz FP16 model/optim config mismatch)r   r8   rK   r:   )rM   r`   optim_configrB   rB   rO   convert_optimizer   s   z.FabricMegatronMixedPrecision.convert_optimizer)NNNc                 c   s    zdV  W dS w )zINo explicit precision casting. Inputs are supposed to be manually casted.NrB   )rM   rB   rB   rO   forward_context   s   z,FabricMegatronMixedPrecision.forward_context)NNNFTNNFr   r   r   r   r   TFFNNNr   r   r   r   )__name__
__module____qualname____doc__r   rH   rN   boolrG   r   rF   floatrP   r   rT   rV   r   rX   r	   Moduler_   r
   rb   r   r   rc   rB   rB   rB   rO   r   *   s    		

Q		
r   pluginr7   c                 C   s   t | jdS )N)r   )r   r   )rk   rB   rB   rO   !_convert_megatron_mixed_precision   s   rl   )
contextlibr   typingr   r   r   r   r   rH   "lightning.fabric.plugins.precisionr   r	   torch.optimr
    nemo.lightning.fabric.conversionr   .nemo.lightning.pytorch.plugins.mixed_precisionr   r   r   r   
nemo.utilsr   #megatron.core.model_parallel_configr   r   r   r   registerrl   rB   rB   rB   rO   <module>   s"    