from typing import Optional

import nemo_run as run
from megatron.core.optimizer import OptimizerConfig

from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule, PytorchOptimizerModule


@run.cli.factory
def distributed_fused_adam_with_cosine_annealing(
    precision: str = "bf16-mixed",  # or "16-mixed"
    warmup_steps: int = 2000,
    constant_steps: int = 0,
    adam_beta1: float = 0.9,
    adam_beta2: float = 0.95,
    max_lr: float = 1e-4,
    min_lr: Optional[float] = None,
    clip_grad: float = 1.0,
) -> run.Config[PytorchOptimizerModule]:
    """
    Creates a distributed fused Adam optimizer with cosine annealing scheduler.
    """
    opt_cfg = run.Config(
        OptimizerConfig,
        optimizer='adam',
        lr=max_lr,
        weight_decay=0.1,
        bf16=precision == "bf16-mixed",
        fp16=precision == "16-mixed",
        adam_beta1=adam_beta1,
        adam_beta2=adam_beta2,
        adam_eps=1e-5,
        use_distributed_optimizer=True,
        clip_grad=clip_grad,
    )

    min_lr = min_lr if min_lr is not None else 0.1 * max_lr
    sched = run.Config(
        CosineAnnealingScheduler,
        warmup_steps=warmup_steps,
        constant_steps=constant_steps,
        min_lr=min_lr,
    )

    return run.Config(
        MegatronOptimizerModule,
        config=opt_cfg,
        lr_scheduler=sched,
    )


@run.cli.factory
def pytorch_adam_with_cosine_annealing(
    warmup_steps: int = 2000,
    constant_steps: int = 0,
    max_lr: float = 1e-5,
    min_lr: Optional[float] = None,
    weight_decay: float = 0.01,
    foreach: bool = False,
) -> run.Config[PytorchOptimizerModule]:
    """
    Creates a PyTorch Adam optimizer with a cosine annealing learning rate scheduler.
    """
    from torch.optim import Adam

    return run.Config(
        PytorchOptimizerModule,
        optimizer_fn=run.Partial(
            Adam,
            lr=max_lr,
            weight_decay=weight_decay,
            betas=(0.9, 0.999),
            eps=1e-8,
            foreach=foreach,
        ),
        lr_scheduler=run.Config(
            CosineAnnealingScheduler,
            warmup_steps=warmup_steps,
            constant_steps=constant_steps,
            min_lr=min_lr or 0.1 * max_lr,
        ),
    )


@run.cli.factory
def pytorch_adam_with_flat_lr(
    lr: float = 1e-5,
    weight_decay: float = 0.01,
    foreach: bool = True,
) -> run.Config[PytorchOptimizerModule]:
    """
    Creates a PyTorch Adam optimizer with a flat learning rate.
    """
    from torch.optim import Adam

    return run.Config(
        PytorchOptimizerModule,
        optimizer_fn=run.Partial(
            Adam,
            lr=lr,
            weight_decay=weight_decay,
            betas=(0.9, 0.999),
            eps=1e-8,
            foreach=foreach,
        ),
    )


@run.cli.factory
def te_adam_with_cosine_annealing(
    warmup_steps: int = 2000,
    constant_steps: int = 0,
    max_lr: float = 1e-5,
    min_lr: Optional[float] = None,
    weight_decay: float = 0.01,
) -> run.Config[PytorchOptimizerModule]:
    """
    Creates a Transformer Engine fused Adam optimizer with cosine annealing scheduler.
    """
    from transformer_engine.pytorch.optimizers import FusedAdam

    return run.Config(
        PytorchOptimizerModule,
        optimizer_fn=run.Partial(
            FusedAdam,
            lr=max_lr,
            weight_decay=weight_decay,
            betas=(0.9, 0.999),
            eps=1e-8,
            master_weights=True,
        ),
        lr_scheduler=run.Config(
            CosineAnnealingScheduler,
            warmup_steps=warmup_steps,
            constant_steps=constant_steps,
            min_lr=min_lr or 0.1 * max_lr,
        ),
    )


@run.cli.factory
def te_adam_with_flat_lr(
    lr: float = 1e-5,
    weight_decay: float = 0.01,
) -> run.Config[PytorchOptimizerModule]:
    """
    Creates a Transformer Engine fused Adam optimizer with a flat learning rate.
    """
    from transformer_engine.pytorch.optimizers import FusedAdam

    return run.Config(
        PytorchOptimizerModule,
        optimizer_fn=run.Partial(
            FusedAdam,
            lr=lr,
            weight_decay=weight_decay,
            betas=(0.9, 0.999),
            eps=1e-8,
            master_weights=True,
        ),
    )
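

# Usage sketch: each factory above is registered via `run.cli.factory` and
# returns a `run.Config`, so it is typically attached to a NeMo 2.0 recipe
# rather than instantiated directly. The `llama3_8b` recipe below is an
# assumed example for illustration; any recipe exposing an `optim` attribute
# can be reconfigured the same way.
#
#     from nemo.collections import llm
#     from nemo.collections.llm.recipes.optim.adam import (
#         distributed_fused_adam_with_cosine_annealing,
#     )
#
#     recipe = llm.llama3_8b.pretrain_recipe(name="pretrain", dir="/checkpoints")
#     recipe.optim = distributed_fused_adam_with_cosine_annealing(max_lr=3e-4)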