o
    ίi                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
 e  ZeG dd dZded	ed
ededef
ddZded	edededef
ddZded	ed
ededededefddZded	ed
ededededefddZded
efddZdejded
efddZdS )    N)	dataclass)partial)nn)AdamWlr_schedulerc                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZeed< d	Z	eed
< dZ
eed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dS )	OptimArgsga2U0*3?lrg?weight_decayg:0yE>epsilong?beta1gffffff?beta2      ?clipcosine	scheduleri  warmuplr_min_ratiocycle_lengthcosine_thetai'  annealing_stepdecay_fraction      ?
exp_factorN)__name__
__module____qualname__r   float__annotations__r	   r
   r   r   r   r   strr   intr   r   r   r   r   r    r    r    >/home/ubuntu/.local/lib/python3.10/site-packages/core/optim.pyr      s   
 r   stepr   n_steps	min_ratioreturnc                 C   sP   | |k rt | | }|S | |kr$t | | ||  }|| d|  }|S |}|S )N   )r   )r"   r   r#   r$   r   sr    r    r!   	lr_linear"   s   r(   r   c                 C   s2   | |k rt | | }|S t|| | |  |}|S )N)r   max)r"   r   r   r$   r   r    r    r!   lr_inv_sqrt-   s
   r*   r   thetac           	      C   s   | ||  d d d }| |k rt | | }|S | |kr@t | | ||  }|dd|  |ttj||  |  d   }|S |}|S )N   r&   r   )r   mathcospi)	r"   r   r#   r   r+   r$   signr   r'   r    r    r!   	lr_cosine5   s   r2   r   c           
      C   s   | t ||  d }t || | }t || }| |k r$t| | }	|	S | || kr.d}	|	S | || krP| |krP| ||  } d| | d|  d| |    }	|	S |}	|	S )z
    UNDERSTANDING WARMUP-STABLE-DECAY LEARNING RATES: A RIVER VALLEY LOSS LANDSCAPE PERSPECTIVE
    https://arxiv.org/pdf/2410.05192
    r&   r   )r   r   )
r"   r   r#   r   r   r$   	cycle_numcurr_n_stepsdecay_lengthr   r    r    r!   lr_wsdJ   s    r6   argsc                 C   s   | j dkrdd }|S | j dkrtt| j|| jd}|S | j dkr.tt| j| j| jd}|S | j dkrCtt| j|| j| j	| jd	}|S | j d
kr`| j
| jk sPJ tt| j|| j
| j| jd}|S td| j  )Nconstantc                 S   s   dS )Nr   r    )xr    r    r!   <lambda>n   s    zbuild_lr_fn.<locals>.<lambda>linear)r   r#   r$   inv_sqrt)r   r   r$   r   )r   r#   r   r+   r$   wsd)r   r#   r   r   r$   zUnknown scheduler: )r   r   r(   r   r   r*   r   r2   r   r   r   r6   NotImplementedError)r7   r#   lr_fnr    r    r!   build_lr_fnl   sL   
!




r@   modelc                 C   sX   t d t|  |j|j|jf|j|jdd}t	||}t
||}t d ||fS )NzStarting build of optimizer...T)r   betasr	   epsfusedzDone with build of optimizer.)loggerinfor   
parametersr   r   r   r	   r
   r@   r   LambdaLR)rA   r7   r#   	optimizerr?   r   r    r    r!   build_optimizer   s   




rJ   )loggingr.   dataclassesr   	functoolsr   torchr   torch.optimr   r   	getLoggerrE   r   r   r   r(   r*   r2   r6   r@   ModulerJ   r    r    r    r!   <module>   sV   

"&