from typing import Optional

import torch
from torch import Tensor
from torch.distributed._tensor import DTensor
from torch.optim import Optimizer

from .quant_utils import _fp32_to_bf16_sr
from .subclass_4bit import OptimState4bit
from .subclass_8bit import OptimState8bit
from .subclass_fp8 import OptimStateFp8


class _AdamBase(Optimizer):
    def __init__(self, params, lr, betas, eps, weight_decay, amsgrad, *,
                 block_size, bf16_stochastic_round, is_adamw) -> None:
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad)
        super().__init__(params, defaults)
        self.block_size = block_size
        self.bf16_stochastic_round = bf16_stochastic_round
        self.is_adamw = is_adamw

    def add_param_group(self, param_group: dict) -> None:
        super().add_param_group(param_group)

        # store lr as a tensor so it can be updated in place later
        group = self.param_groups[-1]
        if not isinstance(group["lr"], Tensor):
            group["lr"] = torch.tensor(group["lr"], dtype=torch.float32)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault("amsgrad", False)

    @staticmethod
    def _subclass_zeros(p: Tensor, signed: bool, block_size: int):
        raise NotImplementedError

    def _new_buffer(self, p: Tensor, signed: bool):
        local_p = p.to_local() if isinstance(p, DTensor) else p

        # only use the quantized state subclass for tensors that are large enough
        # (>= 4096 elements) and whose size is divisible by block_size
        if local_p.numel() >= 4096 and local_p.numel() % self.block_size == 0:
            out = self._subclass_zeros(local_p, signed, self.block_size)
        else:
            out = torch.zeros_like(local_p)

        # wrap the state subclass back into a DTensor when the param is a DTensor
        if isinstance(p, DTensor):
            out = DTensor.from_local(
                local_tensor=out,
                device_mesh=p.device_mesh,
                placements=p.placements,
                run_check=False,
                shape=p.shape,
                stride=p.stride(),
            )

        return out.to(p.device)

    @torch.no_grad()
    def step(self, closure=None):
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        with torch._dynamo.utils.disable_cache_limit():
            for group in self.param_groups:
                for p in group["params"]:
                    if p.grad is None:
                        continue

                    grad = p.grad
                    if grad.is_sparse:
                        raise RuntimeError("Sparse gradient is not supported")

                    state = self.state[p]

                    # state initialization
                    if len(state) == 0:
                        state["step"] = torch.tensor(0.0)
                        state["exp_avg"] = self._new_buffer(p, True)
                        state["exp_avg_sq"] = self._new_buffer(p, False)
                        if group["amsgrad"]:
                            state["max_exp_avg_sq"] = self._new_buffer(p, False)

                    state["step"] += 1

                    if not isinstance(group["lr"], Tensor):
                        raise RuntimeError(
                            "lr was changed to a non-Tensor object. If you want to update lr, "
                            "please use optim.param_groups[0]['lr'].fill_(new_lr)"
                        )

                    torch.compile(single_param_adam, fullgraph=True, dynamic=False)(
                        p.detach(),
                        grad,
                        state["step"],
                        state["exp_avg"],
                        state["exp_avg_sq"],
                        state.get("max_exp_avg_sq", None),
                        group["lr"],
                        group["betas"][0],
                        group["betas"][1],
                        group["weight_decay"],
                        group["eps"],
                        self.is_adamw,
                        self.bf16_stochastic_round and p.dtype is torch.bfloat16,
                    )

        return loss


def single_param_adam(
    p: Tensor,
    grad: Tensor,
    step: Tensor,
    exp_avg: Tensor,
    exp_avg_sq: Tensor,
    max_exp_avg_sq: Optional[Tensor],
    lr: Tensor,
    beta1: float,
    beta2: float,
    weight_decay: float,
    eps: float,
    IS_ADAMW: bool,
    BF16_STOCHASTIC_ROUND: bool,
):
    # compute in FP32 so quantized params/optimizer states keep full update accuracy
    p_f32 = p.float()
    grad_f32 = grad.float()

    if IS_ADAMW:
        # decoupled weight decay
        p_f32 = p_f32 * (1 - lr * weight_decay)
    else:
        # L2 regularization folded into the gradient
        grad_f32 = grad_f32 + p_f32 * weight_decay

    bias_correction1 = 1 - beta1**step
    bias_correction2 = 1 - beta2**step

    exp_avg_f32 = exp_avg.float().lerp(grad_f32, 1 - beta1)
    exp_avg_sq_f32 = exp_avg_sq.float().lerp(grad_f32.square(), 1 - beta2)
    exp_avg.copy_(exp_avg_f32)
    exp_avg_sq.copy_(exp_avg_sq_f32)

    if max_exp_avg_sq is not None:
        max_exp_avg_sq_f32 = torch.maximum(max_exp_avg_sq.float(), exp_avg_sq_f32)
        max_exp_avg_sq.copy_(max_exp_avg_sq_f32)
        denom = (max_exp_avg_sq_f32.sqrt() / bias_correction2.sqrt()) + eps
    else:
        denom = (exp_avg_sq_f32.sqrt() / bias_correction2.sqrt()) + eps

    p_f32 = p_f32 - lr * (exp_avg_f32 / bias_correction1) / denom

    if BF16_STOCHASTIC_ROUND:
        p.copy_(_fp32_to_bf16_sr(p_f32))
    else:
        p.copy_(p_f32)


class Adam8bit(_AdamBase):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
                 amsgrad=False, *, block_size=256, bf16_stochastic_round=False) -> None:
        super().__init__(params, lr, betas, eps, weight_decay, amsgrad, block_size=block_size,
                         bf16_stochastic_round=bf16_stochastic_round, is_adamw=False)
        torch._C._log_api_usage_once("torchao.optim.Adam8bit")

    @staticmethod
    def _subclass_zeros(p: Tensor, signed: bool, block_size: int):
        return OptimState8bit.zeros(p.shape, signed, block_size, p.device)


class Adam4bit(_AdamBase):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
                 amsgrad=False, *, block_size=128, bf16_stochastic_round=False) -> None:
        super().__init__(params, lr, betas, eps, weight_decay, amsgrad, block_size=block_size,
                         bf16_stochastic_round=bf16_stochastic_round, is_adamw=False)
        torch._C._log_api_usage_once("torchao.optim.Adam4bit")

    @staticmethod
    def _subclass_zeros(p: Tensor, signed: bool, block_size: int):
        return OptimState4bit.zeros(p.shape, signed, block_size, p.device)


class AdamFp8(_AdamBase):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0,
                 amsgrad=False, *, block_size=256, bf16_stochastic_round=False) -> None:
        super().__init__(params, lr, betas, eps, weight_decay, amsgrad, block_size=block_size,
                         bf16_stochastic_round=bf16_stochastic_round, is_adamw=False)
        torch._C._log_api_usage_once("torchao.optim.AdamFp8")

    @staticmethod
    def _subclass_zeros(p: Tensor, signed: bool, block_size: int):
        return OptimStateFp8.zeros(p.shape, block_size, p.device)


class AdamW8bit(_AdamBase):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2,
                 amsgrad=False, *, block_size=256, bf16_stochastic_round=False) -> None:
        super().__init__(params, lr, betas, eps, weight_decay, amsgrad, block_size=block_size,
                         bf16_stochastic_round=bf16_stochastic_round, is_adamw=True)
        torch._C._log_api_usage_once("torchao.optim.AdamW8bit")

    @staticmethod
    def _subclass_zeros(p: Tensor, signed: bool, block_size: int):
        return OptimState8bit.zeros(p.shape, signed, block_size, p.device)


class AdamW4bit(_AdamBase):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2,
                 amsgrad=False, *, block_size=128, bf16_stochastic_round=False) -> None:
        super().__init__(params, lr, betas, eps, weight_decay, amsgrad, block_size=block_size,
                         bf16_stochastic_round=bf16_stochastic_round, is_adamw=True)
        torch._C._log_api_usage_once("torchao.optim.AdamW4bit")

    @staticmethod
    def _subclass_zeros(p: Tensor, signed: bool, block_size: int):
        return OptimState4bit.zeros(p.shape, signed, block_size, p.device)


class AdamWFp8(_AdamBase):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2,
                 amsgrad=False, *, block_size=256, bf16_stochastic_round=False) -> None:
        super().__init__(params, lr, betas, eps, weight_decay, amsgrad, block_size=block_size,
                         bf16_stochastic_round=bf16_stochastic_round, is_adamw=True)
        torch._C._log_api_usage_once("torchao.optim.AdamWFp8")

    @staticmethod
    def _subclass_zeros(p: Tensor, signed: bool, block_size: int):
        return OptimStateFp8.zeros(p.shape, block_size, p.device)


class _AdamW(_AdamBase):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2,
                 amsgrad=False, *, bf16_stochastic_round=False) -> None:
        """AdamW optimizer that supports quantized training (parameter is quantized). This optimizer
        should only be used with torchao's quantized training."""
        super().__init__(params, lr, betas, eps, weight_decay, amsgrad, block_size=float("inf"),
                         bf16_stochastic_round=bf16_stochastic_round, is_adamw=True)