o
    ߗi                     @   sv   U d dl mZmZmZmZ d dlZd dlm  mZ	 d dlm
Z
 d dlmZ g Zee ed< ejjG dd dZdS )    )DictListOptionalTupleN)Tensor)2_scripted_functional_optimizer_deprecation_warning__all__c                   @   s   e Zd Z									ddee dedeeef d	ed
edededededefddZdede	e fddZ
dee	e  fddZdS )_FunctionalAdamWMbP?g?g+?:0yE>{Gz?Fparamslrbetasepsweight_decayamsgradmaximizeforeachfused_allow_empty_param_listc                 C   s   t dd d|kstd| d|kstd| d|d   kr'dk s1n td|d  d|d	   kr=dk sGn td
|d	  d|ksRtd| |||d |d	 |d| _|| _|| _|| _|	| _tj	t
tjt
ttjf f i | _t|dkr|
stdd|i| _d S )N   )
stacklevel        zInvalid learning rate: zInvalid epsilon value: r   g      ?z#Invalid beta parameter at index 0:    z#Invalid beta parameter at index 1: zInvalid weight_decay value: )r   r   beta1beta2r   z%optimizer got an empty parameter listr   )r   
ValueErrordefaultsr   r   r   r   torchjitannotater   r   strstatelenparam_group)selfr   r   r   r   r   r   r   r   r   r    r(   f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/torch/distributed/optim/functional_adamw.py__init__   s2   
$z_FunctionalAdamW.__init__paramgradc                 C   sv  g }g }g }g }g }g }t |}	|d ur|| || || jvrVi | j|< | j| }
t d|
d< t j|t jd|
d< t j|t jd|
d< | jrVt j|t jd|
d< | j| }
||
d  ||
d  | jrs||
d  ||
d  t  3 t	j
||||||| j| j| jd | jd | jd	 | jd
 | jd | j| jd d |	d W d    d S 1 sw   Y  d S )Nr   stepmemory_formatexp_avg
exp_avg_sqmax_exp_avg_sqr   r   r   r   r   r   r   r   r   r   r   r   r   r   
grad_scale	found_infhas_complex)r    
is_complexappendr$   tensor
zeros_likepreserve_formatr   no_gradFadamwr   r   r   r   )r'   r+   r,   params_with_gradgradsexp_avgsexp_avg_sqsmax_exp_avg_sqsstate_stepsr6   r$   r(   r(   r)   
step_paramG   sh   










"z_FunctionalAdamW.step_param	gradientsc                 C   s  | j d }g }g }g }g }g }g }t|t|kr,tddt| d dt|  d}	t| j d |D ]t\}
}|d ur|	t|
O }	||
 || |
| jvri | j|
< | j|
 }td|d< tj	|
tj
d	|d
< tj	|
tj
d	|d< | jrtj	|
tj
d	|d< | j|
 }||d
  ||d  | jr||d  ||d  q6t 3 tj||||||| j| j| jd | jd | jd | jd | jd | j| jd d |	d W d    d S 1 sw   Y  d S )Nr   zEthe gradients passed in does not equal to the size of the parameters!zParams length: z. zGradients length: Fr   r-   r.   r0   r1   r2   r   r   r   r   r   r3   )r&   r%   r   zipr    r7   r8   r$   r9   r:   r;   r   r<   r=   r>   r   r   r   r   )r'   rF   r   r?   r@   rA   rB   rC   rD   r6   r+   gradientr$   r(   r(   r)   r-      s   










"z_FunctionalAdamW.stepN)	r
   r   r   r   FFFFF)__name__
__module____qualname__r   r   floatr   boolr*   r   rE   r-   r(   r(   r(   r)   r	      sB    
	

-=r	   )typingr   r   r   r   r    torch.optim._functionaloptim_functionalr=   r   ,torch.distributed.optim._deprecation_warningr   r   r#   __annotations__r!   scriptr	   r(   r(   r(   r)   <module>   s   