from typing import List, Optional

import torch
from torch import Tensor
from torch.utils._foreach_utils import _get_fused_kernels_supported_devices

from .optimizer import (
    _default_to_fused_or_foreach,
    _differentiable_doc,
    _foreach_doc,
    _get_scalar_dtype,
    _get_value,
    _maximize_doc,
    _use_grad_for_differentiable,
    _view_as_real,
    Optimizer,
    ParamsT,
)

__all__ = ["Adagrad", "adagrad"]


class Adagrad(Optimizer):
    def __init__(
        self,
        params: ParamsT,
        lr: float = 1e-2,
        lr_decay: float = 0,
        weight_decay: float = 0,
        initial_accumulator_value: float = 0,
        eps: float = 1e-10,
        foreach: Optional[bool] = None,
        *,
        maximize: bool = False,
        differentiable: bool = False,
        fused: Optional[bool] = None,
    ):
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= lr_decay:
            raise ValueError(f"Invalid lr_decay value: {lr_decay}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if not 0.0 <= initial_accumulator_value:
            raise ValueError(
                f"Invalid initial_accumulator_value value: {initial_accumulator_value}"
            )
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")

        defaults = dict(
            lr=lr,
            lr_decay=lr_decay,
            eps=eps,
            weight_decay=weight_decay,
            initial_accumulator_value=initial_accumulator_value,
            foreach=foreach,
            maximize=maximize,
            differentiable=differentiable,
            fused=fused,
        )
        super().__init__(params, defaults)

        if fused:
            if differentiable:
                raise RuntimeError("`fused` does not support `differentiable`")
            self._step_supports_amp_scaling = True
            fused_supported_devices = _get_fused_kernels_supported_devices()
            # The fused Adagrad kernel is CPU-only.
            fused_supported_devices.remove("cuda")
            if not all(
                p.device.type in fused_supported_devices and torch.is_floating_point(p)
                for pg in self.param_groups
                for p in pg["params"]
            ):
                raise RuntimeError(
                    "`fused=True` requires all the params to be floating point Tensors of "
                    f"supported devices: {fused_supported_devices}."
                )
            if foreach:
                raise RuntimeError("`fused` and `foreach` cannot be `True` together.")

        # Per-parameter state: a step counter and the running sum of squared
        # gradients, seeded with `initial_accumulator_value`.
        for group in self.param_groups:
            for p in group["params"]:
                state = self.state[p]
                state["step"] = (
                    torch.zeros(
                        (), dtype=_get_scalar_dtype(is_fused=group["fused"]), device=p.device
                    )
                    if group["fused"]
                    else torch.tensor(0.0, dtype=_get_scalar_dtype())
                )
                init_value = (
                    complex(initial_accumulator_value, initial_accumulator_value)
                    if torch.is_complex(p)
                    else initial_accumulator_value
                )
                state["sum"] = torch.full_like(
                    p, init_value, memory_format=torch.preserve_format
                )

    def __setstate__(self, state):
        super().__setstate__(state)
        fused = None
        for group in self.param_groups:
            group.setdefault("foreach", None)
            group.setdefault("maximize", False)
            group.setdefault("differentiable", False)
            fused = group.setdefault("fused", None)

        state_values = list(self.state.values())
        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(
            state_values[0]["step"]
        )
        if not step_is_tensor:
            for s in state_values:
                s["step"] = torch.tensor(
                    float(s["step"]), dtype=_get_scalar_dtype(is_fused=fused)
                )

    def share_memory(self):
        for group in self.param_groups:
            for p in group["params"]:
                state = self.state[p]
                state["sum"].share_memory_()

    def _init_group(self, group, params_with_grad, grads, state_sums, state_steps):
        has_sparse_grad, has_complex = False, False
        for p in group["params"]:
            if p.grad is not None:
                has_sparse_grad |= p.grad.is_sparse
                has_complex |= torch.is_complex(p)
                params_with_grad.append(p)
                grads.append(p.grad)
                state = self.state[p]
                state_sums.append(state["sum"])
                state_steps.append(state["step"])
        return has_sparse_grad, has_complex

    @_use_grad_for_differentiable
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None

        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad: List[Tensor] = []
            grads: List[Tensor] = []
            state_sums: List[Tensor] = []
            state_steps: List[Tensor] = []

            has_sparse_grad, has_complex = self._init_group(
                group, params_with_grad, grads, state_sums, state_steps
            )

            adagrad(
                params_with_grad,
                grads,
                state_sums,
                state_steps,
                lr=group["lr"],
                weight_decay=group["weight_decay"],
                lr_decay=group["lr_decay"],
                eps=group["eps"],
                has_sparse_grad=has_sparse_grad,
                foreach=group["foreach"],
                maximize=group["maximize"],
                differentiable=group["differentiable"],
                has_complex=has_complex,
                fused=group["fused"],
                grad_scale=getattr(self, "grad_scale", None),
                found_inf=getattr(self, "found_inf", None),
            )

        return loss

Pa[  Implements Adagrad algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)}, \: f(\theta)
                \text{ (objective)}, \: \lambda \text{ (weight decay)},                          \\
            &\hspace{12mm}    \tau \text{ (initial accumulator value)}, \: \eta\text{ (lr decay)}\\
            &\textbf{initialize} :  state\_sum_0 \leftarrow \tau                          \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm} \tilde{\gamma}    \leftarrow \gamma / (1 +(t-1) \eta)                  \\
            &\hspace{5mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm}state\_sum_t  \leftarrow  state\_sum_{t-1} + g^2_t                      \\
            &\hspace{5mm}\theta_t \leftarrow
                \theta_{t-1}- \tilde{\gamma} \frac{g_t}{\sqrt{state\_sum_t}+\epsilon}            \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adaptive Subgradient Methods for Online Learning
    and Stochastic Optimization`_.
    aH  
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): initial value of the
            sum of squares of gradients (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)
        z	
        a  
        fused (bool, optional): whether the fused implementation (CPU only) is used.
            Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16`
            are supported. (default: None). Please note that the fused implementations does not
            support sparse or complex gradients.
    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html
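
# A minimal usage sketch of the optimizer class above (illustrative only: the toy
# model, data, and hyperparameter values are made up and not part of this module):
#
#     model = torch.nn.Linear(10, 1)
#     optimizer = torch.optim.Adagrad(model.parameters(), lr=1e-2, lr_decay=0, eps=1e-10)
#     for inputs, targets in [(torch.randn(8, 10), torch.randn(8, 1))]:
#         optimizer.zero_grad()
#         loss = torch.nn.functional.mse_loss(model(inputs), targets)
#         loss.backward()
#         optimizer.step()  # accumulates grad**2 into state["sum"] and updates the params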

def adagrad(
    params: List[Tensor],
    grads: List[Tensor],
    state_sums: List[Tensor],
    state_steps: List[Tensor],
    fused: Optional[bool] = None,
    grad_scale: Optional[Tensor] = None,
    found_inf: Optional[Tensor] = None,
    # Keyword-only args with defaults are not supported by functions compiled with
    # torchscript, so these are left as positional-or-keyword args with defaults.
    has_sparse_grad: bool = False,
    foreach: Optional[bool] = None,
    differentiable: bool = False,
    has_complex: bool = False,
    *,
    lr: float,
    weight_decay: float,
    lr_decay: float,
    eps: float,
    maximize: bool,
):
    r"""Functional API that performs Adagrad algorithm computation.

    See :class:`~torch.optim.Adagrad` for details.
    """
    if not all(isinstance(t, torch.Tensor) for t in state_steps):
        raise RuntimeError(
            "API has changed, `state_steps` argument must contain a list of singleton tensors"
        )

    # Only pick a default implementation when the user specified neither foreach nor fused.
    if fused is None and foreach is None:
        _, foreach = _default_to_fused_or_foreach(
            params, differentiable, use_fused=False
        )
    if fused is None:
        fused = False
    if foreach is None:
        foreach = False

    if foreach and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
    if fused and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with fused optimizers")

    if fused and not torch.jit.is_scripting():
        func = _fused_adagrad
    elif foreach and not torch.jit.is_scripting():
        func = _multi_tensor_adagrad
    else:
        func = _single_tensor_adagrad

    func(
        params,
        grads,
        state_sums,
        state_steps,
        lr=lr,
        weight_decay=weight_decay,
        lr_decay=lr_decay,
        eps=eps,
        has_sparse_grad=has_sparse_grad,
        maximize=maximize,
        differentiable=differentiable,
        has_complex=has_complex,
        grad_scale=grad_scale,
        found_inf=found_inf,
    )


def _make_sparse(grad, grad_indices, values):
    size = grad.size()
    return torch.sparse_coo_tensor(grad_indices, values, size)


def _single_tensor_adagrad(
    params: List[Tensor],
    grads: List[Tensor],
    state_sums: List[Tensor],
    state_steps: List[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    lr: float,
    weight_decay: float,
    lr_decay: float,
    eps: float,
    has_sparse_grad: bool,
    maximize: bool,
    differentiable: bool,
    has_complex: bool,
):
    assert grad_scale is None and found_inf is None
    for param, grad, state_sum, step_t in zip(params, grads, state_sums, state_steps):
        # Update the step count.
        step_t += 1
        step = _get_value(step_t)
        grad = grad if not maximize else -grad

        if weight_decay != 0:
            if grad.is_sparse:
                raise RuntimeError(
                    "weight_decay option is not compatible with sparse gradients"
                )
            grad = grad.add(param, alpha=weight_decay)

        # Decayed learning rate for this step.
        clr = lr / (1 + (step - 1) * lr_decay)

        if grad.is_sparse:
            # The update is non-linear, so indices must be unique.
            grad = grad.coalesce()
            grad_indices = grad._indices()
            grad_values = grad._values()

            state_sum.add_(_make_sparse(grad, grad_indices, grad_values.pow(2)))
            std = state_sum.sparse_mask(grad)
            std_values = std._values().sqrt_().add_(eps)
            param.add_(
                _make_sparse(grad, grad_indices, grad_values / std_values), alpha=-clr
            )
        else:
            is_complex = torch.is_complex(param)
            if is_complex:
                param = torch.view_as_real(param)
                state_sum = torch.view_as_real(state_sum)
                grad = torch.view_as_real(grad)
            state_sum.addcmul_(grad, grad, value=1)
            if differentiable:
                std = state_sum.sqrt() + eps
            else:
                std = state_sum.sqrt().add_(eps)
            param.addcdiv_(grad, std, value=-clr)
            if is_complex:
                param = torch.view_as_complex(param)
                state_sum = torch.view_as_complex(state_sum)


def _multi_tensor_adagrad(
    params: List[Tensor],
    grads: List[Tensor],
    state_sums: List[Tensor],
    state_steps: List[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    lr: float,
    weight_decay: float,
    lr_decay: float,
    eps: float,
    has_sparse_grad: bool,
    maximize: bool,
    differentiable: bool,
    has_complex: bool,
):
    assert not differentiable, "_foreach ops don't support autograd"
    assert grad_scale is None and found_inf is None

    # Foreach functions will throw errors if given empty lists.
    if len(params) == 0:
        return

    grouped_tensorlists = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, state_sums, state_steps]
    )
    for (
        device_params,
        device_grads,
        device_state_sums,
        device_state_steps,
    ), _ in grouped_tensorlists.values():
        device_has_sparse_grad = has_sparse_grad and any(
            grad.is_sparse for grad in device_grads
        )

        if device_has_sparse_grad:
            # Fall back to the single-tensor path for sparse gradients.
            _single_tensor_adagrad(
                device_params,
                device_grads,
                device_state_sums,
                device_state_steps,
                lr=lr,
                weight_decay=weight_decay,
                lr_decay=lr_decay,
                eps=eps,
                has_sparse_grad=True,
                maximize=maximize,
                differentiable=differentiable,
                has_complex=has_complex,
                grad_scale=grad_scale,
                found_inf=found_inf,
            )
            continue

        if has_complex:
            _view_as_real(device_params, device_grads, device_state_sums)

        if maximize:
            device_grads = torch._foreach_neg(device_grads)

        # Update steps. On CPU, wrap the scalar once to avoid re-wrapping it per tensor.
        if device_state_steps[0].is_cpu:
            torch._foreach_add_(
                device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0
            )
        else:
            torch._foreach_add_(device_state_steps, 1)

        if weight_decay != 0:
            # Re-use the intermediate memory (device_grads) already allocated for maximize.
            if maximize:
                torch._foreach_add_(device_grads, device_params, alpha=weight_decay)
            else:
                device_grads = torch._foreach_add(
                    device_grads, device_params, alpha=weight_decay
                )

        minus_clr = [
            -lr / (1 + (_get_value(step) - 1) * lr_decay) for step in device_state_steps
        ]

        torch._foreach_addcmul_(device_state_sums, device_grads, device_grads, value=1)

        std = torch._foreach_sqrt(device_state_sums)
        torch._foreach_add_(std, eps)

        if weight_decay != 0 or maximize:
            # Again, re-use the intermediate memory (device_grads) already allocated.
            torch._foreach_mul_(device_grads, minus_clr)
            numerator = device_grads
        else:
            numerator = torch._foreach_mul(device_grads, minus_clr)

        torch._foreach_addcdiv_(device_params, numerator, std)


def _fused_adagrad(
    params: List[Tensor],
    grads: List[Tensor],
    state_sums: List[Tensor],
    state_steps: List[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    lr: float,
    weight_decay: float,
    lr_decay: float,
    eps: float,
    has_sparse_grad: bool,
    maximize: bool,
    differentiable: bool,
    has_complex: bool,
) -> None:
    if not params:
        return
    if has_sparse_grad or has_complex:
        raise RuntimeError("`fused` does not support sparse grad or complex param")
    if differentiable:
        raise RuntimeError(
            "adagrad with fused=True does not support differentiable=True"
        )

    grad_scale_dict = (
        {grad_scale.device: grad_scale} if grad_scale is not None else None
    )
    found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None

    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, state_sums, state_steps]
    )
    for (device, _), (
        (device_params, device_grads, device_state_sums, device_state_steps),
        _,
    ) in grouped_tensors.items():
        device_grad_scale, device_found_inf = None, None
        if grad_scale is not None and grad_scale_dict is not None:
            if device not in grad_scale_dict:
                grad_scale_dict[device] = grad_scale.to(device, non_blocking=True)
            device_grad_scale = grad_scale_dict[device]
        if found_inf is not None and found_inf_dict is not None:
            if device not in found_inf_dict:
                found_inf_dict[device] = found_inf.to(device, non_blocking=True)
            device_found_inf = found_inf_dict[device]
        torch._foreach_add_(device_state_steps, 1)
        torch._fused_adagrad_(
            device_params,
            device_grads,
            device_state_sums,
            device_state_steps,
            lr=lr,
            lr_decay=lr_decay,
            weight_decay=weight_decay,
            eps=eps,
            maximize=maximize,
            grad_scale=device_grad_scale,
            found_inf=device_found_inf,
        )
        if device_found_inf is not None:
            torch._foreach_sub_(
                device_state_steps, [device_found_inf] * len(device_state_steps)
            )
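
# A minimal sketch of driving the functional API above directly (illustrative only:
# the tensors and hyperparameter values are made up). `Adagrad.step` builds these
# lists from its param groups, but they can also be supplied by hand:
#
#     params = [torch.zeros(3)]
#     grads = [torch.ones(3)]
#     state_sums = [torch.zeros(3)]
#     state_steps = [torch.tensor(0.0)]
#     adagrad(params, grads, state_sums, state_steps,
#             lr=0.1, weight_decay=0.0, lr_decay=0.0, eps=1e-10, maximize=False)
#     # After one call: state_sums holds grad**2 and params moved by -lr * g / (sqrt(sum) + eps).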