o
    ziE                     @   s   d Z ddlmZ ddlmZmZmZmZmZm	Z	m
Z
mZ ddlZddlmZmZ ddlmZ ddlmZ ddlmZ ddlZdd	lmZ dd
lmZ ddlmZ ddlmZ ddlm Z m!Z! ddl"m#Z# eeeegef Z$G dd deZ%dS )zK
Stochastic Weight Averaging Callback
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )deepcopy)AnyCallableDictListLiteralOptionalUnioncastN)Tensornn)LRScheduler)SWALR)override)Callback)DeepSpeedStrategy)FSDPStrategy)MisconfigurationException)rank_zero_inforank_zero_warn)LRSchedulerConfigc                   @   s  e Zd Zddddedfdeeee f deeef ded	e	d
 de
e de
eejef  fddZedefddZedefddZedddefddZedddddeddfddZedBddZedBdd Zeddd!eddfd"d#ZedBd$d%ZedCd(d)ZdDd*d+ZdEd,d-Zed.dd/dd0ededdf
d1d2Zed3ed4ed5edefd6d7Z ede!eef fd8d9Z"ed:e!eef ddfd;d<Z#edFd=d>Z$d?eddfd@dAZ%dS )GStochasticWeightAveragingg?
   cosNcpuswa_lrsswa_epoch_startannealing_epochsannealing_strategy)r   linearavg_fndevicec                 C   sF  d}t |tr|dk rt|t |tr&d|  kr!dks&t| t|t |ttf }t |to6|dk}	t |toEtdd |D  }
|sL|	sL|
rPtd|dur\t|s\td|durot |tjt	fsotd	| d| _
|| _|| _|| _|| _|p| j| _|| _d| _d| _d
| _d| _d| _d| _d| _i | _|  dS )a	  Implements the Stochastic Weight Averaging (SWA) Callback to average a model.

        Stochastic Weight Averaging was proposed in ``Averaging Weights Leads to
        Wider Optima and Better Generalization`` by Pavel Izmailov, Dmitrii
        Podoprikhin, Timur Garipov, Dmitry Vetrov and Andrew Gordon Wilson
        (UAI 2018).

        This documentation is highly inspired by PyTorch's work on SWA.
        The callback arguments follow the scheme defined in PyTorch's ``swa_utils`` package.

        For a SWA explanation, please take a look
        `here <https://pytorch.org/blog/pytorch-1.6-now-includes-stochastic-weight-averaging>`_.

        .. warning::  This is an :ref:`experimental <versioning:Experimental API>` feature.

        .. warning:: ``StochasticWeightAveraging`` is currently not supported for multiple optimizers/schedulers.

        .. warning:: ``StochasticWeightAveraging`` is currently only supported on every epoch.

        See also how to :ref:`enable it directly on the Trainer <advanced/training_tricks:Stochastic Weight Averaging>`

        Arguments:

            swa_lrs: The SWA learning rate to use:

                - ``float``. Use this value for all parameter groups of the optimizer.
                - ``List[float]``. A list values for each parameter group of the optimizer.

            swa_epoch_start: If provided as int, the procedure will start from
                the ``swa_epoch_start``-th epoch. If provided as float between 0 and 1,
                the procedure will start from ``int(swa_epoch_start * max_epochs)`` epoch

            annealing_epochs: number of epochs in the annealing phase (default: 10)

            annealing_strategy: Specifies the annealing strategy (default: "cos"):

                - ``"cos"``. For cosine annealing.
                - ``"linear"`` For linear annealing

            avg_fn: the averaging function used to update the parameters;
                the function must take in the current value of the
                :class:`AveragedModel` parameter, the current value of :attr:`model`
                parameter and the number of models already averaged; if None,
                equally weighted average is used (default: ``None``)

            device: if provided, the averaged model will be stored on the ``device``.
                When None is provided, it will infer the `device` from ``pl_module``.
                (default: ``"cpu"``)

        zBswa_epoch_start should be a >0 integer or a float between 0 and 1.   r   c                 s   s"    | ]}|d kot |tV  qdS )r   N)
isinstancefloat).0lr r'   e/home/ubuntu/.local/lib/python3.10/site-packages/pytorch_lightning/callbacks/stochastic_weight_avg.py	<genexpr>l   s     z5StochasticWeightAveraging.__init__.<locals>.<genexpr>zCThe `swa_lrs` should a positive float, or a list of positive floatsNz The `avg_fn` should be callable.z8device is expected to be a torch.device or a str. Found F)r#   intr   r$   listallcallabletorchr!   str
n_averaged_swa_epoch_start_swa_lrs_annealing_epochs_annealing_strategyr    _avg_fn_device_model_contains_batch_norm_average_model_initialized_swa_scheduler_scheduler_state_init_n_averaged_latest_update_epochmomenta)selfr   r   r   r   r    r!   err_msg
wrong_typewrong_float
wrong_listr'   r'   r(   __init__(   s@   <z"StochasticWeightAveraging.__init__returnc                 C   s    t | jtsJ t| jd dS )Nr"   r   )r#   r2   r+   maxr@   r'   r'   r(   	swa_start   s   z#StochasticWeightAveraging.swa_startc                 C   s
   | j d S Nr"   )_max_epochsrH   r'   r'   r(   swa_end   s   
z!StochasticWeightAveraging.swa_end	pl_modulepl.LightningModulec                 C   s   t dd |  D S )Nc                 s   s     | ]}t |tjjjV  qd S N)r#   r   modules	batchnorm
_BatchNorm)r%   moduler'   r'   r(   r)      s    zJStochasticWeightAveraging.pl_module_contains_batch_norm.<locals>.<genexpr>)anyrP   )rM   r'   r'   r(   pl_module_contains_batch_norm   s   z7StochasticWeightAveraging.pl_module_contains_batch_normtrainer
pl.Trainerstagec                 C   s&   t |jttfrtdt|| _d S )Nz.SWA does not currently support sharded models.)r#   strategyr   r   r   r   r9   )r@   rV   rM   rX   r'   r'   r(   setup   s   zStochasticWeightAveraging.setupc                 C   s   t |jdkrtdt |jdkrtd|jd usJ t| jtr,t|j| j | _| 	|| _
|j| _| j
rI|jjd usAJ |j jd7  _| jd urU| | d S d S )Nr"   z'SWA currently works with 1 `optimizer`.z;SWA currently not supported for more than 1 `lr_scheduler`.)len
optimizersr   lr_scheduler_configs
max_epochsr#   r2   r$   r+   rU   r8   rK   fit_loopr<   _clear_schedulersr@   rV   rM   r'   r'   r(   on_fit_start   s   
z&StochasticWeightAveraging.on_fit_startc              	   C   s  | j s| j|j  kr| jkrn nd| _ | jd usJ | j| jp$|j| _|jd }t	| j
tr<| j
gt|j | _
t| j
|jD ]\}}||d< qC|jd usSJ ttt|| j
| j| j| jdkre|jndd| _| jd urx| j| j n
|j| jkrtd t| j}|jdksJ |jd	ksJ |jr|jd }|jdks|jd	krtd
|  td|jjj  d| jjj  d ||jd< n|j!| | j"d u rt#j$| j%t#j&|jd| _"| j|j  kr| jkrn n%|j| j'kr| j"d usJ | jd usJ | (| j|| j"| j) |j| _'|j| jd	 krQ| jd us"J | *| j| | +| |j, j-d	7  _-d|j,_.|j/| _0t	|j,j-t1sJJ d|j,j-|_/d S d S )NTr   
initial_lrr   r*   )swa_lranneal_epochsanneal_strategy
last_epochzSWA is initializing after swa_start without any checkpoint data. This may be caused by loading a checkpoint from an older version of PyTorch Lightning.epochr"   z3SWA is currently only supported every epoch. Found zSwapping scheduler `z` for ``)dtyper!   z)Iterable-style datasets are not supported)2r:   rI   current_epochrL   r9   tor7   r!   r\   r#   r3   r$   r[   param_groupszipr^   r
   r   r   r4   r5   r;   r<   load_state_dictr   r   interval	frequencyr]   r   	scheduler	__class____name__appendr1   r/   tensorr=   longr>   update_parametersr6   transfer_weightsreset_batch_norm_and_save_stater_   max_batches_skip_backwardaccumulate_grad_batches_accumulate_grad_batchesr+   )r@   rV   rM   	optimizerr&   groupdefault_scheduler_cfgscheduler_cfgr'   r'   r(   on_train_epoch_start   st   $






 
z.StochasticWeightAveraging.on_train_epoch_startargsc                 G   s   d|j _d S )NF)r_   r|   )r@   rV   r   r'   r'   r(   on_train_epoch_end  s   z,StochasticWeightAveraging.on_train_epoch_endc                 C   s   | j r/|jd | jd kr/| j|_|j jd8  _|jjd us!J |j jd8  _|   d S |jd | jkrG| j	d us>J | 
| j	| d S d S rJ   )r8   rk   rL   r~   r}   r_   r{   r^   reset_momentar9   ry   ra   r'   r'   r(   on_train_end  s   z&StochasticWeightAveraging.on_train_endsrc_pl_moduledst_pl_modulec                 C   s6   t |  | D ]\}}| ||j q	d S rO   )rn   
parametersdetachcopy_rl   r!   )r   r   	src_param	dst_paramr'   r'   r(   ry     s   z*StochasticWeightAveraging.transfer_weightsc                 C   s   i | _ | D ]J}t|tjjjsq|jdusJ tj|j|j	|jj
d|_|jdus-J tj|j|j	|jj
d|_|j| j |< d|_|jdusJJ | jd9  _qdS )z_Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L140-L154.N)r!   rj   r   )r?   rP   r#   r   rQ   rR   running_meanr/   
zeros_liker!   rj   running_var	ones_likemomentumnum_batches_tracked)r@   rM   rS   r'   r'   r(   rz     s*   z9StochasticWeightAveraging.reset_batch_norm_and_save_statec                 C   s   | j D ]}| j | |_qdS )z_Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L164-L165.N)r?   r   )r@   	bn_moduler'   r'   r(   r   6  s   
z'StochasticWeightAveraging.reset_momentaaverage_modelmodelr1   c           
      C   sl   t |  | D ]&\}}|j}| }| |}|dkr!|n|||||}	||	 q	|d7 }dS )z_Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L104-L112.r   r"   N)rn   r   r!   r   rl   r   )
r   r   r1   r    p_swap_modelr!   p_swa_p_model_srcr'   r'   r(   rx   ;  s   z+StochasticWeightAveraging.update_parametersaveraged_model_parametermodel_parameternum_averagedc                 C   s   | ||  |d   S )z]Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L95-L97.r"   r'   )r   r   r   r'   r'   r(   r    H  s   z StochasticWeightAveraging.avg_fnc                 C   sP   | j d u rdn| j  | j| jd u rd n| j | jd u r!d dS | j dS )Nr   )r1   latest_update_epochscheduler_stateaverage_model_state)r1   itemr>   r;   
state_dictr9   rH   r'   r'   r(   r   M  s   z$StochasticWeightAveraging.state_dictr   c                 C   s0   |d | _ |d | _|d | _| |d  d S )Nr1   r   r   r   )r=   r>   r<   _load_average_model_state)r@   r   r'   r'   r(   ro   V  s   


z)StochasticWeightAveraging.load_state_dictc                 C   s*   | j rt| j dksJ | j   d S d S rJ   )r]   r[   clear)rV   r'   r'   r(   r`   ]  s   	z+StochasticWeightAveraging._clear_schedulersmodel_statec                 C   s   | j d u rd S | j | d S rO   )r9   ro   )r@   r   r'   r'   r(   r   j  s   
z3StochasticWeightAveraging._load_average_model_state)rV   rW   rM   rN   rF   N)r   rN   r   rN   rF   N)rM   rN   rF   N)rF   N)rV   rW   rF   N)&rt   
__module____qualname__r/   r!   r	   r$   r   r+   r   r   _AVG_FNr0   rE   propertyrI   rL   staticmethodboolrU   r   rZ   rb   r   r   r   r   ry   rz   r   r   rx   r    r   r   ro   r`   r   r'   r'   r'   r(   r   '   sx    

_S

r   )&__doc__copyr   typingr   r   r   r   r   r   r	   r
   r/   r   r   torch.optim.lr_schedulerr   torch.optim.swa_utilsr   typing_extensionsr   pytorch_lightningpl$pytorch_lightning.callbacks.callbackr   pytorch_lightning.strategiesr   !pytorch_lightning.strategies.fsdpr   &pytorch_lightning.utilities.exceptionsr   %pytorch_lightning.utilities.rank_zeror   r   !pytorch_lightning.utilities.typesr   r   r   r'   r'   r'   r(   <module>   s"   (