o
    i;                     @   sD   d dl mZ d dlZd dlmZmZ d dlmZ G dd deZdS )    )TypeN)	OptimizerParamsT)get_available_devicesc                   @   sv   e Zd Zejjfddddedee de	de
dd	f
d
dZe dddZdddZedd Zdd Zdd Zd	S )CPUOffloadOptimizerFi   )offload_gradientsminimal_sizeparamsoptimizer_classr   r   returnNc                   s  |t jju rd|vr|jdd t|}t|dkrtdt|d ts*d|ig}|_	d_
g _t _t _t d _jd	v sJJ d
tt j _t _ fdd}|D ]b}|d}g }	|D ]F}
|
jsqqk|
 j	k r~|	|
 qkt j|
ddd}t j|dd|_|j|
 dd |j|
< |
| |d|i|gfi |j|
< qkt|	dkrjd|	i| q`tjdkr|jfi |_
dS dS )a  Offload optimizer to CPU for single-GPU training. This will reduce GPU memory by the size of optimizer state.
        Optimizer step will be done on CPU.

        Args
            params: a list of parameters or parameter groups.
            optimizer_class: constructor of the base optimizer. Defaults to :class:`torch.optim.AdamW`.
            offload_gradients: free GPU gradients once they are moved to CPU. Not compatible with gradient accumulation.
            minimal_size: tensors smaller than this are kept on the GPU, to avoid excessively many small transfers.
            kwargs: other keyword arguments to be passed to the base optimizer e.g. `lr`, `weight_decay`.
        fusedT)r   r   z%optimizer got an empty parameter listr	   N)cudaxpuz.CPU Offload currently only supports CUDA & XPUc                    s   | j d urXj|  }jttj  ttjj |j j| j dd W d    n1 s4w   Y  | j	v rBj	| = j
 j	| <  rZ| j j d | _ d S d S d S NTnon_blocking)gradparam_d2h_mapstreamwait_streamgetattrtorchdevicecurrent_streamcopy_queuerecord_eventrecord_stream)p_devicep_hostr   self M/home/ubuntu/.local/lib/python3.10/site-packages/torchao/optim/cpu_offload.pybackward_hookA   s   



z3CPUOffloadOptimizer.__init__.<locals>.backward_hookcpu)r   
pin_memory)r'   r   )r   optimAdamWupdatelistlen
ValueError
isinstancedictr   d_optd_param_groupsr   
optim_dictr   r   r   Streamr   r   poprequires_gradnumelappend
empty_liker   r   detach"register_post_accumulate_grad_hook)r"   r	   r
   r   r   kwargsparam_groupsr%   param_groupretained_paramsr   r    r#   r!   r$   __init__   sX   




zCPUOffloadOptimizer.__init__c              	   C   s   d }|d ur	| }| j d ur| j   | j D ]5\}}|  | j|   | j| }tt| j	
| j
 |j|dd W d    n1 sHw   Y  q| j
  | j  |S r   )r0   stepr   itemssynchronizer2   r   r   r   r   r   r   clear)r"   closurelossr   grad_d2h_eventr    r#   r#   r$   r@   t   s    




zCPUOffloadOptimizer.stepTc                 C   s>   |sJ | j  D ]}d |_q	| jd ur| jj|d d S d S )N)set_to_none)r   keysr   r0   	zero_grad)r"   rG   r   r#   r#   r$   rI      s   
zCPUOffloadOptimizer.zero_gradc                 C   s   t dd | j D | jdS )Nc                 s   s    | ]}|j V  qd S N)r<   .0r(   r#   r#   r$   	<genexpr>   s    z3CPUOffloadOptimizer.param_groups.<locals>.<genexpr>)start)sumr2   valuesr1   )r"   r#   r#   r$   r<      s   z CPUOffloadOptimizer.param_groupsc                 C   s0   ddd | j  D i}| jr| j |d< |S )N	offloadedc                 S   s   g | ]}|  qS r#   )
state_dictrK   r#   r#   r$   
<listcomp>   s    z2CPUOffloadOptimizer.state_dict.<locals>.<listcomp>	on-device)r2   rP   r0   rR   )r"   rR   r#   r#   r$   rR      s
   zCPUOffloadOptimizer.state_dictc                 C   sV   t | j |d D ]	\}}|| q
| jr!| j|d  d S d|v r)tdd S )NrQ   rT   zPloaded state dict has a 'on-device' parameter group not present in the optimizer)zipr2   rP   load_state_dictr0   r-   )r"   rR   r(   optim_state_dictr#   r#   r$   rV      s   z#CPUOffloadOptimizer.load_state_dictrJ   )T)__name__
__module____qualname__r   r(   r)   r   r   r   boolintr?   no_gradr@   rI   propertyr<   rR   rV   r#   r#   r#   r$   r      s.    
a


r   )	typingr   r   torch.optim.optimizerr   r   torchao.utilsr   r   r#   r#   r#   r$   <module>   s
   