o
    oi>v                     @   s   d dl mZmZ d dlmZ d dlmZ d dlm	Z	 d dl
Z
d dlmZ G dd dZG dd	 d	ZG d
d de
jjZG dd deZG dd deZdS )    )abcdefaultdict)deepcopy)chain)OptionalNc                   @   s   e Zd Zdd ZdS )MockArgsc                 C   s   |D ]
}t | |||  qd S N)setattr)selfinitial_datakey r   P/home/ubuntu/.local/lib/python3.10/site-packages/bitsandbytes/optim/optimizer.py__init__   s   zMockArgs.__init__N)__name__
__module____qualname__r   r   r   r   r   r      s    r   c                   @   sJ   e Zd ZdZdZdd Zdd Zedd Zd	d
 Z	dddZ
dd ZdS )GlobalOptimManagerzK
    A global optimizer manager for enabling custom optimizer configs.
    Nc                 C      t d)NzCall get_instance() instead)RuntimeErrorr
   r   r   r   r         zGlobalOptimManager.__init__c                 C   s"   i | _ i | _d | _d| _g | _d S )NF)
pid2configindex2config	optimizeruses_config_overridemodule_weight_config_tripler   r   r   r   
initialize   s
   
zGlobalOptimManager.initializec                 C   s&   | j d u r| | | _ | j   | j S r   )	_instance__new__r   )clsr   r   r   get_instance&   s   

zGlobalOptimManager.get_instancec                 C   sr   t |}t|d tsd|ig}t|D ]"\}}t|d D ]\}}t|| jv r5| jt| | j||f< qqd S )Nr   params)list
isinstancedict	enumerateidr   r   )r
   r"   param_groupsgroup_indexgroupp_indexpr   r   r   register_parameters-   s   
z&GlobalOptimManager.register_parametersc                 C   s   d| _ t|tjjr|g}t|tjr|g}|dur(|dur(|du s$J ||i}|durJ|D ]}t|| jv rB| jt| | q.|| jt|< q.dS dS )a  
        Override initial optimizer config with specific hyperparameters.

        The key-values of the optimizer config for the input parameters are overridden
        This can be both, optimizer parameters like `betas` or `lr`, or it can be
        8-bit specific parameters like `optim_bits` or `percentile_clipping`.

        Arguments:
           parameters (`torch.Tensor` or `list(torch.Tensors)`):
             The input parameters.
           key (`str`):
             The hyperparamter to override.
           value:
             The hyperparameter values.
           key_value_dict (`dict`):
             A dictionary with multiple key-values to override.

        Example:

        ```py
        import torch
        import bitsandbytes as bnb

        mng = bnb.optim.GlobalOptimManager.get_instance()

        model = MyModel()
        mng.register_parameters(model.parameters()) # 1. register parameters while still on CPU

        model = model.cuda()
        # use 8-bit optimizer states for all parameters
        adam = bnb.optim.Adam(model.parameters(), lr=0.001, optim_bits=8)

        # 2. override: the parameter model.fc1.weight now uses 32-bit Adam
        mng.override_config(model.fc1.weight, 'optim_bits', 32)
        ```
        TN)	r   r$   torchnn	ParameterTensorr'   r   update)r
   
parametersr   valuekey_value_dictr,   r   r   r   override_config7   s   %z"GlobalOptimManager.override_configc                 C   s   | j |||f d S r   )r   append)r
   module
param_nameconfigr   r   r   register_module_overridel   s   z+GlobalOptimManager.register_module_override)NNN)r   r   r   __doc__r   r   r   classmethodr!   r-   r6   r;   r   r   r   r   r      s    


5r   c                       s   e Zd Zd fdd	Zdd Z fddZdd
dZdd Zdd Ze	
 dddZdd Zdd Zdd Ze	jfddZdd Z  ZS ) Optimizer8bit    Fc                    sX   t  || d| _i | _|| _tj | _t	 | _
h d| _|dkr*|   dS dS )az  
        Base 8-bit optimizer class.

        Arguments:
            params (`torch.tensor`):
                The input parameters to optimize.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            is_paged (`bool`, defaults to `False`):
                Whether the optimizer is a paged optimizer or not.
        F>   max1max2qmap1qmap2state1state2absmax1absmax2new_max1new_max2	gnorm_vec	unorm_vec   N)superr   initialized	name2qmapis_pagedFGlobalPageManagerr!   page_mngr   mngnon_castable_tensor_keys	fill_qmap)r
   r"   defaults
optim_bitsrP   	__class__r   r   r   q   s   

zOptimizer8bit.__init__c                 C   s(   t jdd| jd< t jdd| jd< d S )NT)signeddynamicFudynamic)rQ   create_dynamic_maprO   r   r   r   r   rV      s   zOptimizer8bit.fill_qmapc                    s   t  | d S r   )rM   __setstate__)r
   staterY   r   r   r_      s   zOptimizer8bit.__setstate__Tc                    s(  t |}j}|d }t|t|krtddd |D }dd |D }tdd t||D r5tddd	 ttd
d |D tdd |D D } fdd tt	}|d 
 D ]\}	}
|	|v ru||	 } ||
||< qa|
||	< qadd fddt||D }||d dS )a1  Load an optimizer state.

        Arguments:
            state_dict (`dict`):
                An optimizer state (should be returned from a call to `state_dict`) to load.
            move_to_device (`bool`, defaults to `True`):
                Whether to move the optimizer's state to the device.
        r(   z<loaded state dict has a different number of parameter groupsc                 s       | ]	}t |d  V  qdS r"   Nlen.0gr   r   r   	<genexpr>       z0Optimizer8bit.load_state_dict.<locals>.<genexpr>c                 s   ra   rb   rc   re   r   r   r   rh      ri   c                 s   s    | ]	\}}||kV  qd S r   r   )rf   p_lens_lenr   r   r   rh      ri   z]loaded state dict contains a parameter group that doesn't match the size of optimizer's groupc                 S   s   i | ]\}}||qS r   r   )rf   old_idr,   r   r   r   
<dictcomp>   s    z1Optimizer8bit.load_state_dict.<locals>.<dictcomp>c                 s       | ]}|d  V  qdS rb   r   re   r   r   r   rh          c                 s   rn   rb   r   re   r   r   r   rh      ro   c                    s   t |tjr  r|jtjkr| j}|S t |tr?| D ]\}}|j	v r5r4| j
||< q! |||< q!|S t |tjrSt| fdd|D S |S )zBMake a deep copy of value, casting all tensors to device of param.c                 3   s    | ]} |V  qd S r   r   )rf   v)castparamr   r   rh      s    z>Optimizer8bit.load_state_dict.<locals>.cast.<locals>.<genexpr>)r$   r.   r1   is_floating_pointdtypeuint8tor%   itemsrU   devicecontainer_abcsIterabletype)rr   r4   krp   )rq   move_to_devicer
   )rr   r   rq      s   

z+Optimizer8bit.load_state_dict.<locals>.castr`   c                 S   s   | d |d< |S )Nr"   r   )r*   	new_groupr   r   r   update_group   s   z3Optimizer8bit.load_state_dict.<locals>.update_groupc                    s   g | ]	\}} ||qS r   r   )rf   rg   ng)r   r   r   
<listcomp>   s    z1Optimizer8bit.load_state_dict.<locals>.<listcomp>)r`   r(   N)r   r(   rd   
ValueErroranyzipr   from_iterabler   r%   rw   r_   )r
   
state_dictr}   groupssaved_groups
param_lens
saved_lensid_mapr`   r|   rp   rr   r(   r   )rq   r}   r
   r   r   load_state_dict   s6   

zOptimizer8bit.load_state_dictc           	      C   s   t | jD ];\}}t |d D ]0\}}|| jv r?| j| }| D ]\}}t|tjr>t|dd}|s>||j	| j| |< q!qqd S )Nr"   rP   F)
r&   r(   r`   rw   r$   r.   r1   getattrrv   rx   )	r
   gindexr*   pindexr,   valuesr|   rp   rP   r   r   r   to_gpu   s   

zOptimizer8bit.to_gpuc           
      C   s   | j jD ]]\}}}t||}|d usJ t|tjs"t|tjs"J d}t| jD ]7\}}|r1 n0t|d D ](\}}	|r? n!t	|	t	|kr_|| j j
t	|	< | j j
t	|	 | j j||f< d}q7q)qd S )NFr"   T)rT   r   r   r$   r.   r1   r0   r&   r(   r'   r   r   )
r
   r8   attrr:   pmodulefoundr   r*   r   r,   r   r   r   check_overrides   s"   
zOptimizer8bit.check_overridesNc           	      C   s   d}|durt   | }W d   n1 sw   Y  g }| js-|   |   d| _t| jD ]:\}}t|d D ]/\}}|jdu rFq<| j| }t	|dkrY| 
|||| | | | |||| t j  q<q2| jrut j  |S )zPerform a single optimization step.

        Arguments:
            closure (`Callable`, *optional*, defaults to `None`):
                A closure that reevaluates the model and returns the loss.
        NTr"   r   )r.   enable_gradrN   r   r   r&   r(   gradr`   rd   
init_stateprefetch_stateupdate_stepcudasynchronizerP   )	r
   closureloss	overflowsr   r*   r   r,   r`   r   r   r   step  s0   





zOptimizer8bit.stepc                 C   s   i }|d |d< |d |d< |d |d< |d |d< | d|d< | d|d< | d|d< | jj|d< | jj|d	< | jj|d
< | jj|d< | jj|d< | jj|d< ||f| jj	v rf|
| jj	||f  |S )Nbetasepsweight_decaylralphat_alphat_beta3rX   min_8bit_sizepercentile_clipping
block_wise	max_unorm
skip_zeros)getargsrX   r   r   r   r   r   rT   r   r2   )r
   r   r   r*   r:   r   r   r   
get_config,  s"   zOptimizer8bit.get_configc                 C   r   )Nz(init_state method needs to be overriddenNotImplementedErrorr
   r*   r,   r   r   r   r   r   r   @  r   zOptimizer8bit.init_statec                 C   r   )Nz-The update_step method needs to be overriddenr   r   r   r   r   r   C  r   zOptimizer8bit.update_stepc                 C   sX   | j r	| dk rtj|||jdS tj|j||jd}t|d | j	j
| |S )Ng     j@rt   rx   r   )rP   numelr.   
zeros_likerx   rQ   	get_pagedshapefillrS   paged_tensorsr7   )r
   r,   rt   buffr   r   r   get_state_bufferF  s   zOptimizer8bit.get_state_bufferc                 C   s\   | j r(| j| }|d }t|dd}|r*t|d  d|v r,t|d  d S d S d S d S )NrD   rP   FrE   )rP   r`   r   rQ   prefetch_tensor)r
   r,   r`   s1rP   r   r   r   r   P  s   
zOptimizer8bit.prefetch_state)r?   F)Tr   )r   r   r   r   rV   r_   r   r   r   r.   no_gradr   r   r   r   float32r   r   __classcell__r   r   rY   r   r>   p   s    %
J&
r>   c                       sn   e Zd Z												
	
			ddee dee f fddZe dd Ze dd Z	  Z
S )Optimizer2StateMbP??g+?:0yE>        r?   N   d   TFr   r   c              	      sD  d|kst d| d|kst d| t|tr1|dddd d}dd	 |D }tt|D ]}d||   krEd
k sRn t d| d||  q7d|ks^t d| t|||||||d}t	 
|||| |du ri }||d< d|d< |	|d< |
|d< ||d< ||d< ||d< t|| _n|| _|| _dS )ah  
        Base 2-state update optimizer class.

        Arguments:
            optimizer_name (`str`):
                The name of the optimizer.
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple`, defaults to (0.9, 0.999)):
                The beta values for the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value for the optimizer.
            weight_decay (`float`, defaults to 0.0):
                The weight decay value for the optimizer.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            max_unorm (`float`, defaults to 0.0):
                The maximum value to normalize each block with.
            skip_zeros (`bool`, defaults to `False`):
                Whether to skip zero values for sparse gradients and models to ensure correct updates.
            is_paged (`bool`, defaults to `False`):
                Whether the optimizer is a paged optimizer or not.
            alpha (`float`, defaults to 0.0):
                The alpha value for the AdEMAMix optimizer.
            t_alpha (`Optional[int]`, defaults to `None`):
                Number of iterations for alpha scheduling with AdEMAMix.
            t_beta3 (`Optional[int]`, defaults to `None`):
                Number of iterations for beta scheduling with AdEMAMix.

        r   Invalid learning rate: Invalid epsilon value: ( ),c                 S   s   g | ]}t |qS r   )float)rf   br   r   r   r     s    z,Optimizer2State.__init__.<locals>.<listcomp>      ? Invalid beta parameter at index : Invalid weight_decay value: )r   r   r   r   r   r   r   NrX   r   r   r   r   r   r   )r   r$   strreplacestripsplitrangerd   r%   rM   r   r   r   optimizer_name)r
   r   r"   r   r   r   r   rX   r   r   r   r   r   r   rP   r   r   r   irW   rY   r   r   r   \  s:   <

zOptimizer2State.__init__c           
      C   sT  |  |||}|d dkrtj}n|d dkrtj}n	td|d  | |d k r/tj}| j| }d|d< |tjkrR| j|tjd|d	< | j|tjd|d
< n|tjkr|d dkrd| jvrg| 	  | jd 
|j| jd< | jd 
|j| jd< | j|tjd|d	< | jd |d< | j|tjd|d
< | jd |d< |d r| }|d }	|	|d dkrdnd7 }	tj|	ftj|jd|d< tj|	ftj|jd|d< n0tjdtj|jd|d< tjdtj|jd|d< tjdtj|jd|d< tjdtj|jd|d< |d dk rtjd|jd|d< |d d kr(tjd|jd|d!< d S d S )"NrX   r?   rL   (Amount of optimizer bits not supported: r   r   r   rt   rD   rE   r\   r]   rB   rC   r         r   rF   rG   r   r@   rH   rA   rI   r   r   r   rx   rJ   r   r   rK   r   r.   r   ru   r   r   r`   r   rO   rV   rv   rx   zeros
r
   r*   r,   r   r   r:   rt   r`   nblocksr   r   r   r     sL   


zOptimizer2State.init_statec                 C   s  |j  |_ |j |_| j| }|j}| |||}|d  d7  < |d }|d dk r=t||d ||d \}	}
}nd}|d jtj	krtj
| j|||d |d d	 |d
 ||d |d |d d t|d dkrr|d d nd|d |d ||d dkr|d nd |d |d d d S |d jtjkr|d stj| j|||d |d |d d	 |d d |d
 ||d |d |d |d |d |d |d |d ||d dkr|d nd |d d |d |d |d< |d< |d |d |d< |d< d S |d jtjkrR|d rTtj| j|||d |d |d d	 |d d t|d dkr/|d d nd|d |d
 ||d |d |d |d |d |d ||d d  d S d S d S )!Nr   r   r   r   rJ   r   rD   r   r   r   r   rE         r   r   r   r   rK   r   r   r   r   rB   rC   r@   rA   rH   rI   )gnorm_scalerK   r   rF   rG   r   r   )data
contiguousr   r`   r   rQ   r   rt   r.   r   optimizer_update_32bitr   rd   ru   optimizer_update_8bitoptimizer_update_8bit_blockwiser
   r*   r,   r   r   r`   r   r:   r   current_gnorm
clip_valuer   r   r   r   r     s   







 
zOptimizer2State.update_step)r   r   r   r   r?   Nr   r   Tr   FFr   NN)r   r   r   r   intr   r.   r   r   r   r   r   r   rY   r   r   [  s2    `
3r   c                       sV   e Zd Z												
	
d fdd	Ze dd Ze dd Z  ZS )Optimizer1Stater   r   r   r   r   r?   Nr   r   TFc                    s  d|kst d| d|kst d| tt|D ]}d||   kr*dk s7n t d| d||  qd|ksCt d| t||||d}t |||| |d	u r|i }||d
< d|d< |	|d< |
|d< ||d< ||d< ||d< t|| _n|| _|| _d	S )a  
        Base 1-state update optimizer class.

        Arguments:
            optimizer_name (`str`):
                The name of the optimizer.
            params (`torch.tensor`):
                The input parameters to optimize.
            lr (`float`, defaults to 1e-3):
                The learning rate.
            betas (`tuple`, defaults to (0.9, 0.0)):
                The beta values for the optimizer.
            eps (`float`, defaults to 1e-8):
                The epsilon value for the optimizer.
            weight_decay (`float`, defaults to 0.0):
                The weight decay value for the optimizer.
            optim_bits (`int`, defaults to 32):
                The number of bits of the optimizer state.
            args (`object`, defaults to `None`):
                An object with additional arguments.
            min_8bit_size (`int`, defaults to 4096):
                The minimum number of elements of the parameter tensors for 8-bit optimization.
            percentile_clipping (`int`, defaults to 100):
                Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability.
            block_wise (`bool`, defaults to `True`):
                Whether to independently quantize each block of tensors to reduce outlier effects and improve stability.
            max_unorm (`float`, defaults to 0.0):
                The maximum value to normalize each block with.
            skip_zeros (`bool`, defaults to `False`):
                Whether to skip zero values for sparse gradients and models to ensure correct updates.
            is_paged (`bool`, defaults to `False`):
                Whether the optimizer is a paged optimizer or not.
        r   r   r   r   r   r   r   )r   r   r   r   NrX   r   r   r   r   r   r   )	r   r   rd   r%   rM   r   r   r   r   )r
   r   r"   r   r   r   r   rX   r   r   r   r   r   r   rP   r   rW   rY   r   r   r   Q  s0   2
zOptimizer1State.__init__c           
      C   s  |  |||}|d dkrtj}n|d dkrtj}n	td|d  | |d k r/tj}| j| }d|d< |tjkrH| j|tjd|d	< no|tjkr|d dkrhd
| jvr\| 	  | jd
 
|j| jd
< | j|tjd|d	< | jd
 |d< |d r| }|d }	|	|d dkrdnd7 }	tj|	ftj|jd|d< ntjdtj|jd|d< tjdtj|jd|d< |d dk rtjd|jd|d< |d dkrtjd|jd|d< d S d S )NrX   r?   rL   r   r   r   r   r   rD   r\   rB   r   r   r   r   rF   r   r@   rH   r   r   r   r   rJ   r   r   rK   r   r   r   r   r   r     s>   



zOptimizer1State.init_statec                 C   s0  |j  |_ |j |_| j| }|j}| |||}|d  d7  < |d }|d dk r=t||d ||d \}	}
}nd}|d jtj	kr~tj
| j|||d |d d	 |d
 ||d d |d d dd|d ||d dkrr|d nd |d |d d d S |d jtjkr|d stj| j|||d d |d d	 |d d |d
 ||d |d d |d d |d d |d ||d dkr|d nd |d d |d |d |d< |d< d S |d jtjkr|d rtj| j|||d d |d d	 |d d dd|d
 ||d |d d |d d |d ||d d d S d S d S )Nr   r   r   r   rJ   r   rD   r   r   r   r   r   r   r   rK   r   r   r   rB   r@   rH   )r   rF   r   )r   r   r   r`   r   rQ   r   rt   r.   r   r   r   ru   r   r   r   r   r   r   r     s   








zOptimizer1State.update_step)r   r   r   r   r?   Nr   r   Tr   FF)	r   r   r   r   r.   r   r   r   r   r   r   rY   r   r   P  s$    N
+r   )collectionsr   ry   r   copyr   	itertoolsr   typingr   r.   bitsandbytes.functional
functionalrQ   r   r   optim	Optimizerr>   r   r   r   r   r   r   <module>   s   [ l v