o
    i#                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlmZm	Z	 ddl
mZmZmZmZ ejjZejjZejjZd dlmZ edd	d
d Zedd	dd ZG dd deZeejjdd Zeejjdd Zeejjdd Zeejjdd Zej jej jej!jej!jej"jgZ#e	dre#$ej%j ee#dd Zeej&jdd Zeej'jdd Zeeg dS )    N)Tensor)add_safe_globals)return_and_correct_aliasing)TorchAOBaseTensortorch_version_at_least   )create_dynamic_mapdequant_with_qmapquantize_4bit_with_qmapscale_tensor)	lru_cache)maxsizec                   C   s   t dddS )NT      )r    r   r   O/home/ubuntu/.local/lib/python3.10/site-packages/torchao/optim/subclass_4bit.pyget_qmap_signed"   s   r   c                   C   s   t jddddddd   S )Nr   r      cpudevice)torchlinspacetolistr   r   r   r   get_qmap_unsigned'   s   r   c                	   @   s   e Zd Zg dZededededefddZdedededefdd	Zd
d Z	e
	dddZdddZe
ddedefddZdd ZdS )OptimState4bit)codesscaleqmapr   r   r   signedc                 C   s   t j| ||jdS )Nr   )r   _make_wrapper_subclassr   )clsr   r   r   r   shaper   r   r   __new__/   s   zOptimState4bit.__new__c                 C   st   |j tju sJ |jdksJ |jdksJ |j tju sJ || _|| _|| _|| _|| _	|
 d |
  | _dS )aA  Create quantized 4-bit optimizer state as proposed in https://arxiv.org/abs/2309.01507

        Args
            codes: quantized and packed 4-bit data stored as uint8.
            scale: scale data for block-wise quantization.
            qmap: lookup table that maps between quantized value (code) and float value.
            signed: whether the tensor is signed or unsigned.
            shape: shape of original float tensor.

        NOTE: To get block-wise scale, the original float tensor is first reshape to (-1, block_size).
        Thus, the last dimension of the original float tensor is not necessarily divisible by block size.
        Given `codes` and `scale`, `block_size` is calculated as `codes.numel() * 2 // scale.numel()`.
        The extra `* 2` is because `codes` is 4-bit data packed in 8-bit storage.
        r      N)dtyper   uint8ndimfloat32r   r   r   r   _shapenumel
block_size)selfr   r   r   r   r"   r   r   r   __init__3   s   zOptimState4bit.__init__c                 C   s   | j | j| jgfS N)tensor_attrsr   r)   r,   r   r   r   __tensor_flatten__M   s   z!OptimState4bit.__tensor_flatten__Nc                    s"   | g  fdd| j D |R  S )Nc                    s   g | ]} | qS r   r   ).0nametensor_data_dictr   r   
<listcomp>U   s    z7OptimState4bit.__tensor_unflatten__.<locals>.<listcomp>)r/   )r!   r5   tensor_attributes
outer_sizeouter_strider   r4   r   __tensor_unflatten__P   s
   z#OptimState4bit.__tensor_unflatten__c                 C   sL   t j| jd? | jd@ gdd}t|| j| j}|d ur ||}|| jS )Nr      )dim)	r   stackr   r	   r   r   toviewr)   )r,   output_dtyper   
float_datar   r   r   
dequantizeX   s
   
zOptimState4bit.dequantizeT   r+   c           
      C   sx   t |tr|fn|}t|}tj|d tj|d}tj|| |d}|r(t nt }tj	|tj
|d}	| |||	||S )Nr$   )r%   r   r   )
isinstanceintmathprodr   zerosr&   r   r   tensorr(   )
r!   r"   r   r+   r   n_elemsr   r   	qmap_listr   r   r   r   rI   _   s   
zOptimState4bit.zerosc                 C   s:   | j j d| j d| j dt| j d| j d| j dS )Nz(signed=z, block_size=z, shape=z	, device=z, requires_grad=))	__class____name__r   r+   tupler"   r   requires_gradr0   r   r   r   __repr__j   s   zOptimState4bit.__repr__)NNr.   )TrD   N)rO   
__module____qualname__r/   staticmethodr   boolr#   r-   r1   classmethodr:   rC   rF   rI   rR   r   r   r   r   r   ,   s    

r   c           	      C   s   |d }|d }t |tr6t |tr6|j|jkr$|j|jkr$|j|jks&J |j|j |j|j |S t |trht|	d|j\}}t
||j}|j|d d d d> |dd d B  |j| |S ||  |S )Nr   r   r<   r$   r   )rE   r   r   r+   r)   r   copy_r   r   r@   r
   r   rC   )	functypesargskwargsdstsrc
scaled_srcr   r   r   r   r   _q   s"   
(r`   c                 C   s`   | dd }t|d jj|d|d jj|d|d jj|d|d j|d j}t| |||S )Nr   r   r   )	getr   r   r?   r   r   r   r"   r   )rY   rZ   r[   r\   r   outr   r   r   r`      s   c                 C   s   dd |D }| |i |S )Nc                 S   s"   g | ]}t |tr| n|qS r   )rE   r   rC   )r2   xr   r   r   r6      s   " z_.<locals>.<listcomp>r   rY   rZ   r[   r\   r   r   r   r`      s   c                 C   s|   |\}}t |jt |krt|j|j|j|j|jS t|dkr5|d dkr5t|j|j|j|j|	 fS t
|jj d)Nr   r   r<   z4 only supports .view() with same shape or shape=[-1])rP   r"   r   r   r   r   r   r)   lenr*   
ValueErrorrN   rO   )rY   rZ   r[   r\   rc   r"   r   r   r   r`      s   z
2.11.0.devc                 C   s   |d }t |tstdt| | |jg|dd  R i |}| |jg|dd  R i |}|jd |  |j  f|jdd   }t|||j	 |j
|S )Nr   z%expecting a OptimState4bit but found r   )rE   r   rf   typer   r   r)   r*   r   cloner   )rY   rZ   r[   r\   rc   r   r   r"   r   r   r   r`      s   
  ,c                 C   s*   |d j  o|d j o|d j S )Nr   )r   	is_pinnedr   r   rd   r   r   r   r`      s
   c              
   C   s*  |d d \}}}}t |dkr|d nd}|dkrtd|dkr&td|j}	t|jdd  }
||
 |	 dksC||
 |	 dkrUtd|j d|	 d| d	| d
	|j||
 d ||
 d  }|j||
 |	 ||
 |	  }|jd |  |j  f|jdd   }t	|||j
 |j|S )Nr   r   r   z+Only support aten.slice along the first dimz#Only support aten.slice with step=1zInvalid start or end for shape=z and block_size=zD. Make sure start and end align with block boundary. Received start=z, end=.r$   )re   rf   r+   rG   rH   r"   r   r   r*   r   r   rh   r   )rY   rZ   r[   r\   rc   r=   startendstepr+   strider   r   r"   r   r   r   r`      s(    ,)(rG   r   r   torch.serializationr   torch.utils._python_dispatchr   torchao.utilsr   r   quant_utilsr   r	   r
   r   opsatenc10d_functional_c10d_functional	functoolsr   r   r   r   
implementsrX   defaultr`   _to_copylerpScalarr@   all_gather_into_tensorwait_tensordetach_optim_state_4bit_c10d_opsappend_wrap_tensor_autogradri   slicer   r   r   r   <module>   sN   

E






	
!