o
    i                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlmZm	Z	 ddl
mZmZmZmZ ejjZejjZejjZd dlmZ edd	d
d Zedd	dd ZG dd deZeejjdd Zeejjdd Zeejjdd Zeejjdd Zej jej jej!jej!jej"jgZ#e	dre#$ej%j ee#dd Zeej&jdd Zeej'jdd Zeeg dS )    N)Tensor)add_safe_globals)return_and_correct_aliasing)TorchAOBaseTensortorch_version_at_least   )create_dynamic_mapdequant_with_qmapquantize_8bit_with_qmapscale_tensor)	lru_cache)maxsizec                   C   
   t ddS )NTsignedr    r   r   O/home/ubuntu/.local/lib/python3.10/site-packages/torchao/optim/subclass_8bit.pyget_qmap_signed      
r   c                   C   r   )NFr   r   r   r   r   r   get_qmap_unsigned#   r   r   c                	   @   s   e Zd Zg dZededededefddZdedededefdd	Zd
d Z	e
	dddZdddZe
ddedefddZdd ZdS )OptimState8bit)codesscaleqmapr   r   r   r   c                 C   s   t j| |j|jdS )Ndevice)r   _make_wrapper_subclassshaper   )clsr   r   r   r   r   r   r   __new__+   s   zOptimState8bit.__new__c                 C   s\   |j tju sJ |jdksJ |j tju sJ || _|| _|| _|| _|	 |	  | _
dS )a  Create quantized 8-bit optimizer state as proposed in https://arxiv.org/abs/2110.02861

        Args
            codes: quantized 8-bit data stored as uint8. Has the same shape as the original float tensor.
            scale: scale data for block-wise quantization.
            qmap: lookup table that maps between quantized value (code) and float value.
            signed: whether the tensor is signed or unsigned.

        NOTE: To get block-wise scale, the original float tensor is first reshape to (-1, block_size).
        Thus, the last dimension of the original float tensor is not necessarily divisible by block size.
        Given `codes` and `scale`, `block_size` is calculated as `codes.numel() // scale.numel()`.
        r   N)dtypetorchuint8ndimfloat32r   r   r   r   numel
block_size)selfr   r   r   r   r   r   r   __init__/   s   zOptimState8bit.__init__c                 C   s   | j | jgfS N)tensor_attrsr   r(   r   r   r   __tensor_flatten__E   s   z!OptimState8bit.__tensor_flatten__Nc                    s"   | g  fdd| j D |R  S )Nc                    s   g | ]} | qS r   r   ).0nametensor_data_dictr   r   
<listcomp>M   s    z7OptimState8bit.__tensor_unflatten__.<locals>.<listcomp>)r+   )r   r1   tensor_attributes
outer_sizeouter_strider   r0   r   __tensor_unflatten__H   s
   z#OptimState8bit.__tensor_unflatten__c                 C   s(   t | j| j| j}|d ur||}|S r*   )r	   r   r   r   to)r(   output_dtype
float_datar   r   r   
dequantizeP   s   
zOptimState8bit.dequantizeT   r'   c           	      C   sX   t j|t j|d}t j| | |d}|rt nt }t j|t j|d}| ||||S )N)r!   r   r   )r"   zerosr#   r&   r   r   tensorr%   )	r   r   r   r'   r   r   r   	qmap_listr   r   r   r   r<   V   s
   zOptimState8bit.zerosc                 C   s:   | j j d| j d| j dt| j d| j d| j dS )Nz(signed=z, block_size=z, shape=z	, device=z, requires_grad=))	__class____name__r   r'   tupler   r   requires_gradr,   r   r   r   __repr__^   s   zOptimState8bit.__repr__)NNr*   )Tr;   N)rA   
__module____qualname__r+   staticmethodr   boolr    r)   r-   classmethodr6   r:   intr<   rD   r   r   r   r   r   (   s    
r   c           	      C   s   |d }|d }t |tr0t |tr0|j|jkr|j|jks J |j|j |j|j |S t |trQt||j\}}t||j	}|j| |j| |S ||
  |S )Nr   r   )
isinstancer   r   r'   r   copy_r   r   r
   r   r:   )	functypesargskwargsdstsrc
scaled_srcr   r   r   r   r   _e   s   
rT   c                 C   sX   | dd }t|d jj|d|d jj|d|d jj|d|d j}t| |||S )Nr   r   r   )getr   r   r7   r   r   r   r   )rM   rN   rO   rP   r   outr   r   r   rT   |   s   c                 C   s   dd |D }| |i |S )Nc                 S   s"   g | ]}t |tr| n|qS r   )rK   r   r:   )r.   xr   r   r   r2      s   " z_.<locals>.<listcomp>r   rM   rN   rO   rP   r   r   r   rT      s   c                 C   s$   |\}}t |j||j|j|jS r*   )r   r   viewr   r   r   )rM   rN   rO   rP   rW   r   r   r   r   rT      s   z
2.11.0.devc                 C   sr   |d }t |tstdt| t| |jg|dd  R i || |jg|dd  R i ||j |jS )Nr   z%expecting a OptimState8bit but found r   )	rK   r   
ValueErrortyper   r   r   cloner   )rM   rN   rO   rP   rW   r   r   r   rT      s   
c                 C   s*   |d j  o|d j o|d j S )Nr   )r   	is_pinnedr   r   rX   r   r   r   rT      s
   c              
   C   s   |d d \}}}}t |dkr|d nd}|dkrtd|dkr&td|j}	t|jdd  }
||
 |	 dksC||
 |	 dkrUtd|j d|	 d| d	| d
	t|j|| |j||
 |	 ||
 |	  |j	
 |jS )N   r   r   z+Only support aten.slice along the first dimz#Only support aten.slice with step=1zInvalid start or end for shape=z and block_size=zD. Make sure start and end align with block boundary. Received start=z, end=.)lenrZ   r'   mathprodr   r   r   r   r   r\   r   )rM   rN   rO   rP   rW   dimstartendstepr'   strider   r   r   rT      s,    )(ra   r"   r   torch.serializationr   torch.utils._python_dispatchr   torchao.utilsr   r   quant_utilsr   r	   r
   r   opsatenc10d_functional_c10d_functional	functoolsr   r   r   r   
implementsrL   defaultrT   _to_copylerpScalarrY   all_gather_into_tensorwait_tensordetach_optim_state_8bit_c10d_opsappend_wrap_tensor_autogradr]   slicer   r   r   r   <module>   sN   

=






	
