o
    i                     @   s`  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlmZm	Z	 ej
jZej
jZej
jZejZdedefdd	ZG d
d deZeejjdd Zeejjdd Zeejjdd Zeejjdd ZejjejjejjejjejjgZe	dreej j eedd Zeej!jdd Zeej"jdd Zeeg dS )    N)Tensor)add_safe_globals)return_and_correct_aliasing)TorchAOBaseTensortorch_version_at_leastinput
block_sizec                 C   s`   | j }| d|} |  ddttj }| |dd } | 	td}|||fS )Ng-q=   )
shapeviewabsamaxcliptorchfinfoDTYPEmaxto)r   r   r   scalecodes r   N/home/ubuntu/.local/lib/python3.10/site-packages/torchao/optim/subclass_fp8.pyquantize_fp8   s    r   c                   @   sz   e Zd ZddgZededefddZdedefddZdd Ze			dd
dZ
dddZe	ddefddZdd Zd	S )OptimStateFp8r   r   c                 C   s   t j| |j|jdS )Ndevice)r   _make_wrapper_subclassr   r   )clsr   r   r   r   r   __new__%   s   zOptimStateFp8.__new__c                 C   s>   |j tu sJ |jdksJ || _|| _| |  | _dS )a  Create quantized FP8 optimizer state.

        Args
            codes: quantized FP8 E4M3FN data. Has the same shape as the original float tensor.
            scale: scale data for block-wise quantization.

        NOTE: To get block-wise scale, the original float tensor is first reshape to (-1, block_size).
        Thus, the last dimension of the original float tensor is not necessarily divisible by block size.
        Given `codes` and `scale`, `block_size` is calculated as `codes.numel() // scale.numel()`.
        r
   N)dtyper   ndimr   r   numelr   )selfr   r   r   r   r   __init__)   s
   zOptimStateFp8.__init__c                 C   s
   | j g fS Ntensor_attrsr#   r   r   r   __tensor_flatten__:   s   
z OptimStateFp8.__tensor_flatten__Nc                    s"   | g  fdd| j D |R  S )Nc                    s   g | ]} | qS r   r   ).0nametensor_data_dictr   r   
<listcomp>B   s    z6OptimStateFp8.__tensor_unflatten__.<locals>.<listcomp>r&   )r   r-   tensor_attributes
outer_sizeouter_strider   r,   r   __tensor_unflatten__=   s
   z"OptimStateFp8.__tensor_unflatten__c                 C   sF   | j  }|d| j| jdd }|d ur||}|| j jS )Nr	   r
   )r   floatr   r   r   r   r   )r#   output_dtype
float_datar   r   r   
dequantizeE   s
   

zOptimStateFp8.dequantize   r   c                 C   s0   t j|t|d}t j| | |d}| ||S )N)r    r   r   )r   zerosr   r"   )r   r   r   r   r   r   r   r   r   r8   M   s   
zOptimStateFp8.zerosc              
   C   s2   | j j d| j dt| j d| j d| j d
S )Nz(block_size=z, shape=z	, device=z, requires_grad=))	__class____name__r   tupler   r   requires_gradr(   r   r   r   __repr__S   s   zOptimStateFp8.__repr__)NNr%   )r7   N)r;   
__module____qualname__r'   staticmethodr   r   r$   r)   classmethodr2   r6   intr8   r>   r   r   r   r   r   "   s    
r   c                 C   s   |d }|d }t |tr*t |tr*|j|jksJ |j|j |j|j |S t |trEt||j\}}|j| |j| |S ||  |S )Nr   r
   )
isinstancer   r   r   copy_r   r   r6   )functypesargskwargsdstsrcr   r   r   r   r   _Z   s   

rL   c                 C   s@   | dd }t|d jj|d|d jj|d}t| |||S )Nr   r   r   )getr   r   r   r   r   )rF   rG   rH   rI   r   outr   r   r   rL   o   s   c                 C   s   dd |D }| |i |S )Nc                 S   s"   g | ]}t |tr| n|qS r   )rD   r   r6   )r*   xr   r   r   r.   |   s   " z_.<locals>.<listcomp>r   rF   rG   rH   rI   r   r   r   rL   z   s   c                 C   s   |\}}t |j||jS r%   )r   r   r   r   )rF   rG   rH   rI   rO   r   r   r   r   rL      s   z
2.11.0.devc                 C   sf   |d }t |tstdt| t| |jg|dd  R i || |jg|dd  R i |S )Nr   z$expecting a OptimStateFp8 but found r
   )rD   r   
ValueErrortyper   r   )rF   rG   rH   rI   rO   r   r   r   rL      s   
c                 C   s   |d j  o|d j S )Nr   )r   	is_pinnedr   rP   r   r   r   rL      s   c              
   C   s   |d d \}}}}t |dkr|d nd}|dkrtd|dkr&td|j}	t|jdd  }
||
 |	 dksC||
 |	 dkrUtd|j d|	 d| d	| d
	t|j|| |j||
 |	 ||
 |	  S )N   r
   r   z+Only support aten.slice along the first dimz#Only support aten.slice with step=1zInvalid start or end for shape=z and block_size=zD. Make sure start and end align with block boundary. Received start=z, end=.)	lenrQ   r   mathprodr   r   r   r   )rF   rG   rH   rI   rO   dimstartendstepr   strider   r   r   rL      s(    )#rW   r   r   torch.serializationr   torch.utils._python_dispatchr   torchao.utilsr   r   opsatenc10d_functional_c10d_functionalfloat8_e4m3fnr   rC   r   r   
implementsrE   defaultrL   _to_copylerpScalarr   all_gather_into_tensorwait_tensordetach_optim_state_fp8_c10d_opsappend_wrap_tensor_autogradrS   slicer   r   r   r   <module>   sF   8








