o
    im                     @   s  d dl mZmZmZmZ d dlZd dlmZ d dl	m
Z
mZ d dlmZ dZej o0ejjduZejejejejhZe 	d+dejdejd	efd
dZe ddejdfdejdededee dejf
ddZ e ddejddfdejdejdededee d	edejfddZ!dejdejfddZ"dejdejdejfddZ#dejdejdeedf fddZ$dd  Z%d!ed"edefd#d$Z&dejd%eeee f dejfd&d'Z'd(ejfd)d*Z(dS ),    )IterableOptionalTupleUnionN)AsyncCollectiveTensor
all_reduce)ScalingGranularityg-q=Famaxfloat8_dtyperound_scales_to_power_of_2c                 C   sZ   |  tj} |tv rt|jtj| td }| tj}nt	d| |r+t
|}|S )zConverts the amax value of a tensor to the fp8 scale.
    Args:
        amax: The amax value of the tensor.
        float8_dtype: The float8 dtype.
        round_scales_to_power_of_2: if true, round scaling factor down to the nearest power of 2.
    )minUnsupported float8_dtype: )totorchfloat64	FP8_TYPESfinfomaxclampEPSfloat32
ValueError_round_scale_down_to_power_of_2)r	   r
   r   res r   O/home/ubuntu/.local/lib/python3.10/site-packages/torchao/float8/float8_utils.pyamax_to_scale   s   r   xreduce_amaxscaling_granularityaxiswise_dimreturnc                 C   s   |t ju rtt| }n|t ju sJ d|d usJ dtjt| |dd}|rWt rW|d ur8|	 nd }|d u rFt
tt n|}t|d|}t|trW| }|S )NunsupportedT)dimkeepdimMAX)r   
TENSORWISEr   r   absAXISWISEr	   distis_initialized	get_grouplistrangeget_world_sizer   
isinstancer   wait)r   r   device_meshr   r    r	   pggroupr   r   r   tensor_to_amax8   s   

r4   	hp_tensorc                 C   s   t | ||||}t|||dS )a  
    Compute scaling factor for the given high precision tensor.

    Args:
        hp_tensor: high precision tensor
        float8_dtype: the float8 dtype to use
        reduce_amax: whether to reduce the max(abs(hp_tensor)) value across distributed ranks
        scaling_granularity: Defines the scaling granularity
        axiswise_dim: if axiswise granularity is used, defines the dim to scale across
        round_scales_to_power_of_2: if true, round scaling factor down to the nearest power of 2.
    )r   )r4   r   )r5   r
   r   r1   r   r    r   r	   r   r   r   tensor_to_scaleU   s   r6   c                 C   s<   |t v rt|j}| j| |d} | |S td| )a  Converts a tensor to a saturated fp8 tensor.

    Note:
        The default behavior in PyTorch for casting to `float8_e4m3fn`
        and `e5m2` is to not saturate. In this context, we should saturate.
        A common case where we want to saturate is when the history of a
        tensor has a maximum value of `amax1`, and the current amax value
        is `amax2`, where `amax1 < amax2`. This is common when using delayed
        scaling.
    )r   r   r   )r   r   r   r   r   r   r   )r   r
   	max_valuer   r   r   to_fp8_saturatedv   s
   
r8   yc                 C   s.   t j| }t j| | }dt ||  S )zComputes the error between two tensors in dB.

    For more details see:
        https://en.wikipedia.org/wiki/Signal-to-noise_ratio

    Args:
        x: The original tensor.
        y: The tensor to compare to the original tensor.
       )r   linalgvector_normlog10)r   r9   PsPnr   r   r   compute_error   s   
r@   tensor.c                 C   sb   |t v rt|j}ntd| | jj| jd}t||k	 
 }|dk	 
 }||fS )zCalculate FP8 tensor stats

    Args:
        tensor: The tensor to calculate stats for.
        float8_dtype: The float8 dtype.

    Returns:
        A tuple containing the number of zeros and the number of max values.
    r   )dtyper   )r   r   r   r   r   _datar   _orig_dtyper'   sumitem)rA   r
   FP8_MAXtensor_orig_typenum_maxnum_zeror   r   r   fp8_tensor_statistics   s   rK   c                 C   s0   t | dks
J d| d | d ko| d dkS )N   z%is_row_major only supports 2D tensorsr      )len)strider   r   r   is_row_major   s   rP   sizealignment_valuec                 C   s   d| d |  | S )a  
    Returns the minimum alignment value that is greater than or equal to the given size.

    Args:
        size: The size of the data to be aligned.
        alignment_value: The alignment value to be used.

    Returns:
        int: The minimum alignment value that is greater than or equal to the given size.

    Usage:
    ```
        >>> _get_min_alignment(10, 8)
        16
    ```
    rM   r   )rQ   rR   r   r   r   _get_min_alignment   s   rS   dimsc                 C   s~   |   dksJ | j\}}t|tr|f}d|v rt|dn|}d|v r)t|dn|}|| }|| }tjj| d|d|fS )aP  
    Pads a 2D tensor with zeros to ensure that its dimensions are multiples of 16, which is required `torch._scaled_mm`

    Args:
        tensor: The tensor to pad.
        dims: Dimensions to pad.

    Returns:
        torch.Tensor: The padded tensor.

    Usage:
    ```
        >>> pad_tensor_for_matmul(torch.randn((10, 10)), dims=0).shape
        torch.Size([16, 10])
        >>> pad_tensor_for_matmul(torch.randn((10, 10)), dims=1).shape
        torch.Size([10, 16])
        >>> pad_tensor_for_matmul(torch.randn((10, 10)), dims=(0, 1)).shape
        torch.Size([16, 16])
    ```
    rL   r      rM   )	r#   shaper/   intrS   r   nn
functionalpad)rA   rT   dim1dim2dim1_aligneddim2_alignedpad_dim1pad_dim2r   r   r   pad_tensor_for_matmul   s   

ra   scalec                 C   s*   | j tjks
J dttt| S )Nzscale must be float32 tensor)rB   r   r   exp2floorlog2)rb   r   r   r   r      s   r   )F))typingr   r   r   r   r   torch.distributeddistributedr)   )torch.distributed._functional_collectivesr   r   torchao.float8.configr   r   cudais_availableversionhipIS_ROCMfloat8_e4m3fnfloat8_e5m2float8_e4m3fnuzfloat8_e5m2fnuzr   no_gradTensorrB   boolr   r&   rW   r4   r6   r8   r@   rK   rP   rS   ra   r   r   r   r   r   <module>   s    


(