o
    ãÊiá  ã                   @   sÖ   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlmZ de
jdejddfd	ejd
ejdedede
dedee dede	fdd„Zdededee fdd„ZejjG dd„ dejjƒƒZdS )z9
Utilities for scaling high precision tensors to float8.
é    )ÚOptionalN)ÚScalingGranularity)Útensor_already_casted_to_fp8)ÚFloat8TrainingTensorÚGemmInputRoleÚLinearMMConfigÚhp_tensor_and_scale_to_float8)Útensor_to_scaleFÚ	hp_tensorÚfloat8_dtypeÚlinear_mm_configÚreduce_amaxÚgemm_input_roleÚscaling_granularityÚaxiswise_dimÚround_scales_to_power_of_2Úreturnc	           
      C   s&   t | ||||||ƒ}	t| |	||||ƒS )a6  
    Given a high precision tensor `hp_tensor`,
    scales `hp_tensor` dynamically and returns a `Float8TrainingTensor` of the result.

    Args:
        hp_tensor: the tensor to convert
        float8_dtype: the float8 dtype to use
        linear_mm_config: Defines the configuration for the scaled_mm for
          the 3 fwd/bwd gemms of linear
        reduce_amax: whether to reduce the max(abs(hp_tensor)) value across distributed ranks
        gemm_input_role: Defines the role of this tensor (input, weight or grad_output) in
          the 3 fwd/bwd gemms of linear
        scaling_granularity: Defines the scaling granularity
        axiswise_dim: if axiswise granularity is used, defines the dim to scale across
        round_scales_to_power_of_2: if true, round scaling factor down to the nearest power of 2.
    )r	   r   )
r
   r   r   r   r   Údevice_meshr   r   r   Úscale© r   úW/home/ubuntu/.local/lib/python3.10/site-packages/torchao/float8/float8_scaling_utils.pyÚhp_tensor_to_float8_dynamic   s"   ù	úr   c                 C   s   |t ju r| S dS )a2  
    Convenience function which takes in an axiswise dim which is only relevant
    for axiswise scaing, and a scaling type.  The output is pass-through
    if scaling type is axiswise, and None otherwise.  This is done to keep the
    logic from choosing the axiswise dim out of the scaling function.
    N)r   ÚAXISWISE)r   r   r   r   r   Úget_maybe_axiswise_dimK   s   

r   c                   @   s4   e Zd ZdZededejfdd„ƒZedd„ ƒZ	dS )	ÚNoopFwToFloat8BwDynamiczR
    Forward: no-op
    Backward: convert to float8_e5m2 with dynamic scaling
    r   Útarget_dtypec                 C   s   || _ || _|S ©N)r   r   )ÚctxÚtensorr   r   r   r   r   Úforwarda   s   zNoopFwToFloat8BwDynamic.forwardc                 C   s>   t |ƒr	|d d fS t|| jƒ}t||| j| jtjƒ}|d d fS r   )r   r	   r   r   r   r   ÚGRAD_OUTPUT)r   ÚgradYÚgradY_scaleÚ
fp8_tensorr   r   r   Úbackwardl   s   
û
z NoopFwToFloat8BwDynamic.backwardN)
Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ústaticmethodr   ÚtorchÚdtyper   r$   r   r   r   r   r   Z   s    ýü
r   )r(   Útypingr   r*   Útorchao.float8.configr   Ú torchao.float8.distributed_utilsr   Ú%torchao.float8.float8_training_tensorr   r   r   r   Útorchao.float8.float8_utilsr	   ÚINPUTÚ
TENSORWISEÚTensorr+   ÚboolÚintr   r   Ú_dynamoÚallow_in_graphÚautogradÚFunctionr   r   r   r   r   Ú<module>   sR   
÷ÿþýüûùø	÷

ö.ÿþ
ý