o
    ãÊi<  ã                   @   sú   U d dl Z d dlZd dlmZ d dlmZmZ d dlZd dlm	Z	 e 
¡ Zejed< G dd„ de jƒZG dd	„ d	e jƒZeG d
d„ dƒƒZeƒ ZejZejZeddG dd„ dƒƒZeddG dd„ dƒƒZG dd„ de jƒZeddG dd„ dƒƒZdS )é    N)Ú	dataclass)ÚOptionalÚUnion)Úis_MI300Úloggerc                   @   ó    e Zd ZdZdZdZdd„ ZdS )ÚScalingTypezû
    Defines the type of scaling to use for casting to float8.

    Values:

    * ``DYNAMIC``: Compute scaling factor dynamically based on the tensor's values.
    * ``DISABLED``: Skip scaling for this tensor, leave it in its original precision.
    ÚdynamicÚdisabledc                 C   ó    | t ju rdS | t ju sJ ‚dS )NÚdynÚdis)r   ÚDYNAMICÚDISABLED©Úself© r   úI/home/ubuntu/.local/lib/python3.10/site-packages/torchao/float8/config.pyÚ	short_str"   ó   
zScalingType.short_strN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r   r   r   r   r      s
    	r   c                   @   r   )ÚScalingGranularityzþ
    Defines the granularity of scaling strategies for casting to float8.

    Values:

    * ``TENSORWISE``: A single scaling factor for the entire tensor.
    * ``AXISWISE``: Scaling factors computed along one axis of the tensor (rowwise scaling).
    Ú
tensorwiseÚaxiswisec                 C   r   )NÚtenÚaxs)r   Ú
TENSORWISEÚAXISWISEr   r   r   r   r   :   r   zScalingGranularity.short_strN)r   r   r   r   r   r    r   r   r   r   r   r   *   s
    
r   c                   @   s$   e Zd ZdZejZejZdd„ Z	dS )ÚFloat8TypeConfigzÊ
    Configuration for selecting the preferred float8 type pair, either e4m3fn/e5m2 or e4m3fnuz/e5m2fnuz.

    Currently, ROCm supports 1. fnuz variants in MI300. 2. OCP F8 variants in MI350/Navi4.
    c                 C   s8   t jjrt j ¡ rtƒ rt j| _t j| _	d S d S d S d S )N)
ÚtorchÚversionÚhipÚcudaÚis_availabler   Úfloat8_e4m3fnuzÚ
e4m3_dtypeÚfloat8_e5m2fnuzÚ
e5m2_dtyper   r   r   r   Ú__post_init__P   s   þzFloat8TypeConfig.__post_init__N)
r   r   r   r   r"   Úfloat8_e4m3fnr(   Úfloat8_e5m2r*   r+   r   r   r   r   r!   B   s
    r!   T)Úfrozenc                   @   sP   e Zd ZU dZejZeed< ej	Z
eed< dZeej ed< dd„ Zdd	„ ZdS )
Ú
CastConfigaâ  
    Configuration for casting a single tensor to float8.

    Args:
        scaling_type: The type of scaling to use. See :class:`ScalingType`.
            Default: ``ScalingType.DYNAMIC``
        scaling_granularity: The granularity of scaling. See :class:`ScalingGranularity`.
            Default: ``ScalingGranularity.TENSORWISE``
        target_dtype: The target float8 dtype (e.g., ``torch.float8_e4m3fn``).
            Default: ``None`` (will be set based on the recipe)
    Úscaling_typeÚscaling_granularityNÚtarget_dtypec                 C   s2   t dtdi| j }| j ¡ › d| j ¡ › d|› S )NÚe4m3Úe5m2Ú_)r(   r*   r2   r0   r   r1   )r   Údtyper   r   r   r   n   s    zCastConfig.short_strc                 C   sN   | j tju r| jtju sJ dƒ‚| jd u s#| jjr| jjdks%J dƒ‚d S d S )NzGonly dynamic scaling type is supported for axiswise scaling granularityé   z)must specify a 8-bit floating-point dtype)	r1   r   r    r0   r   r   r2   Úis_floating_pointÚitemsizer   r   r   r   r+   r   s   ÿ
ÿÿÿzCastConfig.__post_init__)r   r   r   r   r   r   r0   Ú__annotations__r   r   r1   r2   r   r"   r6   r   r+   r   r   r   r   r/   \   s   
 r/   c                   @   s   e Zd ZU dZdZeed< dS )ÚFloat8GemmConfigzë
    Configuration for a float8 gemm.

    Args:
        use_fast_accum: If True, use fast accumulation in lower precision.
            This can improve performance but may reduce numerical accuracy.
            Default: ``False``
    FÚuse_fast_accumN)r   r   r   r   r<   Úboolr:   r   r   r   r   r;   |   s   
 r;   c                   @   s   e Zd ZdZdZdZdZdS )ÚFloat8LinearRecipeNameaV  
    Pre-made recipes for common float8 training configurations.

    Values:

    * ``TENSORWISE``: Default, dynamic per-tensor scaling with the cuBLAS tensorwise kernel.
      Fastest option.
    * ``ROWWISE``: Dynamic rowwise scaling with the CUTLASS rowwise kernel.
      Uses e4m3 for activations, weights, gradients. Scales are rounded (floor) to
      the nearest power of two for increased accuracy.
    * ``ROWWISE_WITH_GW_HP``: A modification on rowwise scaling with increased accuracy
      for grad_weight by keeping grad_weight computation in high precision. Most accurate option.
    r   ÚrowwiseÚrowwise_with_gw_hpN)r   r   r   r   r   ÚROWWISEÚROWWISE_WITH_GW_HPr   r   r   r   r>      s
    r>   c                   @   sü   e Zd ZU dZeƒ Zeed< dZee ed< eƒ Z	eed< dZ
ee ed< eƒ Zeed< dZee ed< ed	d
Zeed< eƒ Zeed< eƒ Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dd„ Zedeeef dd fdd„ƒZdS )ÚFloat8LinearConfigz]
    Configuration for converting a `torch.nn.Linear` module to float8
    for training.
    Úcast_config_inputNÚ!cast_config_input_for_grad_weightÚcast_config_weightÚ!cast_config_weight_for_grad_inputÚcast_config_grad_outputÚ'cast_config_grad_output_for_grad_weightT)r<   Úgemm_config_outputÚgemm_config_grad_inputÚgemm_config_grad_weightFÚenable_fsdp_float8_all_gatherÚpad_inner_dimÚemulateÚ!force_recompute_fp8_weight_in_bwdÚround_scales_to_power_of_2c                 C   s€  | j d u rt | d| j¡ | jd u rt | d| j¡ | jd u r't | d| j¡ | jjt	j
kr:| jr:J d| jj› ƒ‚| j}| j}| j}| j }| j}| j}||df||df||dffD ]\}}}	|jtju }
|jtju }|
|ksvJ d|	› ƒ‚qZ||d	tf||d
tf||dtffD ]+\}}}}|jd u ršt |d|¡ |jd u r¦t |d|¡ |j|jks³J |› dƒ‚qˆ| jr¾t d¡ d S d S )NrE   rG   rI   zPenable_fsdp_float8_all_gather only supports tensorwise scaling granularity, got ÚoutputÚ
grad_inputÚgrad_weightz#incompatible operand precision for ÚinputÚweightÚgrad_outputr2   z< must be cast to the same dtype in both matmuls it's used inz¨`config.force_recompute_fp8_weight_in_bwd` is deprecated and will be removed in a future release. Please see https://github.com/pytorch/ao/issues/2251 for more details.)rE   ÚobjectÚ__setattr__rD   rG   rF   rI   rH   r1   r   r   rM   r0   r   r   r(   r*   r2   rP   r   Úwarning)r   Úcc_iÚcc_wÚcc_goÚcc_i_gwÚcc_w_giÚcc_go_gwÚcc1Úcc2Ú	gemm_nameÚis_disabled_1Úis_disabled_2Úoperand_nameÚdefault_dtyper   r   r   r+   ø   sb   
ÿ
ÿ
ýÿý
ÿ


ý

ÿÿÿz Float8LinearConfig.__post_init__Úrecipe_nameÚreturnc              	   C   s  t | ƒtkrdd„ tD ƒ}| |v sJ d| › d|› ƒ‚t| ƒ} | tju r'tƒ S | tju rIttjt	d}ttjt	d}ttjt	d}t|||ddS | tj
u rttjd}ttjd}ttjt	d}ttjd}ttjd	}ttjt	d
}t||||||ddS td| › ƒ‚)zÆ
        Input: `Float8LinearRecipeName` value, or a string representing a `Float8LinearRecipeName` value
        Output: a `Float8LinearConfig` configured to implement the specified recipe
        c                 S   s   g | ]}|j ‘qS r   )Úvalue)Ú.0Únr   r   r   Ú
<listcomp>E  s    z7Float8LinearConfig.from_recipe_name.<locals>.<listcomp>zrecipe_name z not in valid names )r1   r2   T)rD   rF   rH   rQ   )r1   )r0   )r0   r2   )rD   rF   rH   rE   rG   rI   rQ   zunknown recipe_name )ÚtypeÚstrr>   r   rC   rA   r/   r   r    r(   rB   r   r   ÚAssertionError)rh   Úvalid_namesr[   r\   r]   r_   r^   r`   r   r   r   Úfrom_recipe_name<  sZ   
ÿ

ÿÿÿû
ÿÿùz#Float8LinearConfig.from_recipe_name)r   r   r   r   r/   rD   r:   rE   r   rF   rG   rH   rI   r;   rJ   rK   rL   rM   r=   rN   rO   rP   rQ   r+   Ústaticmethodr   r>   ro   rr   r   r   r   r   rC   ³   s.   
 D
ÿþrC   )ÚenumÚloggingÚdataclassesr   Útypingr   r   r"   Útorchao.utilsr   Ú	getLoggerr   ÚLoggerr:   ÚEnumr   r   r!   Útype_configr(   r*   r/   r;   r>   rC   r   r   r   r   Ú<module>   s*   
&