o
    iQ                     @   s  d dl mZmZmZ d dlZd dlZdZdZdZdZ	ddd	d
ddddddd
dddddddddddddd
dddddd
ddddddddZ
d:dee fdd Zd!Z	"d;d#ed$ee d%ee d&eeejef  fd'd(Z	d:d)ejd*ejd+ejd%ee dee d&ejfd,d-Zd)ejd*ejd+ejd$ee d%ee dee fd.d/Z	d:d$ee d%ee d0edee fd1d2Z	"d;d#ed3ee d&eeejef  fd4d5Z	d:d3ee dee fd6d7Z	d:d)ejd*ejd+ejd3ee dee f
d8d9ZdS )<    )ListOptionalUnionNg      ?         g vCg 
`Cg   .YvBg(\?gq=
ףp?)bf16_peak_topsfp8_peak_topspeak_mem_bw_bytes_secpct_achievable_gemm_topspct_achievable_mem_bwg sCg s/Cg s?Cg   B)r   r	   fp4_peak_topsr
   r   r   g mvHCg=
ףp=?gzG?g ֒Cg ֒"Cg   xHBg  @Bg  @Bg  @Cg  BwC)r   r	   r   r
   )zNVIDIA H100zNVIDIA B200zNVIDIA B300 SXM6 ACzNVIDIA GB200zAMD Instinct MI300XzNVIDIA GeForce RTX 5090gpu_namec                 C   s   | d u r
t jd} t|  S )Nr   )torchcudaget_device_namegpu_name_to_specs)r    r   [/home/ubuntu/.local/lib/python3.10/site-packages/torchao/testing/training/roofline_utils.py	get_specsa   s   r   g>Ftensor_rolefloat8_recipe_namemx_recipe_namereturnc                    s  || }d}|dkrC|dkr)|rd}	nt | }	t | t|  }
|
}|	d|
|g}nP|r.d}	nt | }	t | dt |  }
|	d|
g}n6|dkr|dkrq|rTdt|  }	nt | t|  }	t | d }t | t|  }
|	||
g}n|r~dt|  t |  }	nt | t|  t |  }	t | t|  }|	|g}n|dkr|dv r|rdt|  }	nt | t|  }	|	g}n|dkrt | t|  }	d}t | t|  }
|	||
g}nJ d
|dkr|rdt|  }	nt | t|  }	|	g}n|dkr/|dv r|rdt|  }	nt | t|  }	t | t|  }n|dkr&t | t|  }	t| d }nJ d
|	|g}nJ|dv s;J d||dkr[|rJdt|  }	nt | t|  }	t | t|  }n|redt|  }	nt | t|  }	t | t|  }|	|g} fdd|D }dd |D }|S )a/  
    Calculates the roofline estimate of casting one of the gemm inputs
    (input, weight or grad_output) to float8 in fwd+bwd.

    Inputs: dim0 and dim1 (shape), tensor_role (input|weight|grad_output), recipe names
    Outputs: list of read/write traffic overhead in seconds, one for each kernel
    N
tensorwiseweightr   r   rowwiserowwise_with_gw_hp)inputgrad_outputFunsupported mxfp8_32x32_flexible_gemm_layoutmxfp8_32x32_weight)mxfp8_emulatedmxfp8_cublasmxfp8_cublas_rceilmxfp4_cutlasszunsupported mx_recipe_name=r&   c                        g | ]}| d    d  qS r
   r   r   .0xspecsr   r   
<listcomp>      z4get_tensor_memory_traffic_ovhd_s.<locals>.<listcomp>c                 S      g | ]}t |tqS r   sympyMaxKERNEL_LAUNCH_OVERHEAD_SECr)   r   r   r   r.   !      )BYTES_PER_EL_BF16BYTES_PER_EL_FLOAT8BYTES_PER_EL_FLOAT4)r-   dim0dim1r   r   r   fuse_with_prevnumel	res_byteskernel_1_rwkernel_3_rwkernel_4_rwkernel_2_rwres_sr   r,   r    get_tensor_memory_traffic_ovhd_sm   s   







rC   MKNc                 C   sx  t |}d|  | | }|tju r|d }n|tjtjfv r#|d }n|tju r-|d }nJ d| || |d  }	| | ||  }
| | }|d urw|dsXJ d	| |tjtjtjfv sfJ d
|drmdnd}|
| }|
| }
|tju r|
t |t  }n&|tjtjfv r|
t |t  }n|tju r|
t	 |t  }nJ d| ||d  |d  }t
|	|tS )Nr   r   r	   r   Fzunsupported dtype: r   )mxfp8mxfp4nvfp4zUnsupported recipe r    mx       r
   r   )r   r   bfloat16float8_e4m3fnfloat8_e5m2float4_e2m1fn_x2
startswithr6   r7   r8   r2   r3   r4   )rD   rE   rF   dtyper   r   r-   gemm_ops	peak_topscompute_gemm_time_s	num_reads
num_writes
block_sizenum_scale_readsbytes_rwmem_gemm_time_sr   r   r   get_individual_gemm_time_sympy&  sH   	






r\   c                 C   sd   |||}}}	|dkrt j}	t| |||||}
t| |||||}t|| ||	||}|
| | }|S )Nr   )r   rM   r\   )rD   rE   rF   rR   r   r   r   gemm_dtype_inputgemm_dtype_grad_inputgemm_dtype_grad_weightgemm_output_time_sgemm_grad_input_time_sgemm_grad_weight_time_stotalr   r   r   get_gemm_time_sympy^  s"   
rd   enable_fusion_modelingc              	   C   sb   t |}t|| |d|||d}t|||d||dd}	t|| |d|||d}
tg ||	|
}|S )Nr   )r   r   r   r;   r   Fr   )r   rC   sum)rD   rE   rF   r   r   re   r   r-   fwd_fp8_input_memfwd_fp8_weight_memgi_fp8_grad_output_memresr   r   r   get_float8_mem_sympy  s<   			
rk   recipe_namec                    st  |du sJ d|dksJ d|| }d}h d}| dkr1 t | }	t | t|  }
|	|
g}nw dkrH t | t|  }	|	t| 7 }	|	g}n` }|rh|d	rh t | t|  }	|	t| |d
  7 }	|	g}n@}|r|dsu|drt | t|  }	|dr|	t7 }	|drd
nd}|	t| ||  7 }	|	g}n	 td| d|  fdd|D }dd |D }|S )z
    Inference version of `get_tensor_memory_traffic_ovhd_s`.
    The only thing happening here is we quantize the activation.
    Fr    r   z*inference only quantizes input activationsN>   mxfp4*mxfp8*nvfp4*r   r   r   r   rG   rK   rH   rI   rL   zUnknown recipe name: z. Allowed recipes: c                    r'   r(   r   r)   r,   r   r   r.     r/   z>get_inference_tensor_memory_traffic_ovhd_s.<locals>.<listcomp>c                 S   r0   r   r1   r)   r   r   r   r.     r5   )r6   r7   BYTES_PER_EL_FLOAT32rQ   r8   
ValueError)r-   r9   r:   r   rl   r;   r<   r=   allowed_recipesr>   r?   namerX   rB   r   r,   r   *get_inference_tensor_memory_traffic_ovhd_s  sH   




rt   c                 C   s,   t |}t|| |d|dd}tg |}|S )Nr   F)r   rl   r;   )r   rt   rf   )rD   rE   rF   rl   r   r-   rg   rj   r   r   r   get_inference_float8_mem_sympy  s   ru   c                 C   s   t | |||||}|S N)r\   )rD   rE   rF   rR   rl   r   r`   r   r   r   get_inference_gemm_time_sympy  s   
rw   rv   )F)typingr   r   r   r2   r   r8   r7   r6   rp   r   strr   r4   SymbolfloatrC   r\   rd   boolrk   rt   ru   rw   r   r   r   r   <module>   s   P	

 @
8
)
<
U
