o
    پi                     @   sB  d dl mZmZmZ ddlZddlmZ ddlm	Z	 e
eZe
eZe
eZe
dZe
dZe
dZe
dZe
d	Ze
d
Ze
dZejdd Zejdd Zejdd Zejdd Zejdd Zejdd Zejdd Zejd$ddZejdd Zejd%dd Zejd!ej
fd"d#Z dS )&   )MAX_FINITE_FLOAT8E4B8MAX_FINITE_FLOAT8E4NVMAX_FINITE_FLOAT8E5    N)cuda_capability_geqg      ?g     @i%I7i%I;i;i%I?i 7c                 C      | t t jkr
tS | t t jkrtS | t t jkrtS | t t jkr(t	S | t t j
kr2tS t t d|  d d S NFz not supported in flexpoint)tl	constexprfloat8e5TL_MAX_FINITE_FLOAT8E5
float8e4nvTL_MAX_FINITE_FLOAT8E4NV
float8e4b8TL_MAX_FINITE_FLOAT8E4B8float8e4b15TL_MAX_FINITE_FLOAT8E4B15float16TL_MAX_FINITE_FLOAT16static_assertdtype r   ]/home/ubuntu/.local/lib/python3.10/site-packages/triton_kernels/numerics_details/flexpoint.py
max_finite      r   c                 C   r   r   )r	   r
   r   TL_RCP_MAX_FINITE_FLOAT8E5r   TL_RCP_MAX_FINITE_FLOAT8E4NVr   TL_RCP_MAX_FINITE_FLOAT8E4B8r   TL_RCP_MAX_FINITE_FLOAT8E4B15r   TL_RCP_MAX_FINITE_FLOAT16r   r   r   r   r   rcp_max_finite'   r   r!   c                 C   V   t tddd t | jt jkd t |jt jkd t jdd| |gt jddd	S )
ag  Wrapper for min.NaN.xorsign.abs.f32 PTX instruction.

    Computes the minimum of the absolute values of the two inputs and sets its sign to the XOR of the signs of the inputs.
    NaN inputs are propagated to the output.

    Requires CUDA compute capability 8.6+ (A100 and A30 Ampere GPUs don't support it, but A40/A16/A10/A2, Ada, and Hopper GPUs do).
          z=min.NaN.xorsign.abs.f32 requires CUDA compute capability 8.6+z/min.NaN.xorsign.abs.f32 requires float32 inputsz/{
    min.NaN.xorsign.abs.f32 $0, $1, $2;
    }=r,r,rT   r   is_purepackr	   r   r   r   float32inline_asm_elementwiseabr   r   r   sm86_min_nan_xorsign_abs_f327      	r0   c                 C   r"   )
ag  Wrapper for max.NaN.xorsign.abs.f32 PTX instruction.

    Computes the maximum of the absolute values of the two inputs and sets its sign to the XOR of the signs of the inputs.
    NaN inputs are propagated to the output.

    Requires CUDA compute capability 8.6+ (A100 and A30 Ampere GPUs don't support it, but A40/A16/A10/A2, Ada, and Hopper GPUs do).
    r#   r$   z=max.NaN.xorsign.abs.f32 requires CUDA compute capability 8.6+z/max.NaN.xorsign.abs.f32 requires float32 inputsz/{
    max.NaN.xorsign.abs.f32 $0, $1, $2;
    }r%   Tr&   r'   r*   r-   r   r   r   sm86_max_nan_xorsign_abs_f32P   r1   r2   c                 C   s   | d u rdS t | S )N      ?)r	   load)	scale_ptrr   r   r   
load_scalei   s   r6   c                 C   s   t |}| tj| S N)r6   tor	   r+   )xr5   scaler   r   r   flex_to_floatn   s   r;   c                 C   s   t | |}t | |}|S r7   )r	   minimummaximum)r9   limitresr   r   r   clipt   s   r@   c                 C   sT   t ddrt| |t}|jtjddd@ }|S | jtjddd@ }t||}|S )Nr#   r$   Tbitcasti)r   r	   reducer2   r8   uint32max)r9   axisx_absmaxmasked_abs_xr   r   r   nan_propagating_absmax_reduce{   s   
rI   c                 C   sP   t tj| dd}t|djtjdd}t|jj}t	||jtjdddS )NTcan_reorderi  rA   gKH9)
rI   r	   ravelr<   r8   r+   r!   r   
element_tyfma)r9   OutrG   RCP_MAX_VALUEr   r   r   compute_scale   s   rQ   returnc                 C   s*   |d urt | |}tj||dd d S d S )Nrelaxed)sem)rQ   r	   
atomic_max)r9   r5   rO   r:   r   r   r   update_scale   s   
rV   saturate_infsc                 C   s   |d ur|j  rdt| }nd| }nd}|d urG| jtjdd}tdtj}	|d ur6t|||	}ttj	|ddd}
t
||
 |d urV|d urVt|| d} t| || | | } |d urq|rqt|j j}t| |} | S )Nr3   TrA   g        rJ   r   )r   is_ptrr	   r4   r8   int32castwherexor_sumrL   
atomic_addrV   r   rM   r@   )r9   expected_scale_ptr_or_valactual_scale_ptrchecksum_scale_ptrmaskrO   rW   invscalex_int32zerochecksum_local
CLIP_VALUEr   r   r   float_to_flex   s,   



rg   r7   )rR   N)!numericsr   r   r   tritontriton.languagelanguager	   triton_kernels.target_infor   r
   r   r   r   r   r   r   r   r   r   r    jitr   r!   r0   r2   r6   r;   r@   rI   rQ   rV   rg   r   r   r   r   <module>   sL    


















