o
    پi{                     @   s   d dl mZmZmZ d dlZd dlmZ ejdej	fddZ
ejdej	dej	fdd	Zd
d Zdd Zejdd Zejdd ddd Zejeeddej	dej	dej	dej	dej	f
ddZdS )    )
load_scalefloat_to_flexupdate_scaleN
clip_lowerc                 C   s"   t | |}|rt | |}|S )N)tlminimummaximum)xlimitr   res r   Y/home/ubuntu/.local/lib/python3.10/site-packages/triton_kernels/swiglu_details/_swiglu.pyclip   s   r   
BLOCK_SIZENUM_THREADSc                 C   s(   t jt jt | ||| gddddS )NT)can_reorder   )axis)r   maxreshapeabs)r	   r   r   r   r   r   thread_local_absmax   s   (r   c                    sV   | j | j dd dfdddD }d fdddD }d	| d
| S )Nc                 S   s   d| v rdS | S )Nu8mxfp4r   )dtyper   r   r   <lambda>   s    zswiglu_repr.<locals>.<lambda>r	   c                    s"   g | ]} | d d  qS )r   Nr   .0i)convert_dtype	signaturer   r   
<listcomp>   s   " zswiglu_repr.<locals>.<listcomp>)OutAc                    s   g | ]} |  qS r   r   r   )	constantsr   r   r!      s    )BLOCK_MBLOCK_N_swiglu__)r    r$   join)specializationdtypesblocksr   )r$   r   r    r   swiglu_repr   s   r-   c                 C   sn   |d |d }}t  }|j d| d| d|d< |d |d }}| |  | |   |d	< |S )
NMNz [M = z, N = ]namer#   r"   bytes)dictr1   numelelement_size)gridkernelargsr.   r/   retr#   r"   r   r   r   swiglu_launch_metadata   s   $r:   c                 C   sr   |  tj| } |d urt| |dd} | tj| }|d ur&t||dd}| dt| |    }t|||S )NF)r   Tr   )tor   float32r   expfma)gelulinearscalealphar
   sr   r   r   compute_swiglu%   s   rD   c                 C   s   dS )N_swiglur   )r(   r   r   r   r   1   s    r   )reprc                 C   s<   t t | | jd | jd d df\}}t||d||S )Nr   r      g      ?)r   splitr   shaperD   )inputrB   r
   r?   r@   r   r   r   
_swiglu_fn1   s   ,rK   )rF   launch_metadatar
   r%   r&   EVEN_Nflexpoint_saturate_infc           )   	   C   s  |d urt |}|| d | }t t jj gdt j}t|}t|}t jt 	d|| t 
dddD ]}|| }|| }|| t d| }|| t d| }||k }||k }|| t dd| d  }||k } t | dg} |d | t dd|  }|d d d f |	 |d d d f |
  }!|rt j||! |d d d f dd}"n3|| | |krt j||! |d d d f dd}"n|d d d f | d d d f @ }#t j||! |#dd}"t t |"||df\}$}%t|$|%|||}&|d urt|&|&jt jj }'t ||'}t|&|d |d | |}&|r |d d d f n|d d d f |d d d f @ }(t | |d d d f |  |d d d f |  |&|( q7t|||  d S )Nr   g        r   rG   )
num_stages   )maskother)r   loadfullextracudanum_threadsr<   r   range
program_idnum_programsarangemax_constancyrH   r   rD   r   r4   r   r   storer   ))r"   OutExpectedScaleOutActualScaleOutChecksumScaler#   AScalerB   r.   r/   	stride_am	stride_anstride_outmstride_outnr
   NTokensr%   r&   rM   M_BLOCKSN_BLOCKSrN   	local_maxa_scaleout_expected_scalepidpid_mpid_noff_moff_nmask_mmask_npacked_off_npacked_mask_npacked_offsa_packedpacked_maska_gelua_linearoutabsmaxrQ   r   r   r   rE   7   sH   
(("" 
68rE   ))triton_kernels.numerics_details.flexpointr   r   r   tritontriton.languagelanguager   jit	constexprr   r   r-   r:   rD   rK   rE   r   r   r   r   <module>   s2    		

