o
    پiM	                  
   @   s   d dl Z d dlmZ dededefddZdededefdd	Zde jfd
dZde jdedee je jf fddZ	de jdedee je jf fddZ
de jdedee je jf fddZde jdededee je jf fddZdS )    N)Tuplexyreturnc                 C   s   | | d | S )N    r   r   r   r   H/home/ubuntu/.local/lib/python3.10/site-packages/deep_gemm/utils/math.pyceil_div   s   r
   c                 C   s   t | || S )N)r
   r   r   r   r	   align	   s   r   c              	   C   s6   |  d  dksJ tdtt|  S )Nr   g       @)viewamaxitemtorchpowceillog2abs)r   r   r   r	   ceil_to_ue8m0   s   r   	use_ue8m0c           	      C   s   |   dksJ | j\}}t|d}tj||f| j| jdd}| |d d d |f< ||dd}|	 
 jdd|dd}|d }|rMt|n|}|d	|d  tj||d d d |f  |fS )
N      dtypedevicer   r   dim-C6?      |@      ?)r   shaper   r   emptyr   r   fill_r   r   floatr   clampr   	unsqueezetofloat8_e4m3fn
contiguous)	r   r   mnpadded_nx_paddedx_viewx_amaxsfr   r   r	   per_token_cast_to_fp8   s   

":r1   c                 C   s   |   dkr| dd dksJ | j\}}| dd|}|  jddd|d}|d }|r8t|n|}|d	|	d  
tj|||fS )
Nr   r   r   r   r   r   r   r   r    )r   sizer!   r   r   r$   r   r%   r   r&   r'   r   r(   )r   r   r*   r+   r.   r/   r0   r   r   r	   per_channel_cast_to_fp8   s   "
"&r3   c           	      C   s   |   dksJ | j\}}tjt|dt|df| j| jd}| |d |d |f< |dd|dd d}|	 
 jdddd	}|d
 }|rNt|n|}|d|  tj}||d |d |f  ||d|dfS )Nr   r   r   r   r   )r      Tr   keepdimr   r   r    r   )r   r!   r   zerosr   r   r   r   r2   r   r$   r   r%   r   r'   r(   view_asr)   )	r   r   r*   r+   r-   r.   r/   r0   x_scaledr   r   r	   per_block_cast_to_fp8)   s   
$6r:   dimsc                    sr   t  fddt|  D }|   j|ddd}|d }|r't|n|}| d|  t	j
}|| fS )Nc                    s   g | ]
}|t  vr|qS r   )set).0ir;   r   r	   
<listcomp>7   s    z/per_custom_dims_cast_to_fp8.<locals>.<listcomp>Tr5   r   r   r    )tupleranger   r   r$   r   r%   r   r'   r   r(   squeeze)r   r;   r   excluded_dimsr/   r0   r9   r   r?   r	   per_custom_dims_cast_to_fp86   s   rE   )r   typingr   intr
   r   Tensorr   boolr1   r3   r:   rE   r   r   r   r	   <module>   s    $$$
,