o
    it*                     @   st  d dl mZ d dlZd dlmZ G dd deZG dd deZG dd deZG d	d
 d
eZG dd deZ	G dd deZ
G dd deZG dd deZdefddZdefddZejejejdejfdededededededed ed!edefd"d#Zd$ejjjjfd%d&ZG d'd( d(eZd)ejdefd*d+Zd,ejd)ejd-edefd.d/Zd0ejdej fd1d2Z!dS )3    )IntEnumNc                   @      e Zd ZdZdZdS )Majorr      N)__name__
__module____qualname__KMN r   r   T/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/cute/mma_sm100_desc.pyr          r   c                   @   r   )ScaleInr   r   N)r   r   r   OneNegr   r   r   r   r      r   r   c                   @   r   )Saturater   r   N)r   r   r   False_True_r   r   r   r   r      r   r   c                   @      e Zd ZdZdZdZdS )CFormatr   r      N)r   r   r   F16F32S32r   r   r   r   r          r   c                   @   r   )F16F32Formatr   r   r   N)r   r   r   r   BF16TF32r   r   r   r   r   %   r   r   c                   @   r   )S8Formatr   r   N)r   r   r   UINT8INT8r   r   r   r   r   +   r   r   c                   @       e Zd ZdZdZdZdZdZdS )MXF8F6F4Formatr   r            N)r   r   r   E4M3E5M2E2M3E3M2E2M1r   r   r   r   r"   0       r"   c                   @   s   e Zd ZdZdZdZdZdS )MaxShiftr   r   r   r#   N)r   r   r   NoShift	MaxShift8
MaxShift16
MaxShift32r   r   r   r   r,   8   s
    r,   returnc                 C   s~   | t ju rtjS | t ju rtjS | t ju rtjS | t j	u r tj
S | t ju r(tjS | t ju r0tjS | t ju r8tjS td| )zJ
    Map a CUTLASS scalar class to the 3-bit encoding for Matrix A/B.
    z)Unsupported CUTLASS scalar type for A/B: )cutlassInt8r   r    Uint8r   Float16r   r   BFloat16r   TFloat32r   FloatE4M3FNr"   r&   	FloatE5M2r'   	TypeErrorcutlass_typer   r   r   to_UMMA_formatD   s   






r=   c                 C   s>   | t ju rtjS | t ju rtjS | t ju rtjS td| )zG
    Map a CUTLASS scalar class to the 2-bit accumulator encoding.
    z1Unsupported CUTLASS scalar type for accumulator: )	r2   r5   r   r   Float32r   Int32r   r:   r;   r   r   r   to_C_format]   s   


r@   FMNa_majorb_majora_negb_negc_sat	is_sparse	max_shiftc                 C   sT  t t| }t t|}t t|}|dvrtd|dk s&|dks&|d@ r*td|d? }|d? }d	}|d	O }|t |
d
@ d> O }|t |	d
@ d> O }||d@ d> O }||d@ d> O }||d@ d> O }|t |d
@ d> O }|t |d
@ d> O }|t |d
@ d> O }|t |d
@ d> O }||d@ d> O }||d@ d> O }|t |d@ d> O }|d@ S )u   
    Build the 32-bit instruction descriptor for Blackwell MMA.
    All matrix/accumulator **types must be CUTLASS scalar classes** –
    passing integers is forbidden.
    )@         zM must be 64, 128 or 256   rL      u.   N must be a multiple of 8 in the range 8…256r$   r#   r   r   r   
               ?               l    )intr=   r@   
ValueError)a_typeb_typec_typerA   rB   rC   rD   rE   rF   rG   rH   rI   a_fmtb_fmtc_fmtm_dimn_dimdescr   r   r   make_instr_desco   s0   rd   opc              	   C   s`   t | j| j| j| jd | jd | jtjjj	j
jkrtjntj| jtjjj	j
jkr,tjS tjS )Nr   r   )rd   a_dtypeb_dtype	acc_dtype	shape_mnka_major_modecutenvgputcgen05mmaOperandMajorModer	   r   r
   b_major_mode)re   r   r   r   mma_op_to_idesc   s   rq   c                   @   r!   )
LayoutTyper   r   r   r$      N)r   r   r   SWIZZLE_NONESWIZZLE_128B_BASE32BSWIZZLE_128BSWIZZLE_64BSWIZZLE_32Br   r   r   r   rr      r+   rr   swizzlec                 C   s   t | }||dd |d }dd |dD \}}}|dkr8|dkr+td	tjtjtjtjd
| S |dkrI||fdkrFtdtj	S td)N<r   >c                 S   s   g | ]}t |qS r   )rY   ).0xr   r   r   
<listcomp>   s    z _layout_type.<locals>.<listcomp>,r$   r#   u/   Unexpected swizzle shift – want S==3 for M==4)r   r   r   r#   r%   )r   r   z.Only Swizzle<2,5,2> supported for 128B_BASE32Bz3Unsupported swizzle triple for UMMA smem descriptor)
strindexsplitrZ   rr   rt   rx   rw   rv   ru   )ry   swz_strinsideBrA   Sr   r   r   _layout_type   s$   r   layoutmajorc              
   C   s   t |}d}d}d}tjdtjdtjdtjdtjdi| }|tju r|tju r)dnd}t	
| ||f}	t	|	ds=td|	jd d }
|tjurQ|
dkrQtd|	jd d }||kr`td|	jd d |	jd d }}|tju rz||}}n^||}}nX|tjkrtd	t	| jd d dkstd
t	
| d}	t	|	dstd|	jd d }
|
|krtd|	jd d }|tjur|dkrtd|	jd d }||}}d}||d@ d> O }||d@ d> O }||d@ d> O }||d@ d> O }||d@ d> O }|t|d@ d> O }|d@ S )z
    Convert a 2-D *shared-memory* Cute layout into the Blackwell 64-bit
    smem-descriptor, without the smem start address.
    layout must correspond to layout of an uint128 tensor.
    r   r   r   r$   rM   )r   r   r   z9Not a canonical UMMA_MN Layout: Expected profile failure.z8Not a canonical UMMA_MN Layout: Expected stride failure.z+SWIZZLE_128B_BASE32B is invalid for Major-Kz>Not a canonical UMMA_K Layout: Expected MN-size multiple of 8.)rM   r   z8Not a canonical UMMA_K Layout: Expected profile failure.z7Not a canonical UMMA_K Layout: Expected stride failure.i?  rS       r#   .   rN   1   4   =   l    )r   rr   rt   rx   rw   rv   ru   r   r
   rk   logical_divideis_congruentrZ   stridesizeshaperY   )r   ry   r   layout_typeVERSIONLBO_MODEBASE_OFFSETswizzle_atom_mn_sizeswizzle_atom_k_sizecanonical_layout	stride_00	stride_10	stride_01	stride_11stride_byte_offsetleading_byte_offsetrc   r   r   r   make_smem_desc_base   sd   



r   
start_addrc                 C   s   |   d@ d? S )Ni r$   )toint)r   r   r   r   make_smem_desc_start_addr  s   r   )"enumr   r2   cutlass.cuterk   r   r   r   r   r   r   r"   r,   rY   r=   r@   r   r   r-   boolrd   rl   rm   rn   MmaOprq   rr   Swizzler   Layoutr   Pointerr?   r   r   r   r   r   <module>   sX   	

4I