o
    پi1                     @   s   d dl mZmZmZmZ d dlmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ eG d
d dZG dd dZdS )    )OptionalTupleDictAny)	dataclassN)Tensor)Int32)from_dlpackmake_ptr)torch2cute_dtype_map)VarlenArguments)TileSchedulerOptionsc                   @   sL   e Zd ZU ee ed< dZee ed< dZee	 ed< dZ
eej ed< dS )GemmTensorInfotensorNdtypemajorcute_tensor)__name__
__module____qualname__r   r   __annotations__r   r   r   strr   cute r   r   L/home/ubuntu/.local/lib/python3.10/site-packages/quack/gemm_wrapper_utils.pyr      s
   
 r   c                   @   s  e Zd ZededededdfddZededeed	f deddfd
dZ	ededeeeef defddZ
e	dAdee dee deeeef dedeej f
ddZe						dBdededee dee deeeef  dee dee dee deeeeeeeef f fddZe	dCdeeef d ed!eddfd"d#Zedeeef ddfd$d%Zedeeef d&eeeeeef f ddfd'd(Zedeeef d&eeeeeef f ddfd)d*Ze			+dDd,ed-ee d.ee d/edef
d0d1Ze	2	dEdee dee dee d,ed3eeeef deeef d4ed5edee fd6d7Zed8d9deeef d:ee d;eeef d3eeeef d5ed<ed=ed>eed	f defd?d@ZdS )FGemmWrapperBaser   namendimreturnNc                 C   sB   |   |kr	| jsJ | d| d| jtv sJ d| d S )Nz must be a zD CUDA tensorzUnsupported dtype for )dimis_cudar   r   )r   r   r   r   r   r   validate_tensor   s   &zGemmWrapperBase.validate_tensorexpected_shape.c                 C   s(   | j |ksJ | d| d| j  d S )Nz must have shape z, got )shape)r   r"   r   r   r   r   validate_shape   s   zGemmWrapperBase.validate_shapedimsc                 C   s   |  ddkr|d S |d S )N   r   )stride)r   r%   r   r   r   get_major_order%   s   zGemmWrapperBase.get_major_order   r   assumed_alignc                 C   s8   | d u rd S ||d krdnd}t |  |dj|dS )Nr&   r   r*   leading_dim)r	   detachmark_layout_dynamic)r   r   r%   r*   r-   r   r   r   create_cute_tensor+   s   z"GemmWrapperBase.create_cute_tensorABDCadditional_tensorscu_seqlens_mcu_seqlens_kA_idxc                 C   s  |d ur|d urJ d|j | j ksJ d|d u}|rF|d us(|d us(J d|j tjks6J d|j  | dksFJ d|  d|d ur|  dksZJ d	|   d| d
ksjJ d|  d|rw|jd }	| j\}
}n| j\}	}|j\}}}||ksJ d| d| |j|d fksJ d|d  d|j |	}|	|f}d}n|d ur|  dksJ d|   d| dksJ d|  d|r| j\}}
|jd }n| j\}}|j\}}||ksJ d| d| |jd d }|j|d fksJ d|d  d|j |}|||f}d
}n:t| dd
 t|dd
 | j\}}}|j\}
}}||ksHJ d| d| t||||fd |||f}d
}|df|dffD ]3\}}|d ur| |ksJ | d| d|  d|j|ksJ | d|j d| qat| t|t|t|d}|r|	 D ]9\}}|d ur| |ksJ | d| d|  d|j|ksJ | d|j d| t|||< q|||||fS )Nz:Only one of cu_seqlens_m and cu_seqlens_k can be specifiedz A and B must have the same dtypez-gather_A requires either varlen_m or varlen_kzA_idx must be int32, got r&   zA_idx must be 1D, got r3      z&A must be 2D when using varlen_m, got    z B must be 3D with varlen_m, got r   zK dimension mismatch: A has z, B has zcu_seqlens_m must have shape (z,), got z&A must be 2D when using varlen_k, got z B must be 2D with varlen_k, got zK dimension mismatch: expected zcu_seqlens_k must have shape (r1   r2   r4   z	 must be zD for this mode, got z shape z doesn't match expected r1   r2   r3   r4   )
r   torchint32r   r#   r   r!   r$   r   items)r1   r2   r3   r4   r5   r6   r7   r8   gather_Atotal_M_KLNK_BMdc_shapedc_ndimtotal_Kr   r   tensorsr   r   r   validate_and_prepare_tensors;   s      


  






z,GemmWrapperBase.validate_and_prepare_tensorsFrJ   varlen_mvarlen_kc                 C   sl   |rdg}n	|rddg}nd }|   D ] \}}|jd ur3|jjdkr3|d u s*||v r3|jddd|_qd S )Nr2   r3   r4   r:   r&   r9   r   )r>   r   r   permute)rJ   rL   rM   tensors_to_permuter   infor   r   r   permute_tensors   s   
zGemmWrapperBase.permute_tensorsc                 C   s.   |   D ]\}}|jd urt|jj |_qd S N)r>   r   r   r   )rJ   r   rP   r   r   r   extract_dtypes   s
   
zGemmWrapperBase.extract_dtypesmajor_configsc                 C   sD   |  D ]\}}|| v r| | jd urt| | j|| | _qd S rR   )r>   r   r   r(   r   )rJ   rT   r   r%   r   r   r   determine_major_orders   s
   z&GemmWrapperBase.determine_major_ordersc                 C   s@   |   D ]\}}|jd ur||v rt|j|j|| |_qd S rR   )r>   r   r   r0   r   r   )rJ   rT   r   rP   r   r   r   create_cute_tensors   s   z#GemmWrapperBase.create_cute_tensors   max_active_clusterstile_count_semaphorebatch_idx_permutemax_swizzle_sizec                 C   sT   t t| |d urtt| tjjddnd |d ur#t|ddjddnd t|dS )N   r+   r   r,   )rY   rZ   r[   )	r   r   r
   data_ptrr   AddressSpacegmemr	   r/   )rX   rY   rZ   r[   r   r   r   create_scheduler_args   s   z%GemmWrapperBase.create_scheduler_argsr   cluster_shape_mnknum_epi_tensormapspingpongc                 C   s0  | d u r
|d u r
d S |d |d  }|| }	| d ur2||sdnd }
|d j d ur1|
|s.dnd7 }
n|d u r8dnd}
d}|
dkrb| d urG| jn|j}tj|	|
|ftj|d}t|ddjdd	d
}nd }t| d urst| ddjddnd |d urt|ddjddnd ||d urt|ddjdddS d dS )Nr   r&   r9   r3   r)   )r   device   r+   )r   r&   r9   )modestride_orderr\   r,   )mCuSeqlensMmCuSeqlensKmTensormapsmAIdx)	r   rd   r<   emptyint64r	   mark_compact_shape_dynamicr   r/   )r6   r7   r8   rX   ra   rJ   rb   rc   cluster_size
num_blocksnum_tensormapstensormap_sizerd   
tensormapstensormaps_cuter   r   r   create_varlen_args   sF   z"GemmWrapperBase.create_varlen_argsr;   )key_tensor_names
activationtile_shape_mn
persistenthas_semaphorerv   c                G   s   g }	|D ]}
|
| v r|	 | |
 j q|	 | |	||g |D ]}
|
| v r/|	 | |
 j q!|	|||g |	| t|	S rR   )appendr   extendr   tuple)rJ   rw   rx   ra   rc   ry   rz   rv   args	key_partsr   r   r   r   get_compile_key&  s   

zGemmWrapperBase.get_compile_key)r)   )NNNNNN)FF)NNrW   )r   F)r   r   r   staticmethodr   r   intr!   r   r$   r(   r   r   r0   r   r   rK   boolrQ   rS   rU   rV   r   r`   r   ru   r   r   r   r   r   r      s
   $"	k


	
	;



	
r   )typingr   r   r   r   dataclassesr   r<   r   cutlass.cuter   cutlassr   cutlass.cute.runtimer	   r
   quack.cute_dsl_utilsr   quack.varlen_utilsr   quack.tile_schedulerr   r   r   r   r   r   r   <module>   s   