o
    پi                     @   s  d dl Z d dlZd dlZd dlZzddlmZ e D ]\ZZee j	vr)ee j	e< qW n	 e
y4   Y nw ddlmZ defddZdad:d	d
ZdefddZedZedZedZedZedZedZedZedZedZedZedZedZedZeZedZ edZ!edZ"edZ#edZ$ed Z%ed!Z&ed"Z'ed#Z(ed$Z)ed%Z*ed&Z+ed'Z,ed(Z-ed)Z.ed*Z/ed+Z0ed,Z1ed-Z2ed.Z3ed/Z4ed0Z5ed1Z6ed2Z7ed3Z8dd4lm9Z9 dd5lm:Z: dd6l:T d7d8 Z;e  	 e;  dS );    N   )persistent_envs)deep_gemm_cppreturnc                  C   s   t jdpt jd} | d u rTz0tt jd }tjddg|d d}t j	
t j	
|} W d    n1 s:w   Y  W n tyS   d} t j	| sQd } Y nw | d usZJ | S )	N	CUDA_HOME	CUDA_PATHwwhichnvcc)stderrz
z/usr/local/cuda)osenvirongetopendevnull
subprocesscheck_outputdecoderstrippathdirname	Exceptionexists)	cuda_homer   r
    r   F/home/ubuntu/.local/lib/python3.10/site-packages/deep_gemm/__init__.py_find_cuda_home   s    r   Fc                  C   s6   t rd S tjtjt} tjj	| t
  da d S )NT)_dg_initializedr   r   r   abspath__file__torchops	deep_gemminitr   )library_rootr   r   r   _ensure_initialized%   s
   r%   namec                    s   t tjj|   fdd}|S )Nc                     s   t    | i |S )N)r%   )argskwargsfuncr   r   _fn0   s   z_wrap_op.<locals>._fn)getattrr    r!   r"   )r&   r+   r   r)   r   _wrap_op.   s   r-   set_num_smsget_num_smsset_compile_modeget_compile_modeset_tc_utilget_tc_utilfp8_gemm_ntfp8_gemm_nnfp8_gemm_tnfp8_gemm_tt m_grouped_fp8_gemm_nt_contiguous m_grouped_fp8_gemm_nn_contiguousm_grouped_fp8_gemm_nt_masked k_grouped_fp8_gemm_nt_contiguous k_grouped_fp8_gemm_tn_contiguousbf16_gemm_ntbf16_gemm_nnbf16_gemm_tnbf16_gemm_tt!m_grouped_bf16_gemm_nt_contiguous!m_grouped_bf16_gemm_nn_contiguousm_grouped_bf16_gemm_nt_masked!k_grouped_bf16_gemm_tn_contiguouscublaslt_gemm_ntcublaslt_gemm_nncublaslt_gemm_tncublaslt_gemm_ttfp8_gemm_nt_skip_head_midfp8_mqa_logitsget_paged_mqa_logits_metadatafp8_paged_mqa_logitseinsum!transform_sf_into_required_layoutget_tma_aligned_size&get_mk_alignment_for_contiguous_layoutget_mn_major_tma_aligned_tensor,get_mn_major_tma_aligned_packed_ue8m0_tensor6get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor)testing)utils)*c                     sF   g d} t tjjj   fdd| D }|r!td|  d S d S )N)r#   r.   r/   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   rN   rO   rP   rQ   rR   rS   rI   rJ   rK   rL   rM   rE   rF   rG   rH   c                    s   g | ]}| vr|qS r   r   ).0opavailable_opsr   r   
<listcomp>   s    z&_verify_ops_loaded.<locals>.<listcomp>zWarning: Missing operations: )listr    r!   r"   __dict__keysprint)expected_opsmissing_opsr   rY   r   _verify_ops_loadedp   s   rb   T)r   N)<r   r   r    torch.utils.cpp_extensionenvsr   itemskeyvaluer   ImportError r   strr   r   r%   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   fp8_m_grouped_gemm_nt_maskedr;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rb   r   r   r   r   <module>   s    


	
