o
    Ii_o                     @   sX  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZ d dlZd dlZd dlZd dlZd dlmZmZ d dlZd dlZd dlZd dlmZ d dlZd dlmZmZmZmZ e ddd	d
Z!e!" Z#W d   n1 szw   Y  ej$%ej$&e'Z(dZ)dZ*e+dddkZ,e+dddkZ-e+dddkZ.e+dddkZ/e+dddkZ0e+dddkZ1e+dddkZ2e+dddkZ3e+dddkZ4e+dddkZ5e+dddkZ6e+dddkZ7e+dddkZ8e+dddkZ9e+dddkZ:e+dddkZ;e+dddkZ<e+d ddkZ=e+d!ddkZ>e+d"ddkZ?e+d#ddkZ@d d$lmAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZI 	%	dd&d'ZJeJejKjL_Jd(d) ZMd*d+ ZNd,eOd%dfd-d.ZPdd0eOd1eOd%eQfd2d3ZRd%eQfd4d5ZSd6d7 ZTd8d9 ZUd:d; ZVd<d= ZWd>d?iZXeYd@ZZi Z[g Z\e]g dA e-se^dB_ej` eaej`bdCd  Zceaej`bdCdD ZdePd eNe\ZeZfefedEk regdFefedEkr/eVd>dGdGeXd> dHdI dJ eVd>dKdGeXd> dLdI dJ ej$%e'Zhej$iehejjdMdNdOdGZkej$iekd>eZ Zlekejm ejndP  ejndP< elejndQ< eoeleeljpejqB  g ZrersdR ersdS e.rBdTejt_uee(jvZwewdU dV Zxg e/rTdWgng  e1r\dXgng  e0rddYgng  e2rldZgng  e3rtd[gng  e4r|d\gng  e5rd]gng  e6rd^gng  e7rd_gng  e8rd`gng  e9rdagng  e:rdbgng  e;rdcgng  e<rddgng  e=rdegng  e>rdfgng  e?rdggng  e@rdhgng  Zydige6sdjgng  Zzdige6sdjgng  e7sdkgng  Z{dige6sdjgng  Z|g e:sdlgng  e;sdmgng  e<sdngng  e=s'dogng  e>s/dpgng  Z}dqgZ~e}Zd/ge0s?drgng  Zd/ge1sJdsgng  Zd/ge4sUdtgng  Ze4r^d/gndugZd/ge5sidvgng  Zdwdx eeezeeeD Zdydx ee~e{eeeeD Zdzdx ee}e|eD Zd{dx ee}e|eD Ze/rg Zg Zd|ge?seng  e e?seng  e Ze0sed}g7 Zg d~ZeM dkreddg ee(exd gZe\sededdgey eW e er ey ded dd Zdd ZG dd deZee)e edddgde#dg de\e\r eedndeidg dd dS )    N)Path)parseVersion)setupfind_packages)bdist_wheel)BuildExtensionCppExtensionCUDAExtension	CUDA_HOMEz../README.mdrzutf-8)encoding
flash_attnzVhttps://github.com/Dao-AILab/flash-attention/releases/download/{tag_name}/{wheel_name}FLASH_ATTENTION_FORCE_BUILDFALSETRUEFLASH_ATTENTION_SKIP_CUDA_BUILDFLASH_ATTENTION_FORCE_CXX11_ABI FLASH_ATTENTION_DISABLE_BACKWARDFLASH_ATTENTION_DISABLE_SPLITFLASH_ATTENTION_DISABLE_PAGEDKV FLASH_ATTENTION_DISABLE_APPENDKVFLASH_ATTENTION_DISABLE_LOCALFLASH_ATTENTION_DISABLE_SOFTCAPFLASH_ATTENTION_DISABLE_PACKGQAFLASH_ATTENTION_DISABLE_FP16FLASH_ATTENTION_DISABLE_FP8FLASH_ATTENTION_DISABLE_VARLENFLASH_ATTENTION_DISABLE_CLUSTERFLASH_ATTENTION_DISABLE_HDIM64FLASH_ATTENTION_DISABLE_HDIM96FLASH_ATTENTION_DISABLE_HDIM128FLASH_ATTENTION_DISABLE_HDIM192FLASH_ATTENTION_DISABLE_HDIM256FLASH_ATTENTION_DISABLE_SM80 FLASH_ATTENTION_ENABLE_VCOLMAJOR)	IS_HIP_EXTENSIONCOMMON_HIP_FLAGSSUBPROCESS_DECODE_ARGS
IS_WINDOWSget_cxx_compiler_join_rocm_home_join_cuda_home_is_cuda_file_maybe_writereturnc           '   	   C   s  dd }||}||}||}||}||}||}t |t |ks&J t |dks.J t }dg}|d|  |
s@|rjtrHtdd}ntdd}d	tjv rXtd	}n|}|d
|  |d|  trpt	| }dd
| g}|dd
|  |
r|dd
|  |dd
|  dd |D }|dd
|  |ddg }|dd
|  |dd
|  |dd
|  dd |D }dg}tr|d |d n|d |d |d |
rHd g}d!}tjjd"urtd#d$d%kr|d |d d&}d'g|d(d"  d)| d*g }d+g|d(d"  d)| d,g }|d-| d. g }t||D ]O\}}t|oY|
}|rr|d/rfd0}n|d1rod2}nd3}nd4}tr|d5d6}|d5d6}|dd7}|dd7}|d8| d9| d|  qO|rtj
tj|d d:}d;g}|d< d8| d=d
| g}||g7 }ng g }}|	d"ur d>g} trtd?d@gjt dA}!t |!d(krtj|!d d5d6}"ntdB| dC|" dD n| dE d8|	 dFd
| g}#dG|	 g}$ng g g } }#}$|||g}%|
r?|%| |%| |%| |%|| |||#|$g7 }%dH
dIdJ |%D }&|&dK7 }&t| |& d"S )La  Write a ninja file that does the desired compiling and linking.

    `path`: Where to write this file
    `cflags`: list of flags to pass to $cxx. Can be None.
    `post_cflags`: list of flags to append to the $cxx invocation. Can be None.
    `cuda_cflags`: list of flags to pass to $nvcc. Can be None.
    `cuda_postflags`: list of flags to append to the $nvcc invocation. Can be None.
    `sources`: list of paths to source files
    `objects`: list of desired paths to objects, one per source.
    `ldflags`: list of flags to pass to linker. Can be None.
    `library_target`: Name of the output library. Can be None; in that case,
                      we do no linking.
    `with_cuda`: If we should be compiling with CUDA.
    c                 S   s   | d u rg S dd | D S )Nc                 S   s   g | ]}|  qS  )strip).0flagr0   r0   @/home/ubuntu/.local/lib/python3.10/site-packages/hopper/setup.py
<listcomp>p   s    z=_write_ninja_file.<locals>.sanitize_flags.<locals>.<listcomp>r0   )flagsr0   r0   r4   sanitize_flagsl   s   z)_write_ninja_file.<locals>.sanitize_flagsr   zninja_required_version = 1.3zcxx = binhipccnvccPYTORCH_NVCCznvcc_from_env = znvcc = z	cflags =  zpost_cflags = zcuda_cflags = zcuda_post_cflags = c                 S   s   g | ]
}|d kr
|ndqS )arch=compute_90a,code=sm_90aarch=compute_80,code=sm_80r0   )r2   sr0   r0   r4   r5      s    z%_write_ninja_file.<locals>.<listcomp>zcuda_post_cflags_sm80 = -gencoder>   zcuda_post_cflags_sm80_sm90 = zcuda_dlink_post_cflags = z
ldflags = c                 S   s   g | ]}t j|qS r0   )ospathabspath)r2   filer0   r0   r4   r5      s    zrule compilez@  command = cl /showIncludes $cflags -c $in /Fo$out $post_cflagsz  deps = msvczD  command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflagsz  depfile = $out.dz  deps = gcczrule cuda_compile N*TORCH_EXTENSION_SKIP_NVCC_GEN_DEPENDENCIES01z?--generate-dependencies-with-compile --dependency-output $out.dzrule cuda_compile_sm80   z  command = $nvcc z3 $cuda_cflags -c $in -o $out $cuda_post_cflags_sm80zrule cuda_compile_sm80_sm90z8 $cuda_cflags -c $in -o $out $cuda_post_cflags_sm80_sm90z  command = $nvcc_from_env z. $cuda_cflags -c $in -o $out $cuda_post_cflags_sm90.cucuda_compile_sm80.cucuda_compile_sm80cuda_compile_sm80_sm90compile:z$:z$ zbuild z: zdlink.ozrule cuda_devlinkz5  command = $nvcc $in -o $out $cuda_dlink_post_cflagsz: cuda_devlink z	rule linkwhereclz
z'MSVC is required to load C++ extensionsz  command = "z)/link.exe" $in /nologo $ldflags /out:$outz%  command = $cxx $in $ldflags -o $outz: link zdefault z

c                 s   s    | ]}d  |V  qdS )
N)join)r2   br0   r0   r4   	<genexpr>   s    z$_write_ninja_file.<locals>.<genexpr>rS   )lenr*   appendr&   r+   r,   rA   environgetenvr'   rT   r)   torchversioncudazipr-   endswithreplacerB   dirname
subprocesscheck_outputdecoder(   splitRuntimeErrorr.   )'rB   cflagspost_cflagscuda_cflagscuda_post_cflagscuda_dlink_post_cflagssourcesobjectsldflagslibrary_target	with_cudar7   compilerconfigr:   nvcc_from_envr6   cuda_post_cflags_sm80cuda_post_cflags_sm80_sm90compile_rulecuda_compile_rulenvcc_gendepscuda_compile_rule_sm80cuda_compile_rule_sm80_sm90buildsource_fileobject_fileis_cuda_sourceruledevlink_outdevlink_ruledevlink	link_rulecl_pathscl_pathlinkdefaultblockscontentr0   r0   r4   _write_ninja_fileS   s   



 




 







r   c                  C   sd   t jdrdS t jdkr#dt d ddd } d|  d	S t jd
kr*dS tdt j)z?
    Returns the platform name as used in wheel filenames.
    linuxlinux_x86_64darwin.r   N   macosx__x86_64win32	win_amd64zUnsupported platform: {})sysplatform
startswithrT   mac_verre   
ValueErrorformat)mac_versionr0   r0   r4   get_platform  s   
 
r   c                 C   sJ   t j| d dgdd}| }|dd }t|| dd }||fS )	Nz	/bin/nvccz-VT)universal_newlinesreleaserI   ,r   )rb   rc   re   indexr   )cuda_dir
raw_outputoutputrelease_idxbare_metal_versionr0   r0   r4   get_cuda_bare_metal_version  s
   r   global_optionc                 C   s    t d urd S t|  d d S )Nz was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.)r   warningswarn)r   r0   r0   r4   check_if_cuda_home_none  s
   r   rE   namer   c                 C   s   t | | dv S )N)ONrH   YESr   Y)rA   rZ   upper)r   r   r0   r0   r4   check_env_flag'  s   r   c                   C   s
   t ddS )am  
    Downstream projects and distributions which bootstrap their own dependencies from scratch
    and run builds in offline sandboxes
    may set `FLASH_ATTENTION_OFFLINE_BUILD` in the build environment to prevent any attempts at downloading
    pinned dependencies from the internet or at using dependencies vendored in-tree.

    Dependencies must be defined using respective search paths (cf. `syspath_var_name` in `Package`).
    Missing dependencies lead to an early abortion.
    Dependencies' compatibility is not verified.

    Note that this flag isn't tested by the CI and does not provide any guarantees.
    FLASH_ATTENTION_OFFLINE_BUILDrE   )r   r0   r0   r0   r4   is_offline_build,  s   
r   c                  C   sJ   t d} | st dpt dpt dpd } | stdt j| dS )NFLASH_ATTENTION_HOMEHOMEUSERPROFILEHOMEPATHz"Could not find user home directoryz
.flashattn)rA   rZ   rf   rB   rT   )	user_homer0   r0   r4   get_flashattn_cache_path=  s   
"r   c                 C   s,   d}d|i}t j| d |}t jj|ddS )NzFMozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0z
User-Agenti,  )timeout)urllibrequestRequesturlopen)url
user_agentheadersr   r0   r0   r4   open_urlF  s
   r   c                 C   sn  t  rd S t }tjt}t }zddddt  }W n t	y+   t }Y nw ddd}	||	| ||}
tj
|d| }tj
|tjddd|}|dkrSd	nd
}t|r^|||n|}tj
||}tj| }|rtd|
 d tjt|
dd}|j|d tjtj|d dd td| d| d tj|rtj||dd d S t|| d S )N64aarch64)x86_64arm64r   r   )LinuxDarwinnvidiathird_partybackendz
sbsa-linuxzx86_64-linuxzdownloading and extracting z ...zr|*)fileobjmode)rB   r   T)exist_okzcopy z to )dirs_exist_ok)r   r   rA   rB   ra   __file__r   systemmachineKeyErrorrT   pardircallableexistsprinttarfileopenr   
extractallmakedirsre   isdirshutilcopytreecopy)r   src_pathdst_pathr\   url_funcflashattn_cache_pathbase_dirr   arch	supportedr   tmp_pathplatform_namedownloadrD   r0   r0   r4   download_and_copyP  s6   
r   c                  C   s   t dpd} d| gS )NNVCC_THREADS4z	--threads)rA   rZ   )nvcc_threadsr0   r0   r4   nvcc_threads_argsn  s   r   r:   z12.3.107EXE)git	submoduleupdatez--initz../csrc/cutlassz

torch.__version__  = {}

r   rI   z12.3z9FlashAttention-3 is only supported on CUDA 12.3 and abover8   c                        fdd d S )Nc              	         d d d  d d	S Nz&https://anaconda.org/nvidia/cuda-nvcc/z
/download/-z/cuda-nvcc-z
-0.tar.bz2r0   version_majorversion_minor1version_minor2r   r   r\   r0   r4   <lambda>     <lambda>.<locals>.<lambda>r   re   r   r   r\   r0   r   r4   r        r   )r   r   r   r\   r   znvvm/binc                    r   )Nc              	      r   r   r0   r   r   r0   r4   r     r   r   r   r   r   r0   r   r4   r     r   r   r   r   PATHr;   r@   r=   Tcsrccutlassz!-DFLASHATTENTION_DISABLE_BACKWARDz -DFLASHATTENTION_DISABLE_PAGEDKVz-DFLASHATTENTION_DISABLE_SPLITz!-DFLASHATTENTION_DISABLE_APPENDKVz-DFLASHATTENTION_DISABLE_LOCALz -DFLASHATTENTION_DISABLE_SOFTCAPz -DFLASHATTENTION_DISABLE_PACKGQAz-DFLASHATTENTION_DISABLE_FP16z-DFLASHATTENTION_DISABLE_FP8z-DFLASHATTENTION_DISABLE_VARLENz -DFLASHATTENTION_DISABLE_CLUSTERz-DFLASHATTENTION_DISABLE_HDIM64z-DFLASHATTENTION_DISABLE_HDIM96z -DFLASHATTENTION_DISABLE_HDIM128z -DFLASHATTENTION_DISABLE_HDIM192z -DFLASHATTENTION_DISABLE_HDIM256z-DFLASHATTENTION_DISABLE_SM8xz!-DFLASHATTENTION_ENABLE_VCOLMAJORbf16fp16e4m3@   `            all_split_paged_softcap_softcapall_packgqac              
   C   s4   g | ]\}}}}}d | d| | | | dqS )instantiations/flash_fwd_hdim_rL   r0   )r2   hdimdtypere   pagedsoftcapr0   r0   r4   r5     s    "r5   c                 C   sF   g | ]\}}}}}}|r|s|sd | d| | | | | d	qS )r  r  rJ   r0   )r2   r  r  re   r  r  packgqar0   r0   r4   r5     s    (c                 C   (   g | ]\}}}d | d| | dqS )instantiations/flash_bwd_hdimr  rL   r0   r2   r  r  r  r0   r0   r4   r5         c                 C   r  )r  r  rJ   r0   r  r0   r0   r4   r5     r  zflash_api.cppzflash_fwd_combine.cu)	-O3
-std=c++17z--ftemplate-backtrace-limit=0z--use_fast_mathz--resource-usagez	-lineinfoz'-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLEDz-DCUTLASS_DEBUG_TRACE_LEVEL=0z-DNDEBUGr   z-D_USE_MATH_DEFINESz-Xcompiler=/Zc:__cplusplusincludeflash_attn_3_cudar  r  )cxxr:   )r   rl   extra_compile_argsinclude_dirsc                  C   s|   t ttd d} td|  tj}W d    n1 sw   Y  t|	d}t
jd}|r:| d| S t|S )Nz__init__.pyr   z^__version__\s*=\s*(.*)$rI   FLASH_ATTN_LOCAL_VERSION+)r   r   this_dirresearchread	MULTILINEastliteral_evalgrouprA   rY   getstr)fversion_matchpublic_versionlocal_versionr0   r0   r4   get_package_version  s   r3  c            
      C   s   t tjj} t tj}| jdkrt dnt d} dtjj tjj }t	 }t
 }| j | j }|j d|j }ttjj }t d| d| d| d	| d| d| d| d
}tjd| |d}	|	|fS )N   z11.8z12.2cpr   r   z+cur[   cxx11abi.whlv)tag_name
wheel_name)r   r[   r\   r]   __version__majorr   version_infominorr   r3  r.  _C_GLIBCXX_USE_CXX11_ABIr   PACKAGE_NAMEBASE_WHEEL_URLr   )
torch_cuda_versiontorch_version_rawpython_versionr   package_versioncuda_versiontorch_version	cxx11_abiwheel_filename	wheel_urlr0   r0   r4   get_wheel_url  s   
4rL  c                       s    e Zd ZdZ fddZ  ZS )CachedWheelsCommandai  
    The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot
    find an existing wheel (which is currently the case for all installs). We use
    the environment parameters to detect whether there is already a pre-built version of a compatible
    wheel available and short-circuits the standard full build pipeline.
    c                    s   t rt  S t \}}td| zAtj|| tj	
| js&t| j |  \}}}| j d| d| d| }tj	| j|d }td| t|| W d S  tjjyg   td t   Y d S w )NzGuessing wheel URL: r   r7  zRaw wheel pathz4Precompiled wheel not found. Building from source...)FORCE_BUILDsuperrunrL  r   r   r   urlretrieverA   rB   r   dist_dirr   get_tagwheel_dist_namerT   r   moveerror	HTTPError)selfrK  rJ  impl_tagabi_tagplat_tagarchive_basename
wheel_path	__class__r0   r4   rP  9  s"   



zCachedWheelsCommand.run)__name__
__module____qualname____doc__rP  __classcell__r0   r0   r^  r4   rM  1  s    rM  )r{   r  r  testsdistdocs
benchmarks)excludeflash_attn_interfacezFlashAttention-3ztext/markdown)z#Programming Language :: Python :: 3z2License :: OSI Approved :: Apache Software LicensezOperating System :: Unix)r   	build_extr   z>=3.8)r[   einops	packagingninja)r   r\   packages
py_modulesdescriptionlong_descriptionlong_description_content_typeclassifiersext_modulescmdclasspython_requiresinstall_requires)r/   N)rE   )r   r   rA   statr&  r   r*  pathlibr   packaging.versionr   r   r   	sysconfigr   	itertools
setuptoolsr   r   rb   urllib.requestr   urllib.errorwheel.bdist_wheelr   _bdist_wheelr[   torch.utils.cpp_extensionr   r	   r
   r   r   fhr(  rr  rB   ra   rC   r   r%  rA  rB  rZ   rN  SKIP_CUDA_BUILDFORCE_CXX11_ABIDISABLE_BACKWARDDISABLE_SPLITDISABLE_PAGEDKVDISABLE_APPENDKVDISABLE_LOCALDISABLE_SOFTCAPDISABLE_PACKGQADISABLE_FP16DISABLE_FP8DISABLE_VARLENDISABLE_CLUSTERDISABLE_HDIM64DISABLE_HDIM96DISABLE_HDIM128DISABLE_HDIM192DISABLE_HDIM256DISABLE_SM8xENABLE_VCOLMAJORr&   r'   r(   r)   r*   r+   r,   r-   r.   r   utilscpp_extensionr   r   r.  r   boolr   r   r   r   r   r   NVIDIA_TOOLCHAIN_VERSIONget_config_varexe_extensionrv  ru  rP  r   r   r;  intre   TORCH_MAJORTORCH_MINORr  r   rf   r   rT   r   ctk_path_newnvcc_path_newpathseprY   chmodst_modeS_IEXECcc_flagrX   r?  r@  parentrepo_dircutlass_dirfeature_argsDTYPE_FWD_SM80DTYPE_FWD_SM90	DTYPE_BWDHEAD_DIMENSIONS_BWDHEAD_DIMENSIONS_FWDHEAD_DIMENSIONS_FWD_SM80SPLITPAGEDKVSOFTCAPSOFTCAP_ALLPACKGQAproductsources_fwd_sm80sources_fwd_sm90sources_bwd_sm80sources_bwd_sm90rl   
nvcc_flagsextendr"  r3  rL  rM  r0   r0   r0   r4   <module>   s  
,

 
-		





	
&

"
