o ˜à·i :ã@sðddlZddlZddlmZddlmZeeƒZdejd<dejd<dej j _dd „Zd e efdd„Zddedefdd„Zddd„Zdd„ZeƒedƒrvddlmZddlmZddlmZmZeegdƒej j jd<ee_ee_dSdS)éN)Úinit_logger©Úis_torch_equalÚ1ÚPYTORCH_NVML_BASED_CUDA_CHECKÚTORCHINDUCTOR_COMPILE_THREADSécs€ddlmm‰ddlm}m}m}m}m}ddl m ‰dttf‡‡fdd„}t ˆjj|ƒr@ˆjjjdur@|ˆjjjjƒ}nˆj ¡}|jrrt |jd|ƒrr|jdjj|vrr|j ¡|jrrt |jd|ƒrr|jdjj|vsY|ƒg}g} tt|jƒƒD]/} |j| }t ||ƒr–| |d¡|j| <qt ||ƒr¢| |ƒ¡qt ||ƒr®| | ¡¡q| | ¡¡t|ƒdks¾J‚dS)Nr)ÚEnterSubgraphLineÚExitSubgraphLineÚMemoryPlanningLineÚMemoryPlanningStateÚSubgraphPythonWrapperCodegen©ÚVÚreturncsddl}g}| d¡}| d¡}|D]3}t|ˆjƒr)| ˆjj›dt|ƒ›¡qt|ˆjƒr>| ˆjj›dt|ƒ›¡q| | ¡¡q|S)NrÚ_noneÚ_shape) Ú itertoolsÚcountÚ isinstanceÚNoneAsConstantBufferÚappendÚgraphÚnameÚnextÚShapeAsConstantBufferÚget_name)Ú graph_outputsrÚnamesÚ shape_counterÚnone_counterÚnode©rÚir©úG/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/env_override.pyÚget_output_names-s z3memory_plan_reuse_patched..get_output_nameséÿÿÿÿ)Útorch._inductor.irÚ _inductorr#Útorch._inductor.codegen.wrapperr r rrr Útorch._inductor.virtualizedrÚlistÚstrrrÚwrapper_codeÚpartition_signaturesÚoutput_nodesr&Úlinesr!rÚpopÚrangeÚlenÚplanr)Úselfr r rrr r&Ú out_namesÚplanning_statesÚpast_planning_statesÚiÚliner$r"r%Úmemory_plan_reuse_patched"sFÿ ÿ ÿþ ûÿþ €r<Úskip_cudagraphscsddlm}ddlm}m‰m‰ddlm‰ddlm }g}|ˆj ¡ƒ}ˆ ¡‰dt dtf‡‡‡‡fdd „‰tt|ƒt|ƒƒD]¹\}} |ƒ} |D] }| |j ¡¡qI| |¡}|j d d„|Dƒ¡} |‡fdd„| j| jBDƒƒ| }|‡fd d„|Dƒƒ}|ƒ‰|D]}ˆ |j¡q…‡fdd„ˆ| Dƒ}| |¡‡fdd„|Dƒ}‡‡fdd„|Dƒ}‡‡fdd„|Dƒ}| |¡|‡fdd„|Dƒƒ}‡‡fdd„|Dƒ}‡fdd„|Dƒ}ˆ ||¡}|||||| |ƒ}| |¡| ||¡}q@|ddd…S)z• Gets signature for each graph partition, including input nodes, output nodes, and whether deallocating an input within graph partition. r)Údependencies)ÚGraphPartitionSignatureÚMutationOutputÚ NoneLayoutr)Ú OrderedSetÚbuf_namercsVˆj |d¡}|dur dSt|jjˆƒr)t|jˆƒr'ˆj |d¡}r'ˆ|ƒSdSdS)z Checks if buf_name is NoneLayout. Buffers with NoneLayout is not allocated so graph partition should not take it as inputs or outputs. NFT)Úname_to_bufÚgetrr!ÚlayoutÚmutation_real_name)rCÚbufÚ real_name)r@rAÚis_none_layoutr6r$r%rJwsÿz=get_graph_partition_signature_patched..is_none_layoutcSsg|]}|j‘qSr$)Úread_writes)Ú.0r!r$r$r%Ú ˜sz9get_graph_partition_signature_patched..csg|] }ˆ|jƒs|j‘qSr$)r)rLÚx)rJr$r%rMŸsýÿc3ó|] }ˆj ||¡VqdS©N©rGrE©rLr©r6r$r%Ú ¨ó€ ÿz8get_graph_partition_signature_patched..csg|]}|ˆvr|‘qSr$r$rR©Úname_to_noder$r%rM³s þcsi|]}|ˆvr|ˆ|“qSr$r$rRrVr$r%Ú ºó þz9get_graph_partition_signature_patched..csi|]}|ˆvr||ˆv“qSr$r$rR©Úbuffer_names_to_freerWr$r%rX¿rYcs g|]}|ˆvr|ˆvr|‘qSr$r$rRrZr$r%rMÉs þc3rOrPrQrRrSr$r%rTÑrUcsg|] }ˆ|ƒsˆ|‘qSr$r$rR)rJrWr$r%rMÕsýÿcsg|] }|ˆjjvr|‘qSr$)rÚ constantsrRrr$r%rMÛsNr')Útorch._inductorr>r(r?r@rAr+rÚtorch.utils._ordered_setrBrr&Úget_name_to_nodesr-ÚboolÚzipÚreversedÚupdateÚoutputs_by_nameÚkeysÚintersectionÚ ReadWritesÚ merge_listÚreadsÚwritesÚ last_usageÚ!get_graph_partition_symbol_inputsrÚunion)r6Ú partitionsr=r>r?rBÚ signaturesÚunmet_output_namesÚ partitionÚskip_cudagraphÚoutput_namesr!Úreturned_output_namesrKÚpartition_input_namesÚextra_input_namesÚinput_nodesÚinput_deallocationÚextra_output_namesr0Úconstant_namesÚ symbol_inputsÚpartition_signaturer$)r@rArr[rJrWr6r%Ú%get_graph_partition_signature_patchedfsŠÿ ÿ þÿùÿÿ þ þþ þ ÿþ ÿú ÿr}FÚ should_logrcs²ddlmm}ddlm}m}ddlm}m}m }|j } t| tjjj ƒrY| j} rY| ¡}t| tjjƒr>|›d| j›n|}|tjjjvsN|tjjjvrYt| tjjƒsWJ‚dStjjjjsf|jdurfdSdtd|dBd dfd d„} |rw|n| }t||ƒrŠt‡fdd „|jDƒƒS|j dus‘J‚| ¡s|d|ddSt|j |jƒr¬|d|ddSt|j |jƒr»|d|ddSt|j ddƒrÊ|d|ddS||j ƒr×|d|ddSdS)zBReturn True if we should partition the inductor graph on this noderN)ÚBaseSchedulerNodeÚFusedSchedulerNode)Ú&_unstable_customized_partition_wrapperÚis_cudagraph_unsafe_opÚmaybe_log_cudagraph_partitionÚ.TÚmsgr!rcSsdSrPr$)r…r!r$r$r%Únoop_log9sz*should_partition_patched..noop_logc3s|]}ˆ |¡VqdSrP)Úshould_partition)rLÚsnoderSr$r%rT?s€z+should_partition_patched..znon gpu ops)r!zDeviceCopy opszConditional opsÚunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opsF)r(r)r#Útorch._inductor.schedulerrr€Útorch._inductor.utilsrr‚rƒr!rÚtorchÚFallbackKernelÚop_overloadrÚ_opsÚ OpOverloadÚ _overloadnameÚconfigÚcustom_should_partition_opsÚtritonÚ cudagraphsÚwrapperr-ÚanyÚsnodesÚis_gpuÚ DeviceCopyÚConditionalÚgetattr)r6r!r~r#rr€rr‚rƒÚir_nodeÚopÚop_overload_packet_nameÚop_overload_namer†Úlog_partition_reasonr$rSr%Úshould_partition_patchedsVÿÿýÿ ÿ r¢cCshddlmm}ddlm}t|_t|_| dd¡||j ƒ|_WdƒdS1s-wYdS)zÀ (Re)initializes the scheduler member. When initializing the scheduler, no CUBIN files should be generated (to avoid biasing any benchmarks and pessimizing fusion decisions). rN)Ú Schedulerztriton.store_cubinF)Útorch._inductor.configr)r’rŠr£r¢r‡r}Úget_graph_partition_signatureÚpatchÚ operationsÚ scheduler)r6r’r£r$r$r%Ú_update_scheduler_patched[s "ÿr©cCsNddlm}|dƒs|dƒr%ddl}ttjdƒr#ddlm}||_dSdSdS)z;Workaround for TorchInductor autotune get_raw_stream() bug.rrú2.9.0z2.9.1NÚ_cuda_getCurrentRawStream)r«) Úvllm.utils.torch_utilsrÚbuiltinsÚhasattrrŒÚ_CÚtorch._Cr«Úget_raw_stream)rrÚ_get_raw_streamr$r$r%Ú_patch_get_raw_stream_if_neededts ýûr³rª)ÚPythonWrapperCodegen)Ú GraphLowering)Ú_ConfigÚ_ConfigEntry)Údefaultr“)F)rN)ÚosrŒÚvllm.loggerrr¬rÚ__name__ÚloggerÚenvironr)r’Úcompile_threadsr<r,r`r}r¢r©r³r*r´Útorch._inductor.graphrµÚtorch.utils._config_moduler¶r·Ú_configÚmemory_plan_reuseÚ_update_schedulerr$r$r$r%Ús6 D ÿ# Sÿ ô