import torch
from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
from .module_tracker import ModuleTracker
from typing import List, Any, Dict, Optional, Union, Tuple, Iterator
from collections import defaultdict
from torch.utils._python_dispatch import TorchDispatchMode
from math import prod
from functools import wraps
import warnings

__all__ = ["FlopCounterMode", "register_flop_formula"]

aten = torch.ops.aten
isinstancetorchTensorshape)i r   V/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/torch/utils/flop_counter.py	get_shape   s   r   flop_registryc                    s   t  d d fdd
}|S )N)out_valc                    s(   t t||| f\}}} |d|i|S )N	out_shape)r   r   )r    argskwargsr!   fr   r   nf   s   zshape_wrapper.<locals>.nfr   r%   r&   r   r$   r   shape_wrapper   s   r(   Fc                    s    fdd}|S )Nc                    s,   st    fdd}tjj|  S )Nc                    sH   t | tjjstd|  dt|  | tv rtd|   t| < d S )Nzlregister_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), got z which is of type zduplicate registrations for )r   r   _opsOpOverloadPacket
def register_flop_formula(targets, get_raw=False):
    def register_fun(flop_formula):
        if not get_raw:
            flop_formula = shape_wrapper(flop_formula)

        def register(target):
            if not isinstance(target, torch._ops.OpOverloadPacket):
                raise ValueError(
                    f"register_flop_formula(targets): expected each target to be "
                    f"OpOverloadPacket (i.e. torch.ops.mylib.foo), got "
                    f"{target} which is of type {type(target)}")
            if target in flop_registry:
                raise RuntimeError(f"duplicate registrations for {target}")
            flop_registry[target] = flop_formula

        # To handle allowing multiple aten ops at once
        torch.utils._pytree.tree_map_(register, targets)

        return flop_formula

    return register_fun

@register_flop_formula(aten.mm)
def mm_flop(a_shape, b_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for matmul."""
    # Inputs contain the shapes of two matrices.
    m, k = a_shape
    k2, n = b_shape
    assert k == k2
    return m * n * 2 * k
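# A hedged usage sketch (not part of the upstream module): registering a formula
# for a custom op. ``torch.ops.mylib.numel_linear`` is a hypothetical name; any
# OpOverloadPacket works. The decorated formula receives shapes because
# ``shape_wrapper`` is applied by default (pass ``get_raw=True`` to receive the
# raw tensors instead).
#
#     @register_flop_formula(torch.ops.mylib.numel_linear)
#     def numel_linear_flop(x_shape, w_shape, *args, out_shape=None, **kwargs) -> int:
#         # One multiply and one add per (input element, output feature) pair.
#         return prod(x_shape) * w_shape[0] * 2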
   t ||S )zCount flops for addmm.)rA   
self_shaper;   r<   r!   r#   r   r   r   
addmm_flopB   s   
rE   c                 K   sD   | \}}}|\}}}	||ksJ ||ksJ || |	 d | }
|
S )z"Count flops for the bmm operation.r:   r   )r;   r<   r!   r#   br=   r>   b2r?   r@   flopr   r   r   bmm_flopG   s   

rI   c                 K   rB   )z&Count flops for the baddbmm operation.rI   rC   r   r   r   baddbmm_flopT   s   
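# A worked example of the convention above (illustrative, with assumed shapes):
# multiplying a (256, 512) matrix by a (512, 128) matrix performs 256 * 128
# output dot products of length 512, each counted as 2 * 512 flops (one multiply
# and one add per element), so
# mm_flop((256, 512), (512, 128)) == 256 * 128 * 2 * 512 == 33_554_432.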
def conv_flop_count(
    x_shape: List[int],
    w_shape: List[int],
    out_shape: List[int],
    transposed: bool = False,
) -> int:
    """Count flops for convolution.

    Note that only multiplications are counted; computation for bias is ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).

    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    """
    batch_size = x_shape[0]
    conv_shape = (x_shape if transposed else out_shape)[2:]
    c_out, c_in, *filter_size = w_shape

    # For a regular conv, each output spatial location convolves the filter with
    # an input patch (prod(conv_shape) * prod(filter_size) multiply-adds), scaled
    # by the batch size and the input/output channel cross product. For a
    # transposed conv, the input spatial dimensions are used instead.
    flop = prod(conv_shape) * prod(filter_size) * batch_size * c_out * c_in * 2
    return flop
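# A worked example (illustrative, with assumed shapes): a 3x3 convolution taking
# a (1, 3, 32, 32) input to a (1, 8, 30, 30) output uses w_shape == (8, 3, 3, 3),
# so conv_flop_count gives prod((30, 30)) * prod((3, 3)) * 1 * 8 * 3 * 2
# == 900 * 9 * 48 == 388_800 flops.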
@register_flop_formula([aten.convolution, aten._convolution])
def conv_flop(x_shape, w_shape, _bias, _stride, _padding, _dilation, transposed, *args, out_shape=None, **kwargs) -> int:
    """Count flops for convolution."""
    return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed)


@register_flop_formula(aten.convolution_backward)
def conv_backward_flop(
        grad_out_shape,
        x_shape,
        w_shape,
        _bias,
        _stride,
        _padding,
        _dilation,
        transposed,
        _output_padding,
        _groups,
        output_mask,
        out_shape) -> int:

    def t(shape):
        # Swap the first two (channel) dimensions; used for the grad_weight pass.
        return [shape[1], shape[0]] + list(shape[2:])

    flop_count = 0

    # grad_input is a convolution of grad_out with the filter, with the
    # transposedness flipped; grad_weight correlates the input with grad_out,
    # with the channel dimensions swapped via ``t``.
    if output_mask[0]:
        grad_input_shape = get_shape(out_shape[0])
        flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not transposed)

    if output_mask[1]:
        grad_weight_shape = get_shape(out_shape[1])
        if transposed:
            flop_count += conv_flop_count(t(grad_out_shape), t(x_shape), t(grad_weight_shape), transposed=False)
        else:
            flop_count += conv_flop_count(t(x_shape), t(grad_out_shape), t(grad_weight_shape), transposed=False)

    return flop_count
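# Continuing the worked example above (assumed shapes): with
# output_mask == [True, True, False] the backward pass counts one
# conv-equivalent for grad_input and one for grad_weight, 388_800 flops each
# here, i.e. 777_600 in total; the bias gradient is a pure reduction and adds
# nothing.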
|\}}}}||  kr|kr8n J ||  kr)|kr8n J ||
kr8|	|kr8||
ks:J d}|t || ||f|| ||	f7 }|t || ||	f|| |	|f7 }|S )z^
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    r   rJ   )query_shape	key_shapevalue_shaperF   hs_qd_q_b2_h2s_k_d2_b3_h3_s3d_vtotal_flopsr   r   r   sdpa_flop_count   s   P""rt   c                O   s   t | ||S )Count flops for self-attention.rt   )re   rf   rg   r!   r"   r#   r   r   r   	sdpa_flop  s   rw   c                 C   sF   ddl m} ddlm} t| ||fs|   S |g| dd  S )z
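# A worked example (illustrative, with assumed shapes): for b=1, h=8,
# s_q == s_k == 1024 and d_q == d_v == 64, each of the two batched matmuls above
# costs 8 * 1024 * 1024 * 2 * 64 == 1_073_741_824 flops, so sdpa_flop_count
# returns about 2.15 GFLOP for one forward attention call.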
def _offsets_to_lengths(offsets, max_len):
    """
    If the offsets tensor is fake, then we don't know the actual lengths.
    In that case, we can just assume the worst case; each batch has max length.
    """
    from torch._subclasses.fake_tensor import FakeTensor
    from torch._subclasses.functional_tensor import FunctionalTensor
    if not isinstance(offsets, (FakeTensor, FunctionalTensor)):
        return offsets.diff().tolist()
    return [max_len] * (offsets.size(0) - 1)


def _unpack_flash_attention_nested_shapes(
    *,
    query,
    key,
    value,
    grad_out=None,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
) -> Iterator[Tuple[Tuple[int, ...], Tuple[int, ...], Tuple[int, ...], Optional[Tuple[int, ...]]]]:
    """
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    """
    if cum_seq_q is not None:
        # We are dealing with a jagged (NestedTensor) query: the inputs are packed
        # as (sum(sequence len), heads, dimension) rather than the dense
        # (batch, heads, sequence len, dimension), so each batch element is
        # yielded separately. For fake tensors the lengths are a worst-case
        # (max length) overestimate.
        assert len(key.shape) == 3
        assert len(value.shape) == 3
        assert grad_out is None or grad_out.shape == query.shape
        _, h_q, d_q = query.shape
        _, h_k, d_k = key.shape
        _, h_v, d_v = value.shape
        assert cum_seq_q is not None
        assert cum_seq_k is not None
        assert cum_seq_q.shape == cum_seq_k.shape
        seq_q_lengths = _offsets_to_lengths(cum_seq_q, max_q)
        seq_k_lengths = _offsets_to_lengths(cum_seq_k, max_k)
        for seq_q_len, seq_k_len in zip(seq_q_lengths, seq_k_lengths):
            new_query_shape = (1, h_q, seq_q_len, d_q)
            new_key_shape = (1, h_k, seq_k_len, d_k)
            new_value_shape = (1, h_v, seq_k_len, d_v)
            new_grad_out_shape = new_query_shape if grad_out is not None else None
            yield new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape
        return

    yield query.shape, key.shape, value.shape, grad_out.shape if grad_out is not None else None


def _unpack_efficient_attention_nested_shapes(
    *,
    query,
    key,
    value,
    grad_out=None,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
) -> Iterator[Tuple[Tuple[int, ...], Tuple[int, ...], Tuple[int, ...], Optional[Tuple[int, ...]]]]:
    """
    Given inputs to an efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    """
    if cu_seqlens_q is not None:
        # Unlike the flash kernel, the efficient kernel packs jagged inputs as
        # (1, sum(sequence len), heads, dimension); the per-batch shapes below
        # are again a worst-case overestimate for fake tensors.
        assert len(key.shape) == 4
        assert len(value.shape) == 4
        assert grad_out is None or grad_out.shape == query.shape
        _, _, h_q, d_q = query.shape
        _, _, h_k, d_k = key.shape
        _, _, h_v, d_v = value.shape
        assert cu_seqlens_q is not None
        assert cu_seqlens_k is not None
        assert cu_seqlens_q.shape == cu_seqlens_k.shape
        seqlens_q = _offsets_to_lengths(cu_seqlens_q, max_seqlen_q)
        seqlens_k = _offsets_to_lengths(cu_seqlens_k, max_seqlen_k)
        for len_q, len_k in zip(seqlens_q, seqlens_k):
            new_query_shape = (1, h_q, len_q, d_q)
            new_key_shape = (1, h_k, len_k, d_k)
            new_value_shape = (1, h_v, len_k, d_v)
            new_grad_out_shape = new_query_shape if grad_out is not None else None
            yield new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape
        return

    yield query.shape, key.shape, value.shape, grad_out.shape if grad_out is not None else None
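# A small sketch of the jagged-offsets convention handled above (hypothetical
# values): cum_seq_q == tensor([0, 3, 7, 9]) describes three batch elements with
# sequence lengths offsets.diff() == [3, 4, 2]; for fake tensors the lengths are
# unknown and every element is assumed to have max_q (or max_seqlen_q) instead.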
@register_flop_formula(aten._flash_attention_forward, get_raw=True)
def _flash_attention_forward_flop(
    query,
    key,
    value,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
    *args,
    out_shape=None,
    **kwargs
) -> int:
    """Count flops for self-attention."""
    # NB: We aren't accounting for causal attention here
    # In case this is a nested tensor, unpack the individual batch elements
    # and sum the flops per batch element.
    sizes = _unpack_flash_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        cum_seq_q=cum_seq_q,
        cum_seq_k=cum_seq_k,
        max_q=max_q,
        max_k=max_k,
    )
    return sum(
        sdpa_flop_count(query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, _ in sizes
    )
tdd |
D S )ru   )r   r   r   r   r   r   r   c                 s   r   r   rv   r   r   r   r   r     r   z4_efficient_attention_forward_flop.<locals>.<genexpr>r   r   )r   r   r   biasr   r   r   r   r"   r#   r   r   r   r   !_efficient_attention_forward_flop  r   r   c                 C   sV  d}|\}}}}|\}	}
}}|\}}}}| \}}}}||	  kr)|  kr)|krBn J ||
  kr;|  kr;|krBn J ||ksDJ ||krP||krP||ksRJ d}|t || ||f|| ||f7 }|t || ||f|| ||f7 }|t || ||f|| ||f7 }|t || ||f|| ||f7 }|t || ||f|| ||f7 }|S )Nr   rJ   )r]   re   rf   rg   rs   rF   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   _b4_h4_s4_d4r   r   r   sdpa_backward_flop_count  s   T"""""r   c                O   s   t | |||S )z(Count flops for self-attention backward.r   )r]   re   rf   rg   r!   r"   r#   r   r   r   sdpa_backward_flop  s   r   c
              
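# Consistency note (derived from the formulas above): with s_q == s_k and
# d_q == d_v, the backward pass performs five score-sized batched matmuls versus
# two in the forward pass, so counted attention backward flops come out to 2.5x
# the forward flops.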
@register_flop_formula(aten._flash_attention_backward, get_raw=True)
def _flash_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    out,
    logsumexp,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
    *args,
    **kwargs,
) -> int:
    shapes = _unpack_flash_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cum_seq_q=cum_seq_q,
        cum_seq_k=cum_seq_k,
        max_q=max_q,
        max_k=max_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )
@register_flop_formula(aten._efficient_attention_backward, get_raw=True)
def _efficient_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    bias,
    out,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    *args,
    **kwargs,
) -> int:
    shapes = _unpack_efficient_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )


flop_registry = {
    aten.mm: mm_flop,
    aten.addmm: addmm_flop,
    aten.bmm: bmm_flop,
    aten.baddbmm: baddbmm_flop,
    aten.convolution: conv_flop,
    aten._convolution: conv_flop,
    aten.convolution_backward: conv_backward_flop,
    aten._scaled_dot_product_efficient_attention: sdpa_flop,
    aten._scaled_dot_product_flash_attention: sdpa_flop,
    aten._scaled_dot_product_cudnn_attention: sdpa_flop,
    aten._scaled_dot_product_efficient_attention_backward: sdpa_backward_flop,
    aten._scaled_dot_product_flash_attention_backward: sdpa_backward_flop,
    aten._scaled_dot_product_cudnn_attention_backward: sdpa_backward_flop,
    aten._flash_attention_forward: _flash_attention_forward_flop,
    aten._efficient_attention_forward: _efficient_attention_forward_flop,
    aten._flash_attention_backward: _flash_attention_backward_flop,
    aten._efficient_attention_backward: _efficient_attention_backward_flop,
}


def normalize_tuple(x):
    if not isinstance(x, tuple):
        return (x,)
    return x


# Define the suffixes for different orders of magnitude of flops
suffixes = ["", "K", "M", "B", "T"]

def get_suffix_str(number):
    # Find the index of the appropriate suffix based on the number of digits,
    # with some overflow: e.g. 1.01B is displayed as 1001M rather than 1.001B.
    index = max(0, min(len(suffixes) - 1, (len(str(number)) - 2) // 3))
    return suffixes[index]

def convert_num_with_suffix(number, suffix):
    index = suffixes.index(suffix)
    # Divide the number by 1000^index and format it to three decimal places
    value = f"{number / 1000 ** index:.3f}"
    # Return the value and the suffix as a string
    return value + suffixes[index]

def convert_to_percent_str(num, denom):
    if denom == 0:
        return "0%"
    return f"{num / denom:.2%}"

def _pytreeify_preserve_structure(f):
    @wraps(f)
    def nf(args):
        flat_args, spec = tree_flatten(args)
        out = f(*flat_args)
        return tree_unflatten(out, spec)

    return nf
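# A quick illustration of the display helpers above (assumed values): a total of
# 2_147_483_648 flops has 10 digits, so get_suffix_str picks index
# (10 - 2) // 3 == 2, i.e. "M", and
# convert_num_with_suffix(2_147_483_648, "M") == "2147.484M".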
dedeeeef  f fd	d
Zde
fddZdeeeee
f f fddZdddZdd Zdd Zdd Z  ZS )r   a  
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    """

    def __init__(
            self,
            mods: Optional[Union[torch.nn.Module, List[torch.nn.Module]]] = None,
            depth: int = 2,
            display: bool = True,
            custom_mapping: Optional[Dict[Any, Any]] = None):
        super().__init__()
        self.flop_counts: Dict[str, Dict[Any, int]] = defaultdict(lambda: defaultdict(int))
        self.depth = depth
        self.display = display
        self.mode: Optional[_FlopCounterMode] = None
        if custom_mapping is None:
            custom_mapping = {}
        if mods is not None:
            warnings.warn("mods argument is not needed anymore, you can stop passing it", stacklevel=2)
        self.flop_registry = {
            **flop_registry,
            **{k: v if getattr(v, "_get_raw", False) else shape_wrapper(v) for k, v in custom_mapping.items()}
        }
        self.mod_tracker = ModuleTracker()

    def get_total_flops(self) -> int:
        return sum(self.flop_counts['Global'].values())

    def get_flop_counts(self) -> Dict[str, Dict[Any, int]]:
        """Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        """
        return {k: dict(v) for k, v in self.flop_counts.items()}

    def get_table(self, depth=None):
        if depth is None:
            depth = self.depth
        if depth is None:
            depth = 999999

        import tabulate
        tabulate.PRESERVE_WHITESPACE = True
        header = ["Module", "FLOP", "% Total"]
        values = []
        global_flops = self.get_total_flops()
        global_suffix = get_suffix_str(global_flops)
        is_global_subsumed = False

        def process_mod(mod_name, depth):
            nonlocal is_global_subsumed

            total_flops = sum(self.flop_counts[mod_name].values())

            is_global_subsumed |= total_flops >= global_flops

            padding = " " * depth
            values = []
            values.append([
                padding + mod_name,
                convert_num_with_suffix(total_flops, global_suffix),
                convert_to_percent_str(total_flops, global_flops)
            ])
            for k, v in self.flop_counts[mod_name].items():
                values.append([
                    padding + " - " + str(k),
                    convert_num_with_suffix(v, global_suffix),
                    convert_to_percent_str(v, global_flops)
                ])
            return values

        for mod in sorted(self.flop_counts.keys()):
            if mod == 'Global':
                continue
            mod_depth = mod.count(".") + 1
            if mod_depth > depth:
                continue

            cur_values = process_mod(mod, mod_depth - 1)
            values.extend(cur_values)

        # Only output the "Global" row if it contains flops that are not already
        # fully accounted for by some module.
        if 'Global' in self.flop_counts and not is_global_subsumed:
            for value in values:
                value[0] = " " + value[0]

            values = process_mod('Global', 0) + values

        if len(values) == 0:
            values = [["Global", "0", "0%"]]

        return tabulate.tabulate(values, headers=header, colalign=("left", "right", "right"))

    def __enter__(self):
        self.flop_counts.clear()
        self.mod_tracker.__enter__()
        self.mode = _FlopCounterMode(self)
        self.mode.__enter__()
        return self

    def __exit__(self, *args):
        assert self.mode is not None
        b = self.mode.__exit__(*args)
        self.mode = None  # break cycles
        self.mod_tracker.__exit__()
        if self.display:
            print(self.get_table(self.depth))
        return b

    def _count_flops(self, func_packet, out, args, kwargs):
        if func_packet in self.flop_registry:
            flop_count_func = self.flop_registry[func_packet]
            flop_count = flop_count_func(*args, **kwargs, out_val=out)
            for par in set(self.mod_tracker.parents):
                self.flop_counts[par][func_packet] += flop_count

        return out


class _FlopCounterMode(TorchDispatchMode):
    def __init__(self, counter: FlopCounterMode):
        self.counter = counter

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs if kwargs else {}

        # Skip size/stride metadata queries (e.g. from subclasses with
        # non-standard dispatch_sizes_strides_policy); they perform no flops.
        if func in {torch.ops.aten.is_contiguous.default,
                    torch.ops.aten.is_contiguous.memory_format,
                    torch.ops.aten.is_strides_like_format.default,
                    torch.ops.aten.is_non_overlapping_and_dense.default,
                    torch.ops.aten.size.default,
                    torch.ops.aten.sym_size.default,
                    torch.ops.aten.stride.default,
                    torch.ops.aten.sym_stride.default,
                    torch.ops.aten.storage_offset.default,
                    torch.ops.aten.sym_storage_offset.default,
                    torch.ops.aten.numel.default,
                    torch.ops.aten.sym_numel.default,
                    torch.ops.aten.dim.default,
                    torch.ops.prim.layout.default}:
            return NotImplemented

        # If func is not in the flop registry, see if it can decompose into
        # ops that are.
        if func not in self.counter.flop_registry and func is not torch.ops.prim.device.default:
            with self:
                r = func.decompose(*args, **kwargs)
                if r is not NotImplemented:
                    return r

        # No further decomposition; execute the op and count its flops.
        out = func(*args, **kwargs)
        return self.counter._count_flops(func._overloadpacket, out, args, kwargs)
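# A minimal end-to-end usage sketch (illustrative; module and input sizes are
# assumed). ``display=False`` suppresses the printed table so the total can be
# inspected programmatically instead:
#
#     import torch
#     from torch.utils.flop_counter import FlopCounterMode
#
#     lin = torch.nn.Linear(512, 256)
#     x = torch.randn(4, 512)
#     with FlopCounterMode(display=False) as counter:
#         lin(x).sum().backward()
#     print(counter.get_total_flops())  # forward + backward matmul flops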