import torch
from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
from .module_tracker import ModuleTracker
from typing import List, Any, Dict, Optional, Union, Tuple, Iterator
from collections import defaultdict
from torch.utils._python_dispatch import TorchDispatchMode
from math import prod
from functools import wraps
import warnings

__all__ = ["FlopCounterMode", "register_flop_formula"]

aten = torch.ops.aten
isinstancetorchTensorshape)i r   V/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/torch/utils/flop_counter.py	get_shape   s   r   flop_registryc                    s   t  d d fdd
}|S )N)out_valc                    s(   t t||| f\}}} |d|i|S )N	out_shape)r   r   )r    argskwargsr!   fr   r   nf   s   zshape_wrapper.<locals>.nfr   r%   r&   r   r$   r   shape_wrapper   s   r(   Fc                    s    fdd}|S )Nc                    s,   st    fdd}tjj|  S )Nc                    sH   t | tjjstd|  dt|  | tv rtd|   t| < d S )Nzlregister_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), got z which is of type zduplicate registrations for )r   r   _opsOpOverloadPacket
def register_flop_formula(targets, get_raw=False):
    def register_fun(flop_formula):
        if not get_raw:
            flop_formula = shape_wrapper(flop_formula)

        def register(target):
            if not isinstance(target, torch._ops.OpOverloadPacket):
                raise ValueError(
                    f"register_flop_formula(targets): expected each target to be "
                    f"OpOverloadPacket (i.e. torch.ops.mylib.foo), got "
                    f"{target} which is of type {type(target)}")
            if target in flop_registry:
                raise RuntimeError(f"duplicate registrations for {target}")
            flop_registry[target] = flop_formula

        # To handle allowing multiple aten ops at once
        torch.utils._pytree.tree_map_(register, targets)

        return flop_formula

    return register_fun

@register_flop_formula(aten.mm)
def mm_flop(a_shape, b_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for matmul."""
    # Inputs contain the shapes of two matrices.
    m, k = a_shape
    k2, n = b_shape
    assert k == k2
    return m * n * 2 * k
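# A hedged usage sketch (not part of the upstream module): registering a formula
# for a custom op. ``torch.ops.mylib.numel_linear`` is a hypothetical name; any
# OpOverloadPacket works. The decorated formula receives shapes because
# ``shape_wrapper`` is applied by default (pass ``get_raw=True`` to receive the
# raw tensors instead).
#
#     @register_flop_formula(torch.ops.mylib.numel_linear)
#     def numel_linear_flop(x_shape, w_shape, *args, out_shape=None, **kwargs) -> int:
#         # One multiply and one add per (input element, output feature) pair.
#         return prod(x_shape) * w_shape[0] * 2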
   t ||S )zCount flops for addmm.)rA   
self_shaper;   r<   r!   r#   r   r   r   
addmm_flopB   s   
rE   c                 K   sD   | \}}}|\}}}	||ksJ ||ksJ || |	 d | }
|
S )z"Count flops for the bmm operation.r:   r   )r;   r<   r!   r#   br=   r>   b2r?   r@   flopr   r   r   bmm_flopG   s   

rI   c                 K   rB   )z&Count flops for the baddbmm operation.rI   rC   r   r   r   baddbmm_flopT   s   
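# A worked example of the convention above (illustrative, with assumed shapes):
# multiplying a (256, 512) matrix by a (512, 128) matrix performs 256 * 128
# output dot products of length 512, each counted as 2 * 512 flops (one multiply
# and one add per element), so
# mm_flop((256, 512), (512, 128)) == 256 * 128 * 2 * 512 == 33_554_432.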
def conv_flop_count(
    x_shape: List[int],
    w_shape: List[int],
    out_shape: List[int],
    transposed: bool = False,
) -> int:
    """Count flops for convolution.

    Note that only multiplications are counted; computation for bias is ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).

    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    """
    batch_size = x_shape[0]
    conv_shape = (x_shape if transposed else out_shape)[2:]
    c_out, c_in, *filter_size = w_shape

    # For a regular conv, each output spatial location convolves the filter with
    # an input patch (prod(conv_shape) * prod(filter_size) multiply-adds), scaled
    # by the batch size and the input/output channel cross product. For a
    # transposed conv, the input spatial dimensions are used instead.
    flop = prod(conv_shape) * prod(filter_size) * batch_size * c_out * c_in * 2
    return flop
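# A worked example (illustrative, with assumed shapes): a 3x3 convolution taking
# a (1, 3, 32, 32) input to a (1, 8, 30, 30) output uses w_shape == (8, 3, 3, 3),
# so conv_flop_count gives prod((30, 30)) * prod((3, 3)) * 1 * 8 * 3 * 2
# == 900 * 9 * 48 == 388_800 flops.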
@register_flop_formula([aten.convolution, aten._convolution])
def conv_flop(x_shape, w_shape, _bias, _stride, _padding, _dilation, transposed, *args, out_shape=None, **kwargs) -> int:
    """Count flops for convolution."""
    return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed)


@register_flop_formula(aten.convolution_backward)
def conv_backward_flop(
        grad_out_shape,
        x_shape,
        w_shape,
        _bias,
        _stride,
        _padding,
        _dilation,
        transposed,
        _output_padding,
        _groups,
        output_mask,
        out_shape) -> int:

    def t(shape):
        # Swap the first two (channel) dimensions; used for the grad_weight pass.
        return [shape[1], shape[0]] + list(shape[2:])

    flop_count = 0

    # grad_input is a convolution of grad_out with the filter, with the
    # transposedness flipped; grad_weight correlates the input with grad_out,
    # with the channel dimensions swapped via ``t``.
    if output_mask[0]:
        grad_input_shape = get_shape(out_shape[0])
        flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not transposed)

    if output_mask[1]:
        grad_weight_shape = get_shape(out_shape[1])
        if transposed:
            flop_count += conv_flop_count(t(grad_out_shape), t(x_shape), t(grad_weight_shape), transposed=False)
        else:
            flop_count += conv_flop_count(t(x_shape), t(grad_out_shape), t(grad_weight_shape), transposed=False)

    return flop_count
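# Continuing the worked example above (assumed shapes): with
# output_mask == [True, True, False] the backward pass counts one
# conv-equivalent for grad_input and one for grad_weight, 388_800 flops each
# here, i.e. 777_600 in total; the bias gradient is a pure reduction and adds
# nothing.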
|\}}}}||  kr|kr8n J ||  kr)|kr8n J ||
kr8|	|kr8||
ks:J d}|t || ||f|| ||	f7 }|t || ||	f|| |	|f7 }|S )z^
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    r   rJ   )query_shape	key_shapevalue_shaperF   hs_qd_q_b2_h2s_k_d2_b3_h3_s3d_vtotal_flopsr   r   r   sdpa_flop_count   s   P""rt   c                O   s   t | ||S )Count flops for self-attention.rt   )re   rf   rg   r!   r"   r#   r   r   r   	sdpa_flop  s   rw   c                 C   sF   ddl m} ddlm} t| ||fs|   S |g| dd  S )z
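# A worked example (illustrative, with assumed shapes): for b=1, h=8,
# s_q == s_k == 1024 and d_q == d_v == 64, each of the two batched matmuls above
# costs 8 * 1024 * 1024 * 2 * 64 == 1_073_741_824 flops, so sdpa_flop_count
# returns about 2.15 GFLOP for one forward attention call.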
def _offsets_to_lengths(offsets, max_len):
    """
    If the offsets tensor is fake, then we don't know the actual lengths.
    In that case, we can just assume the worst case; each batch has max length.
    """
    from torch._subclasses.fake_tensor import FakeTensor
    from torch._subclasses.functional_tensor import FunctionalTensor
    if not isinstance(offsets, (FakeTensor, FunctionalTensor)):
        return offsets.diff().tolist()
    return [max_len] * (offsets.size(0) - 1)


def _unpack_flash_attention_nested_shapes(
    *,
    query,
    key,
    value,
    grad_out=None,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
) -> Iterator[Tuple[Tuple[int, ...], Tuple[int, ...], Tuple[int, ...], Optional[Tuple[int, ...]]]]:
    """
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    """
    if cum_seq_q is not None:
        # We are dealing with a jagged (NestedTensor) query: the inputs are packed
        # as (sum(sequence len), heads, dimension) rather than the dense
        # (batch, heads, sequence len, dimension), so each batch element is
        # yielded separately. For fake tensors the lengths are a worst-case
        # (max length) overestimate.
        assert len(key.shape) == 3
        assert len(value.shape) == 3
        assert grad_out is None or grad_out.shape == query.shape
        _, h_q, d_q = query.shape
        _, h_k, d_k = key.shape
        _, h_v, d_v = value.shape
        assert cum_seq_q is not None
        assert cum_seq_k is not None
        assert cum_seq_q.shape == cum_seq_k.shape
        seq_q_lengths = _offsets_to_lengths(cum_seq_q, max_q)
        seq_k_lengths = _offsets_to_lengths(cum_seq_k, max_k)
        for seq_q_len, seq_k_len in zip(seq_q_lengths, seq_k_lengths):
            new_query_shape = (1, h_q, seq_q_len, d_q)
            new_key_shape = (1, h_k, seq_k_len, d_k)
            new_value_shape = (1, h_v, seq_k_len, d_v)
            new_grad_out_shape = new_query_shape if grad_out is not None else None
            yield new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape
        return

    yield query.shape, key.shape, value.shape, grad_out.shape if grad_out is not None else None


def _unpack_efficient_attention_nested_shapes(
    *,
    query,
    key,
    value,
    grad_out=None,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
) -> Iterator[Tuple[Tuple[int, ...], Tuple[int, ...], Tuple[int, ...], Optional[Tuple[int, ...]]]]:
    """
    Given inputs to an efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    """
    if cu_seqlens_q is not None:
        # Unlike the flash kernel, the efficient kernel packs jagged inputs as
        # (1, sum(sequence len), heads, dimension); the per-batch shapes below
        # are again a worst-case overestimate for fake tensors.
        assert len(key.shape) == 4
        assert len(value.shape) == 4
        assert grad_out is None or grad_out.shape == query.shape
        _, _, h_q, d_q = query.shape
        _, _, h_k, d_k = key.shape
        _, _, h_v, d_v = value.shape
        assert cu_seqlens_q is not None
        assert cu_seqlens_k is not None
        assert cu_seqlens_q.shape == cu_seqlens_k.shape
        seqlens_q = _offsets_to_lengths(cu_seqlens_q, max_seqlen_q)
        seqlens_k = _offsets_to_lengths(cu_seqlens_k, max_seqlen_k)
        for len_q, len_k in zip(seqlens_q, seqlens_k):
            new_query_shape = (1, h_q, len_q, d_q)
            new_key_shape = (1, h_k, len_k, d_k)
            new_value_shape = (1, h_v, len_k, d_v)
            new_grad_out_shape = new_query_shape if grad_out is not None else None
            yield new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape
        return

    yield query.shape, key.shape, value.shape, grad_out.shape if grad_out is not None else None
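# A small sketch of the jagged-offsets convention handled above (hypothetical
# values): cum_seq_q == tensor([0, 3, 7, 9]) describes three batch elements with
# sequence lengths offsets.diff() == [3, 4, 2]; for fake tensors the lengths are
# unknown and every element is assumed to have max_q (or max_seqlen_q) instead.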
@register_flop_formula(aten._flash_attention_forward, get_raw=True)
def _flash_attention_forward_flop(
    query,
    key,
    value,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
    *args,
    out_shape=None,
    **kwargs
) -> int:
    """Count flops for self-attention."""
    # NB: We aren't accounting for causal attention here
    # In case this is a nested tensor, unpack the individual batch elements
    # and sum the flops per batch element.
    sizes = _unpack_flash_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        cum_seq_q=cum_seq_q,
        cum_seq_k=cum_seq_k,
        max_q=max_q,
        max_k=max_k,
    )
    return sum(
        sdpa_flop_count(query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, _ in sizes
    )
tdd |
D S )ru   )r   r   r   r   r   r   r   c                 s   r   r   rv   r   r   r   r   r     r   z4_efficient_attention_forward_flop.<locals>.<genexpr>r   r   )r   r   r   biasr   r   r   r   r"   r#   r   r   r   r   !_efficient_attention_forward_flop  r   r   c                 C   sV  d}|\}}}}|\}	}
}}|\}}}}| \}}}}||	  kr)|  kr)|krBn J ||
  kr;|  kr;|krBn J ||ksDJ ||krP||krP||ksRJ d}|t || ||f|| ||f7 }|t || ||f|| ||f7 }|t || ||f|| ||f7 }|t || ||f|| ||f7 }|t || ||f|| ||f7 }|S )Nr   rJ   )r]   re   rf   rg   rs   rF   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   _b4_h4_s4_d4r   r   r   sdpa_backward_flop_count  s   T"""""r   c                O   s   t | |||S )z(Count flops for self-attention backward.r   )r]   re   rf   rg   r!   r"   r#   r   r   r   sdpa_backward_flop  s   r   c
              
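# Consistency note (derived from the formulas above): with s_q == s_k and
# d_q == d_v, the backward pass performs five score-sized batched matmuls versus
# two in the forward pass, so counted attention backward flops come out to 2.5x
# the forward flops.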
@register_flop_formula(aten._flash_attention_backward, get_raw=True)
def _flash_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    out,
    logsumexp,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
    *args,
    **kwargs,
) -> int:
    shapes = _unpack_flash_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cum_seq_q=cum_seq_q,
        cum_seq_k=cum_seq_k,
        max_q=max_q,
        max_k=max_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )
@register_flop_formula(aten._efficient_attention_backward, get_raw=True)
def _efficient_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    bias,
    out,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    *args,
    **kwargs,
) -> int:
    shapes = _unpack_efficient_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )


flop_registry = {
    aten.mm: mm_flop,
    aten.addmm: addmm_flop,
    aten.bmm: bmm_flop,
    aten.baddbmm: baddbmm_flop,
    aten.convolution: conv_flop,
    aten._convolution: conv_flop,
    aten.convolution_backward: conv_backward_flop,
    aten._scaled_dot_product_efficient_attention: sdpa_flop,
    aten._scaled_dot_product_flash_attention: sdpa_flop,
    aten._scaled_dot_product_cudnn_attention: sdpa_flop,
    aten._scaled_dot_product_efficient_attention_backward: sdpa_backward_flop,
    aten._scaled_dot_product_flash_attention_backward: sdpa_backward_flop,
    aten._scaled_dot_product_cudnn_attention_backward: sdpa_backward_flop,
    aten._flash_attention_forward: _flash_attention_forward_flop,
    aten._efficient_attention_forward: _efficient_attention_forward_flop,
    aten._flash_attention_backward: _flash_attention_backward_flop,
    aten._efficient_attention_backward: _efficient_attention_backward_flop,
}


def normalize_tuple(x):
    if not isinstance(x, tuple):
        return (x,)
    return x


# Define the suffixes for different orders of magnitude of flops
suffixes = ["", "K", "M", "B", "T"]

def get_suffix_str(number):
    # Find the index of the appropriate suffix based on the number of digits,
    # with some overflow: e.g. 1.01B is displayed as 1001M rather than 1.001B.
    index = max(0, min(len(suffixes) - 1, (len(str(number)) - 2) // 3))
    return suffixes[index]

def convert_num_with_suffix(number, suffix):
    index = suffixes.index(suffix)
    # Divide the number by 1000^index and format it to three decimal places
    value = f"{number / 1000 ** index:.3f}"
    # Return the value and the suffix as a string
    return value + suffixes[index]

def convert_to_percent_str(num, denom):
    if denom == 0:
        return "0%"
    return f"{num / denom:.2%}"

def _pytreeify_preserve_structure(f):
    @wraps(f)
    def nf(args):
        flat_args, spec = tree_flatten(args)
        out = f(*flat_args)
        return tree_unflatten(out, spec)

    return nf
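# A quick illustration of the display helpers above (assumed values): a total of
# 2_147_483_648 flops has 10 digits, so get_suffix_str picks index
# (10 - 2) // 3 == 2, i.e. "M", and
# convert_num_with_suffix(2_147_483_648, "M") == "2147.484M".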
dedeeeef  f fd	d
Zde
fddZdeeeee
f f fddZdddZdd Zdd Zdd Z  ZS )r   a  
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    """

    def __init__(
            self,
            mods: Optional[Union[torch.nn.Module, List[torch.nn.Module]]] = None,
            depth: int = 2,
            display: bool = True,
            custom_mapping: Optional[Dict[Any, Any]] = None):
        super().__init__()
        self.flop_counts: Dict[str, Dict[Any, int]] = defaultdict(lambda: defaultdict(int))
        self.depth = depth
        self.display = display
        self.mode: Optional[_FlopCounterMode] = None
        if custom_mapping is None:
            custom_mapping = {}
        if mods is not None:
            warnings.warn("mods argument is not needed anymore, you can stop passing it", stacklevel=2)
        self.flop_registry = {
            **flop_registry,
            **{k: v if getattr(v, "_get_raw", False) else shape_wrapper(v) for k, v in custom_mapping.items()}
        }
        self.mod_tracker = ModuleTracker()

    def get_total_flops(self) -> int:
        return sum(self.flop_counts['Global'].values())

    def get_flop_counts(self) -> Dict[str, Dict[Any, int]]:
        """Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        """
        return {k: dict(v) for k, v in self.flop_counts.items()}

    def get_table(self, depth=None):
        if depth is None:
            depth = self.depth
        if depth is None:
            depth = 999999

        import tabulate
        tabulate.PRESERVE_WHITESPACE = True
        header = ["Module", "FLOP", "% Total"]
        values = []
        global_flops = self.get_total_flops()
        global_suffix = get_suffix_str(global_flops)
        is_global_subsumed = False

        def process_mod(mod_name, depth):
            nonlocal is_global_subsumed

            total_flops = sum(self.flop_counts[mod_name].values())

            is_global_subsumed |= total_flops >= global_flops

            padding = " " * depth
            values = []
            values.append([
                padding + mod_name,
                convert_num_with_suffix(total_flops, global_suffix),
                convert_to_percent_str(total_flops, global_flops)
            ])
            for k, v in self.flop_counts[mod_name].items():
                values.append([
                    padding + " - " + str(k),
                    convert_num_with_suffix(v, global_suffix),
                    convert_to_percent_str(v, global_flops)
                ])
            return values

        for mod in sorted(self.flop_counts.keys()):
            if mod == 'Global':
                continue
            mod_depth = mod.count(".") + 1
            if mod_depth > depth:
                continue

            cur_values = process_mod(mod, mod_depth - 1)
            values.extend(cur_values)

        # Only output the "Global" row if it contains flops that are not already
        # fully accounted for by some module.
        if 'Global' in self.flop_counts and not is_global_subsumed:
            for value in values:
                value[0] = " " + value[0]

            values = process_mod('Global', 0) + values

        if len(values) == 0:
            values = [["Global", "0", "0%"]]

        return tabulate.tabulate(values, headers=header, colalign=("left", "right", "right"))

    def __enter__(self):
        self.flop_counts.clear()
        self.mod_tracker.__enter__()
        self.mode = _FlopCounterMode(self)
        self.mode.__enter__()
        return self

    def __exit__(self, *args):
        assert self.mode is not None
        b = self.mode.__exit__(*args)
        self.mode = None  # break cycles
        self.mod_tracker.__exit__()
        if self.display:
            print(self.get_table(self.depth))
        return b

    def _count_flops(self, func_packet, out, args, kwargs):
        if func_packet in self.flop_registry:
            flop_count_func = self.flop_registry[func_packet]
            flop_count = flop_count_func(*args, **kwargs, out_val=out)
            for par in set(self.mod_tracker.parents):
                self.flop_counts[par][func_packet] += flop_count

        return out


class _FlopCounterMode(TorchDispatchMode):
    def __init__(self, counter: FlopCounterMode):
        self.counter = counter

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs if kwargs else {}

        # Skip size/stride metadata queries (e.g. from subclasses with
        # non-standard dispatch_sizes_strides_policy); they perform no flops.
        if func in {torch.ops.aten.is_contiguous.default,
                    torch.ops.aten.is_contiguous.memory_format,
                    torch.ops.aten.is_strides_like_format.default,
                    torch.ops.aten.is_non_overlapping_and_dense.default,
                    torch.ops.aten.size.default,
                    torch.ops.aten.sym_size.default,
                    torch.ops.aten.stride.default,
                    torch.ops.aten.sym_stride.default,
                    torch.ops.aten.storage_offset.default,
                    torch.ops.aten.sym_storage_offset.default,
                    torch.ops.aten.numel.default,
                    torch.ops.aten.sym_numel.default,
                    torch.ops.aten.dim.default,
                    torch.ops.prim.layout.default}:
            return NotImplemented

        # If func is not in the flop registry, see if it can decompose into
        # ops that are.
        if func not in self.counter.flop_registry and func is not torch.ops.prim.device.default:
            with self:
                r = func.decompose(*args, **kwargs)
                if r is not NotImplemented:
                    return r

        # No further decomposition; execute the op and count its flops.
        out = func(*args, **kwargs)
        return self.counter._count_flops(func._overloadpacket, out, args, kwargs)
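# A minimal end-to-end usage sketch (illustrative; module and input sizes are
# assumed). ``display=False`` suppresses the printed table so the total can be
# inspected programmatically instead:
#
#     import torch
#     from torch.utils.flop_counter import FlopCounterMode
#
#     lin = torch.nn.Linear(512, 256)
#     x = torch.randn(4, 512)
#     with FlopCounterMode(display=False) as counter:
#         lin(x).sum().backward()
#     print(counter.get_total_flops())  # forward + backward matmul flops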