import torch
from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
from .module_tracker import ModuleTracker
from typing import List, Any, Dict, Optional, Union, Tuple, Iterator
from collections import defaultdict
from torch.utils._python_dispatch import TorchDispatchMode
from torch._decomp import register_decomposition
from math import prod
from functools import wraps
import warnings

__all__ = ["FlopCounterMode", "register_flop_formula"]

aten = torch.ops.aten


def get_shape(i):
    # Tensors are reduced to their shapes; any other value passes through.
    if isinstance(i, torch.Tensor):
        return i.shape
    return i


flop_registry: Dict[Any, Any] = {}

def shape_wrapper(f):
    """Wrap a flop formula so it receives shapes instead of live tensors."""
    @wraps(f)
    def nf(*args, out_val=None, **kwargs):
        args, kwargs, out_shape = tree_map(get_shape, (args, kwargs, out_val))
        return f(*args, out_shape=out_shape, **kwargs)
    return nf


def register_flop_formula(targets, get_raw=False):
    """Register a flop formula for an aten operator (or a list of operators)."""
    def register_fun(flop_formula):
        if not get_raw:
            flop_formula = shape_wrapper(flop_formula)
        register_decomposition(targets, registry=flop_registry, unsafe=True)(flop_formula)
        return flop_formula

    return register_fun


@register_flop_formula(aten.mm)
def mm_flop(a_shape, b_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for matmul."""
    # Inputs contain the shapes of two matrices.
    m, k = a_shape
    k2, n = b_shape
    assert k == k2
    # NB: Should technically be 2 * k - 1 for exact FLOPs (k muls, k - 1 adds).
    return m * n * 2 * k
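# Illustrative sanity check of the formula above (our addition, not part of
# the upstream file): an (8, 16) @ (16, 4) matmul multiplies and accumulates
# across k = 16 for each of the 8 * 4 output elements.
# assert mm_flop((8, 16), (16, 4)) == 8 * 4 * 2 * 16  # == 1024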
   t ||S )zCount flops for addmm.)r9   
self_shaper3   r4   r"   r$   r   r   r   
addmm_flop5   s   
r=   c                 K   sD   | \}}}|\}}}	||ksJ ||ksJ || |	 d | }
|
S )z"Count flops for the bmm operation.r2   r   )r3   r4   r"   r$   br5   r6   b2r7   r8   flopr   r   r   bmm_flop:   s   

rA   c                 K   r:   )z&Count flops for the baddbmm operation.rA   r;   r   r   r   baddbmm_flopG   s   
rC   x_shapew_shaper"   
transposedc           
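# Illustrative check of the batched formula above (our addition, not part of
# the upstream file): a (4, 8, 16) @ (4, 16, 32) bmm repeats an (8, 16) x
# (16, 32) matmul over 4 batch elements.
# assert bmm_flop((4, 8, 16), (4, 16, 32)) == 4 * 8 * 32 * 2 * 16  # == 32768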
def conv_flop_count(
    x_shape: List[int],
    w_shape: List[int],
    out_shape: List[int],
    transposed: bool = False,
) -> int:
    """Count flops for convolution.

    Note that only multiplications are counted; computation for bias is
    ignored. Flops for a transposed convolution are calculated over the input
    spatial positions rather than the output ones, i.e. with prod(x_shape[2:])
    in place of prod(out_shape[2:]) in the formula below.
    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    """
    batch_size = x_shape[0]
    conv_shape = (x_shape if transposed else out_shape)[2:]
    c_out, c_in, *filter_size = w_shape

    # For a regular conv, each output spatial position convolves the filter
    # with an input patch (prod(conv_shape) * prod(filter_size) ops), repeated
    # over the batch and over the c_out x c_in channel pairs. For a transposed
    # conv, the same holds over the *input* spatial positions instead.
    # NB: Should technically be 2 * c_in - 1 for exact FLOPs (muls + adds).
    flop = prod(conv_shape) * prod(filter_size) * batch_size * c_out * c_in * 2
    return flop


@register_flop_formula([aten.convolution, aten._convolution])
def conv_flop(x_shape, w_shape, _bias, _stride, _padding, _dilation, transposed, *args, out_shape=None, **kwargs) -> int:
    """Count flops for convolution."""
    return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed)
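# Illustrative check of conv_flop_count (our addition, not part of the
# upstream file; the shapes are arbitrary): a 3x3 conv taking a (1, 2, 8, 8)
# input to a (1, 4, 8, 8) output with weight shape (4, 2, 3, 3) costs
# prod((8, 8)) * prod((3, 3)) * 1 * 4 * 2 * 2 flops.
# assert conv_flop_count([1, 2, 8, 8], [4, 2, 3, 3], [1, 4, 8, 8]) == 9216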
@register_flop_formula(aten.convolution_backward)
def conv_backward_flop(
        grad_out_shape,
        x_shape,
        w_shape,
        _bias,
        _stride,
        _padding,
        _dilation,
        transposed,
        _output_padding,
        _groups,
        output_mask,
        out_shape) -> int:
    """Count flops for convolution backward."""

    def t(shape):
        # Swap the first two (channel) dimensions of a shape.
        return [shape[1], shape[0]] + list(shape[2:])

    flop_count = 0

    # grad_input is itself a convolution of grad_out with the weights, with
    # the "transposedness" flipped relative to the forward pass.
    if output_mask[0]:
        grad_input_shape = get_shape(out_shape[0])
        flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not transposed)

    # grad_weight is a convolution between the input and grad_out (with the
    # channel dimensions swapped); it is never a transposed convolution.
    if output_mask[1]:
        grad_weight_shape = get_shape(out_shape[1])
        if transposed:
            flop_count += conv_flop_count(t(grad_out_shape), t(x_shape), t(grad_weight_shape), transposed=False)
        else:
            flop_count += conv_flop_count(t(x_shape), t(grad_out_shape), t(grad_weight_shape), transposed=False)

    return flop_count


def sdpa_flop_count(query_shape, key_shape, value_shape):
    """
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    """
    b, h, s_q, d_q = query_shape
    _b2, _h2, s_k, _d2 = key_shape
    _b3, _h3, _s3, d_v = value_shape
    assert b == _b2 == _b3 and h == _h2 == _h3 and d_q == _d2 and s_k == _s3
    total_flops = 0
    # q @ k.T: [b*h, s_q, d_q] @ [b*h, d_q, s_k] -> scores [b*h, s_q, s_k]
    total_flops += bmm_flop((b * h, s_q, d_q), (b * h, d_q, s_k))
    # scores @ v: [b*h, s_q, s_k] @ [b*h, s_k, d_v] -> out [b*h, s_q, d_v]
    total_flops += bmm_flop((b * h, s_q, s_k), (b * h, s_k, d_v))
    return total_flops


@register_flop_formula([aten._scaled_dot_product_efficient_attention, aten._scaled_dot_product_flash_attention])
def sdpa_flop(query_shape, key_shape, value_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for self-attention."""
    # NB: We aren't accounting for causal attention here.
    return sdpa_flop_count(query_shape, key_shape, value_shape)


def _unpack_flash_attention_nested_shapes(
    *,
    query,
    key,
    value,
    grad_out=None,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
) -> Iterator[Tuple[Tuple[int, ...], Tuple[int, ...], Tuple[int, ...], Optional[Tuple[int, ...]]]]:
    """
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    """
    if cum_seq_q is not None:
        # This is a NestedTensor (jagged) kernel: the inputs are packed as
        # (sum(sequence lengths), heads, dim), and the cumulative sequence
        # length tensors mark where each batch element begins and ends.
        assert len(key.shape) == 3
        assert len(value.shape) == 3
        assert grad_out is None or grad_out.shape == query.shape
        _, h_q, d_q = query.shape
        _, h_k, d_k = key.shape
        _, h_v, d_v = value.shape
        assert cum_seq_q is not None
        assert cum_seq_k is not None
        assert cum_seq_q.shape == cum_seq_k.shape
        seq_q_lengths = (cum_seq_q[1:] - cum_seq_q[:-1]).tolist()
        seq_k_lengths = (cum_seq_k[1:] - cum_seq_k[:-1]).tolist()
        for seq_q_len, seq_k_len in zip(seq_q_lengths, seq_k_lengths):
            new_query_shape = (1, h_q, seq_q_len, d_q)
            new_key_shape = (1, h_k, seq_k_len, d_k)
            new_value_shape = (1, h_v, seq_k_len, d_v)
            new_grad_out_shape = new_query_shape if grad_out is not None else None
            yield new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape
        return

    yield query.shape, key.shape, value.shape, grad_out.shape if grad_out is not None else None


def _unpack_efficient_attention_nested_shapes(
    *,
    query,
    key,
    value,
    grad_out=None,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
) -> Iterator[Tuple[Tuple[int, ...], Tuple[int, ...], Tuple[int, ...], Optional[Tuple[int, ...]]]]:
    """
    Given inputs to an efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    """
    if cu_seqlens_q is not None:
        # Unlike the flash kernel, the efficient kernel packs jagged inputs as
        # (1, sum(sequence lengths), heads, dim), so the shapes are 4-D here.
        assert len(key.shape) == 4
        assert len(value.shape) == 4
        assert grad_out is None or grad_out.shape == query.shape
        _, _, h_q, d_q = query.shape
        _, _, h_k, d_k = key.shape
        _, _, h_v, d_v = value.shape
        assert cu_seqlens_q is not None
        assert cu_seqlens_k is not None
        assert cu_seqlens_q.shape == cu_seqlens_k.shape
        seqlens_q = (cu_seqlens_q[1:] - cu_seqlens_q[:-1]).tolist()
        seqlens_k = (cu_seqlens_k[1:] - cu_seqlens_k[:-1]).tolist()
        for len_q, len_k in zip(seqlens_q, seqlens_k):
            new_query_shape = (1, h_q, len_q, d_q)
            new_key_shape = (1, h_k, len_k, d_k)
            new_value_shape = (1, h_v, len_k, d_v)
            new_grad_out_shape = new_query_shape if grad_out is not None else None
            yield new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape
        return

    yield query.shape, key.shape, value.shape, grad_out.shape if grad_out is not None else None


@register_flop_formula(aten._flash_attention_forward, get_raw=True)
def _flash_attention_forward_flop(
    query,
    key,
    value,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
    *args,
    out_shape=None,
    **kwargs
) -> int:
    """Count flops for self-attention."""
    # NB: We aren't accounting for causal attention here.
    # In case this is a nested tensor, we unpack the individual batch elements
    # and then sum the flops per batch element.
    sizes = _unpack_flash_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        cum_seq_q=cum_seq_q,
        cum_seq_k=cum_seq_k,
        max_q=max_q,
        max_k=max_k,
    )
    return sum(
        sdpa_flop_count(query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, _ in sizes
    )


@register_flop_formula(aten._efficient_attention_forward, get_raw=True)
def _efficient_attention_forward_flop(
    query,
    key,
    value,
    bias,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    *args,
    **kwargs
) -> int:
    """Count flops for self-attention."""
    # NB: We aren't accounting for causal attention here.
    sizes = _unpack_efficient_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
    )
    return sum(
        sdpa_flop_count(query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, _ in sizes
    )


def sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape):
    total_flops = 0
    b, h, s_q, d_q = query_shape
    _b2, _h2, s_k, _d2 = key_shape
    _b3, _h3, _s3, d_v = value_shape
    _b4, _h4, _s4, _d4 = grad_out_shape
    assert b == _b2 == _b3 == _b4 and h == _h2 == _h3 == _h4 and d_q == _d2
    assert d_v == _d4 and s_k == _s3 and s_q == _s4
    # Step 1: recompute the scores matrix, q @ k.T.
    total_flops += bmm_flop((b * h, s_q, d_q), (b * h, d_q, s_k))
    # Step 2: propagate gradients through scores @ v.
    # grad_out @ v.T -> grad_scores
    total_flops += bmm_flop((b * h, s_q, d_v), (b * h, d_v, s_k))
    # scores.T @ grad_out -> grad_v
    total_flops += bmm_flop((b * h, s_k, s_q), (b * h, s_q, d_v))
    # Step 3: propagate gradients through q @ k.T.
    # grad_scores @ k -> grad_q
    total_flops += bmm_flop((b * h, s_q, s_k), (b * h, s_k, d_q))
    # q.T @ grad_scores -> grad_k
    total_flops += bmm_flop((b * h, d_q, s_q), (b * h, s_q, s_k))
    return total_flops


@register_flop_formula([aten._scaled_dot_product_efficient_attention_backward, aten._scaled_dot_product_flash_attention_backward])
def sdpa_backward_flop(grad_out_shape, query_shape, key_shape, value_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for self-attention backward."""
    return sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)


@register_flop_formula(aten._flash_attention_backward, get_raw=True)
def _flash_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    out,
    logsumexp,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
    *args,
    **kwargs,
) -> int:
    # In case this is a nested tensor, we unpack the individual batch elements
    # and then sum the flops per batch element.
    shapes = _unpack_flash_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cum_seq_q=cum_seq_q,
        cum_seq_k=cum_seq_k,
        max_q=max_q,
        max_k=max_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )


@register_flop_formula(aten._efficient_attention_backward, get_raw=True)
def _efficient_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    bias,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    *args,
    **kwargs,
) -> int:
    shapes = _unpack_efficient_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )


flop_registry = {
    aten.mm: mm_flop,
    aten.addmm: addmm_flop,
    aten.bmm: bmm_flop,
    aten.baddbmm: baddbmm_flop,
    aten.convolution: conv_flop,
    aten._convolution: conv_flop,
    aten.convolution_backward: conv_backward_flop,
    aten._scaled_dot_product_efficient_attention: sdpa_flop,
    aten._scaled_dot_product_flash_attention: sdpa_flop,
    aten._scaled_dot_product_efficient_attention_backward: sdpa_backward_flop,
    aten._scaled_dot_product_flash_attention_backward: sdpa_backward_flop,
    aten._flash_attention_forward: _flash_attention_forward_flop,
    aten._efficient_attention_forward: _efficient_attention_forward_flop,
    aten._flash_attention_backward: _flash_attention_backward_flop,
    aten._efficient_attention_backward: _efficient_attention_backward_flop,
}


def normalize_tuple(x):
    if not isinstance(x, tuple):
        return (x,)
    return x


# Suffixes for successive factors of 1000 when displaying flop counts.
suffixes = ["", "K", "M", "B", "T"]


def get_suffix_str(number):
    # Find the index of the appropriate suffix based on the number of digits,
    # with some overflow: e.g. 1.01B is displayed as 1010M rather than 1.01B.
    index = max(0, min(len(suffixes) - 1, (len(str(number)) - 2) // 3))
    return suffixes[index]


def convert_num_with_suffix(number, suffix):
    index = suffixes.index(suffix)
    # Divide the number by 1000**index and format it to three decimal places.
    value = f"{number / 1000 ** index:.3f}"
    return value + suffixes[index]


def convert_to_percent_str(num, denom):
    if denom == 0:
        return "0%"
    return f"{num / denom:.2%}"


def _pytreeify_preserve_structure(f):
    @wraps(f)
    def nf(args):
        flat_args, spec = tree_flatten(args)
        out = f(*flat_args)
        return tree_unflatten(out, spec)

    return nf
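# Illustrative behavior of the formatting helpers above (our addition, not
# part of the upstream file): a count of 123_456_789 gets the "M" suffix and
# renders with three decimal places.
# assert get_suffix_str(123_456_789) == "M"
# assert convert_num_with_suffix(123_456_789, "M") == "123.457M"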
class FlopCounterMode(TorchDispatchMode):
    """
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    """

    def __init__(
            self,
            mods: Optional[Union[torch.nn.Module, List[torch.nn.Module]]] = None,
            depth: int = 2,
            display: bool = True,
            custom_mapping: Optional[Dict[Any, Any]] = None):
        self.flop_counts: Dict[str, Dict[Any, int]] = defaultdict(lambda: defaultdict(int))
        self.depth = depth
        self.display = display
        if custom_mapping is None:
            custom_mapping = {}
        if mods is not None:
            warnings.warn("mods argument is not needed anymore, you can stop passing it", stacklevel=2)
        # Custom formulas are shape-wrapped unless they opt out via `_get_raw`.
        self.flop_registry = {
            **flop_registry,
            **{k: v if getattr(v, "_get_raw", False) else shape_wrapper(v) for k, v in custom_mapping.items()}
        }
        self.mod_tracker = ModuleTracker()

    def get_total_flops(self) -> int:
        return sum(self.flop_counts['Global'].values())

    def get_flop_counts(self) -> Dict[str, Dict[Any, int]]:
        """Return the flop counts as a dictionary of dictionaries.

        The outer dictionary is keyed by module name, and the inner
        dictionary is keyed by operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        """
        return {k: dict(v) for k, v in self.flop_counts.items()}

    def get_table(self, depth=None):
        if depth is None:
            depth = self.depth
        if depth is None:
            depth = 999999

        import tabulate
        tabulate.PRESERVE_WHITESPACE = True
        header = ["Module", "FLOP", "% Total"]
        values = []
        global_flops = self.get_total_flops()
        global_suffix = get_suffix_str(global_flops)
        is_global_subsumed = False

        def process_mod(mod_name, depth):
            nonlocal is_global_subsumed

            total_flops = sum(self.flop_counts[mod_name].values())

            is_global_subsumed |= total_flops >= global_flops

            padding = " " * depth
            values = []
            values.append([
                padding + mod_name,
                convert_num_with_suffix(total_flops, global_suffix),
                convert_to_percent_str(total_flops, global_flops)
            ])
            for k, v in self.flop_counts[mod_name].items():
                values.append([
                    padding + " - " + str(k),
                    convert_num_with_suffix(v, global_suffix),
                    convert_to_percent_str(v, global_flops)
                ])
            return values

        for mod in sorted(self.flop_counts.keys()):
            if mod == 'Global':
                continue
            mod_depth = mod.count(".") + 1
            if mod_depth > depth:
                continue

            cur_values = process_mod(mod, mod_depth - 1)
            values.extend(cur_values)

        # Only output the "Global" row if it contains flops that aren't
        # already fully attributed to some module.
        if 'Global' in self.flop_counts and not is_global_subsumed:
            for idx, value in enumerate(values):
                values[idx][0] = " " + values[idx][0]

            values = process_mod('Global', 0) + values

        if len(values) == 0:
            values = [["Global", "0", "0%"]]

        return tabulate.tabulate(values, headers=header, colalign=("left", "right", "right"))

    def __enter__(self):
        self.flop_counts.clear()
        self.mod_tracker.__enter__()
        super().__enter__()
        return self

    def __exit__(self, *args):
        super().__exit__(*args)
        self.mod_tracker.__exit__()
        if self.display:
            print(self.get_table(self.depth))

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs if kwargs else {}
        out = func(*args, **kwargs)
        return self._count_flops(func._overloadpacket, out, args, kwargs)

    def _count_flops(self, func_packet, out, args, kwargs):
        if func_packet in self.flop_registry:
            flop_count_func = self.flop_registry[func_packet]
            flop_count = flop_count_func(*args, **kwargs, out_val=out)
            # Attribute the flops to every module currently on the call stack,
            # plus the implicit "Global" parent.
            for par in set(self.mod_tracker.parents):
                self.flop_counts[par][func_packet] += flop_count

        return out
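# Illustrative usage sketch (our addition, not part of the upstream file; the
# module and shapes below are arbitrary):
#
#     import torch.nn as nn
#
#     mod = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4))
#     with FlopCounterMode(display=False) as flop_counter:
#         mod(torch.randn(8, 16)).sum().backward()
#     total = flop_counter.get_total_flops()
#     per_module = flop_counter.get_flop_counts()
#
# Custom formulas can be supplied per operator via ``custom_mapping``, e.g.
# ``FlopCounterMode(custom_mapping={torch.ops.aten.mm: my_mm_formula})``,
# where ``my_mm_formula`` is a hypothetical function taking the same shape
# arguments as ``mm_flop``.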