o
     i"                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlZG dd dZdded	ed
efddZded
efddZdejjjd
ejjjfddZeG dd dZdS )    N)defaultdict)	dataclass)AnyDictListOptionalSequencecastc                   @   s$   e Zd ZdejjjddfddZdS )FakeKinetoEventereturnNc                 C   s6   t |D ]}|drqt| |t|| q|| _d S )N_)dir
startswithsetattrgetattr_kineto_event)selfr   attr r   V/home/ubuntu/.local/lib/python3.10/site-packages/xformers/profiler/profile_analyzer.py__init__   s
   

zFakeKinetoEvent.__init__)__name__
__module____qualname__torch_C	_autograd_KinetoEventr   r   r   r   r   r
      s    r
   BHMKcausalfmtr   c                 C   s  t |tsJ |dv sJ |dkrdd | |fD \} }| ^ }}}|^ }}}|rjd| t|| d | dt|| d t|| d |  }	|	dt|| t|| | dt|| t|| |  d 7 }	nd| | | d| | |  }	|D ]}
|	|
9 }	q|t|	S )N)BMHKr   r"   c                 S   s(   g | ]}|d  |d |d |d gqS )r            r   ).0xr   r   r   
<listcomp>   s   ( z$_attention_flops.<locals>.<listcomp>r#   r   )
isinstanceboolmaxminint)queriesvaluesr    r!   BNKNvKvflopsbr   r   r   _attention_flops   s    >6 
r7   	arg_namesc                 G   sB   t | jjjD ]\}}|j|v r|  S qtd| d| jj )NzNo such argument z
 found in )	enumeratedefault_schema	argumentsname
ValueError)opr8   iargr   r   r   _get_arg_idx,   s
   
rB   r   c                    s  |   jdkr	| S |  }d tdd}dd tjjddi ftjjd	d|ftjjd
d|ftjjdd|ftjjddi ftjjddi ftjjdd|ftjjdd|ftjjdd|ftjjddi ff
D }|| v r|| \}}}| 	 }| 
 }z
|t|dd }	W n ty   |t|d dk}	Y nw t|t|d |t|d |	fi | |r d d   durt| }
 fdd|
_ttjjj|
} | S )z
    Adds a flops amount for operators that don't have this information in Kineto already
    This mostly applies for the attention for now, as GEMMs are already calculated by Kineto
    and other operations are negligible.
    CPUNr"   )r!   c                 S   s<   i | ]\}}}}t ||rt||j t||||fqS r   )hasattrr   r:   r=   )r&   libr?   is_bwdkwargsr   r   r   
<dictcomp>A   s    
z&_replace_if_needed.<locals>.<dictcomp>scaled_dot_product_attentionF	flash_fwd#efficient_attention_forward_cutlass_efficient_attention_forward,_scaled_dot_product_flash_attention_backwardT0_scaled_dot_product_efficient_attention_backward	flash_bwd$efficient_attention_backward_cutlass_efficient_attention_backward,_scaled_dot_product_cudnn_attention_backwardr    	is_causalcustom_mask_typer   queryvalue   r#   c                      s    S Nr   r   r5   r   r   <lambda>r   s    z$_replace_if_needed.<locals>.<lambda>)device_typer=   dictr   opsatenxformers_flashxformerskeysshapesconcrete_inputsrB   r>   r7   r
   r5   r	   r   r   r   )r   op_nameFMT_BMHKATTN_OPSr?   rF   rG   rb   rc   rS   new_er   rY   r   _replace_if_needed3   sj   
rh   c                	   @   s   e Zd ZU eejef ed< eejef ed< eed< 	ddejdededefd	d
Z	deejef defddZ
deejef defddZedeejjj deejjj fddZedeejjj dd fddZdS )AnalyzedTraceoperations_per_dtype_fwoperations_per_dtype_bwtotal_time_sTdtypefwbwr   c                 C   s4   d}|r|| j |d7 }|r|| j|d7 }|S N        )rj   getrk   )r   rm   rn   ro   r]   r   r   r   compute_num_ops}   s   zAnalyzedTrace.compute_num_opshardware_flopsc                 C   s2   d}|  D ]\}}|| || 7 }q|| j S rp   )itemsrs   rl   r   rt   hfu_secondsrm   hw_flopsr   r   r   compute_hfu   s   
zAnalyzedTrace.compute_hfuc                 C   sF   d}|  D ]\}}|td| j|dd | || 7 }q|| j S )Nrq   r%   F)ro   )ru   r,   rs   rl   rv   r   r   r   compute_mfu   s   
zAnalyzedTrace.compute_mfu
all_eventsc                 C   s   dd | D }t t}|D ]}|| | f | qg }| D ](}|jdd d d }|D ]}|d u sC| | |  krJ|}|| q1q#|S )Nc                 S   s:   g | ]}|  jd kr| s| r| dkr|qS )rC   r   )r[   r=   dtypesrb   r5   r&   r   r   r   r   r(      s    zBAnalyzedTrace._find_all_root_events_with_flops.<locals>.<listcomp>c                 S   s   |   |   fS rX   )start_nsduration_ns)r   r   r   r   rZ      s    z@AnalyzedTrace._find_all_root_events_with_flops.<locals>.<lambda>)key)	r   liststart_thread_idr[   appendr/   sortr~   r   )r{   all_ops_with_flopsevents_per_groupr   root_eventseventscurrent_rootr   r   r    _find_all_root_events_with_flops   s*   
z.AnalyzedTrace._find_all_root_events_with_flopsr   c                 C   s2  dd | D } t | }tt}tt}dd | D }dtjfdtjfdtjfdtjfd	tjfg}tj	d
}}|D ]5}d }	|D ]\}
}|
|
 v rM|}	 nq?|	d u rSq9| |v rd||	  | 7  < q9||	  | 7  < q9| D ]}| jdkr{qqt|| }t|| |  }qqt |||| d dS )Nc                 S   s   g | ]}t |qS r   )rh   r}   r   r   r   r(      s    z.AnalyzedTrace.from_profile.<locals>.<listcomp>c                 S   s    h | ]}|  d kr| qS )r   )fwd_thread_idr   r}   r   r   r   	<setcomp>   s     z-AnalyzedTrace.from_profile.<locals>.<setcomp>doublefloatz	c10::Halfzc10::BFloat16z	c10::Int8r   CUDAi ʚ;)rj   rk   rl   )ri   r   r   r   r   float64float16int8mathinfr|   r   r5   r[   r=   r,   r~   r+   r   )r   root_opsrj   rk   all_bw_threadsATEN_DTYPESbegin_nsend_nsr?   rm   
aten_dtypetorch_dtyper   r   r   from_profile   sD   
	
zAnalyzedTrace.from_profileN)TT)r   r   r   r   r   rm   r   __annotations__r*   rs   ry   rz   staticmethodr   r   r   r   r   r   r   r   r   r   ri   w   s8   
 

#ri   )r   )r   collectionsr   dataclassesr   typingr   r   r   r   r   r	   r   r
   r*   strr-   r7   rB   r   r   r   rh   ri   r   r   r   r   <module>   s    	
D