o
    oi4                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZ e jdd Zdd Zd	d
 Zdd Z	dddZdd ZdS )    N)cdiv)driver)get_dram_gbpsget_max_simd_tflopsget_max_tensorcore_tflopsnvsmic                  C   sT   z
t dgd d W S  ty)   dd l} |   | d}| || jd  Y S w )Nzclocks.max.smr   g     @@)r   FileNotFoundErrorpynvmlnvmlInitnvmlDeviceGetHandleByIndexnvmlDeviceGetMaxClockInfoNVML_CLOCK_SM)r	   handle r   Y/home/ubuntu/.local/lib/python3.10/site-packages/bitsandbytes/triton/matmul_perf_model.pyget_clock_rate_in_khz   s   
r   c                 C   D   |t |d }tjj| d d }t ||| t|t |  }|S z!return compute throughput in TOPS   multiprocessor_count)minr   activeutilsget_device_propertiesr   r   devicenum_ctas	num_warpsdtypetotal_warpsnum_subcorestflopsr   r   r   get_tensorcore_tflops   s   r"   c                 C   r   r   )r   r   r   r   r   r   r   r   r   r   r   get_simd_tflops+   s
   r#   c                 C   s>   t j| }|d dk r|t jkrt| |||S t| |||S )Nr      )torchcudaget_device_capabilityfloat32r#   r"   )r   r   r   r   
capabilityr   r   r   
get_tflops5   s   r*   Fc           +      K   s  t j }|j}| }t||}t||	}|}|| | }t||t||	}}d| | | d }t||| |}|| }tj	j
|d }td|| }td|d }ttd|d d d}t||d |d	   }|d
 }|| | dd|d    }|| | d |d  }|| | dd|d    } || | d |d  }!||  d }"||! d }#|"| |#|  }$|d }%|| | | d }&|dkr|&|% }'n|%}(|&|( }'|| d d |% })|'|)7 }'t||$|' }*|rtd|* d| d|$ d|' d|d  d |*S )zGreturn estimated running time in ms
    = max(compute, loading) + store   i   @r          L   r   gffffff?g?r   g?g?i   g333333?zTotal time: zms, compute time: zms, loading time: zms, store time: zms, Activate CTAs: d   %)r%   r&   current_devicer   element_sizer   maxr*   r   r   r   r   r   r   print)+r   
num_stagesABCMNKBLOCK_MBLOCK_NBLOCK_KSPLIT_Kdebugkwargsr   r   dtsize	num_cta_m	num_cta_n	num_cta_kr   	total_opstput
compute_msnum_smactive_cta_ratioactive_cta_ratio_bw1active_cta_ratio_bw2dram_bwl2_bwload_a_dram	load_a_l2load_b_dram	load_b_l2
total_dramtotal_l2load_msstore_bwstore_c_dramstore_ms	reduce_bwzero_mstotal_time_msr   r   r   estimate_matmul_time<   sX   



r\   c                    s  t j }t j }|d  }|d j}g }| D ]2}|j}	|	d |	d |	d |jf\}
}}}tj	j
|d }|
| | | | }||krK|| q|} |t jt jfvr]dd | D } i }| D ]9}|j}	|	d |	d |	d |	d |j|jf\}
}}}}}|
||||f}||v r|| ||f qa||fg||< qag }| D ]O\}}|\}
}}}}|d	 d
kr|
| | d }|td| d
 }d}||  tjd| fddd}|D ]	}||d	  qq|d	 d	 }d|_|| q|S )Nr6   r<   r=   r>   max_shared_memc                 S   s   g | ]}|j d  dkr|qS )r?   r,   )rA   ).0configr   r   r   
<listcomp>   s    z&early_config_prune.<locals>.<listcomp>r?   r   r$   i   r   i,  r+   c                    s0   | d   dk rdt | d    S | d   S )Nr,   r   
   )abs)xoptimal_num_stagesr   r   <lambda>   s   z$early_config_prune.<locals>.<lambda>)key)r%   r&   r1   r'   r2   r   rA   r5   r   r   r   r   appendfloat16r(   r   itemsr   heapq	nsmallest)configs
named_argsrA   r   r)   rB   r   pruned_configsr_   kwr<   r=   r>   r5   max_shared_memoryrequired_shared_memoryconfigs_mapr?   r   rg   kvmmas
mma_cyclesldgsts_latencynearestnrandom_configr   rd   r   early_config_prune   sl   



	

r|   )F)	functoolsrk   r%   tritonr   triton.runtimer   triton.testingr   r   r   r   	lru_cacher   r"   r#   r*   r\   r|   r   r   r   r   <module>   s   


L