o
    پiE                     @   s  d Z ddlZddlZzddlZeedredkredddlT ddlT ddl	T W n ey>   ddl
T ddlT ddlT Y nw zddlmZmZmZ W n ey^   ddlmZmZmZ Y nw eeZd	d
 e D Zdd ZdddZdddZdd Zdd Zdd Zdd ZdS )zN
Utilities for selecting CUTLASS library kernels based on problem description
    NCUTLASS_IGNORE_PACKAGETz+Disabling attempt to import cutlass_library)*   )get_valid_schedules)generate_data_types_from_math_instructionfix_alignmentsc                 C   s   i | ]\}}||qS  r   .0kvr   r   m/home/ubuntu/.local/lib/python3.10/site-packages/flashinfer/data/cutlass/python/cutlass_library/heuristics.py
<dictcomp>B   s    r   c           	      C   s   |   }|D ]K}| D ]\}}t|trt| ||< qt|tr't| ||< q|d }|D ]"}| D ]\}}t|trDt| ||< q4t|trOt| ||< q4q.qt|d}tj	||dd W d   dS 1 skw   Y  dS )a  
  Utilitiy function to write heuristics results to a json file for debug

  args:
    problems_with_configs: List of problems provided to the heuristic, with a list of operations added to each problem dict
    outfile_path: Outfile path
      
  returns:
    None
  configsw   )indentN)
copyitems
isinstanceDataTypeDataTypeNames
LayoutTypeShortLayoutTypeNamesopenjsondump)	problems_with_configsoutfile_pathpc_copypr   r   r   cfr   r   r   $serialize_heuristics_results_to_jsonD   s*   



"r#   Fc                 C   s.   |du rt  }|j| |||||||||	|
dS )a  
  Get heuristic-suggested GEMM kernel configurations for a single GEMM problem.

  args:
    m, n, k: GEMM dimensions
    batch_count: batch count
    layouts: tuple of layouts of type LayoutType
    use_fast_acc: Use fast accumulation for FP8. Ignored for other precisions
    count: Number of configs to return
    provider: Heuristics provider to use

  returns:
    A list of dictionaries containing the suggested kernel configurations and additional info from the input required to define a Cutlass GemmOperation, with the following keys:
      - 'cta_tile_m', 'cta_tile_m', 'cta_tile_k': CTA tile size
      - 'instr_tile_m', 'instr_tile_n', 'instr_tile_k': Instruction tile size
      - 'stages': kernel pipeline stage count
      - 'cluster_m', 'cluster_n', 'cluster_k': cluster size
      - 'layout_a', 'layout_b': input tensor layouts of type LayoutType
      - 'alignment_a', 'alignment_b': input tensor alignments, in count of elements
      - 'dtype_a', 'dtype_b', 'dtype_acc': dtypes of a, b, and accumulator, of type DataType
      - 'swizzle_size' : suggested threadblock swizzle 
      - 'split_k_slices': number of partitions of the k dimension for splitK
      - 'raster_order': raster order for CTAs over output tiles ('along_m' or 'along_n')
  N)voidCuse_fast_acccount)MatmulHeuristicsget_configs)mnr   batch_countlayoutsdtypesalignment_aalignment_br$   r%   r&   providerr   r   r   get_single_gemm_config`   s    r1   c                 C   s  g }| D ] }|  }z|d }|d }|d }|d }|d }	|d }
|d }W n tyA } ztd| d	|   d
}~ww |dd}|dd}|dd}|dd
}|dd}|dd}|dd}|ttj krztd| t	|dkrt
dd |D std| tdd |D }z$| |	 | |d
ur| n|
 |
 g}tdd |D }W n ty } z	td|   d
}~ww |d d!t|d"   }|d#d!t|d   }t|||||||||dk|||}||d$< || q|S )%a  
  Get heuristic-suggested GEMM kernel configurations for a set of GEMM problems.

  args:
    problems: List of dictionaries describing GEMM problems with the following keys:
      - 'm', 'n', 'k': Matrix dimensions (required)
      - 'dtype_a': Data type of matrix A (required)
      - 'dtype_b': Data type of matrix B (required)
      - 'dtype_c': Data type of matrix C (default: None)
      - 'dtype_d': Data type of matrix D (required)
      - 'dtype_acc': Compute data type (default 'f32')
      - 'layout': Operation layout (e.g. 'tnt')
      - 'alignment_a': Memory access granularity of A, in units of elements (default: 16 bytes equivalent elements)
      - 'alignment_b': Memory access granularity of B, in units of elements (default: 16 bytes equivalent elements)
      - 'alpha': Scalar multiplier for A*B (default: 1.0)
      - 'beta': Scalar multiplier for C (default: 0.0)
      - 'batch_count': Number of GEMM operations in batch (default: 1)
      - 'use_fast_acc': Enable fast accumulation for FP8 on Hopper (default: True)
    provider: Heuristics provider to use
    count: Number of configurations to return per problem (defualt: 1)
      
  returns:
    A copy of the input dictionary, with key `configs` added containing the selected gemm configs
  r)   r*   r   dtype_adtype_bdtype_dlayoutzMissing required parameter z for problem N	operationgemmr+   r   	dtype_accf32dtype_calphag      ?betag        r%   TzUnsupported operation    c                 s   s    | ]}|d v V  qdS )ntNr   )r
   r!   r   r   r   	<genexpr>       z#get_gemm_configs.<locals>.<genexpr>zDlayout must be a 3-character string containing only 'n' or 't', got c                 s   s$    | ]}|d krt jnt jV  qdS )tN)r   RowMajorColumnMajor)r
   lr   r   r   r?      s   " c                 s   s    | ]}t | V  qd S )N)	dtype_map)r
   dtr   r   r   r?      r@   zUnsupported data type: r.      r   r/   r   )r   KeyError_LOGGERerrorgetOperationKindNamesOperationKindGemm
ValueErrorlenalltuplelowerDataTypeSizer1   append)problemsr0   r&   retproblemr)   r*   r   r2   r3   r4   r5   er6   r+   r8   r:   r;   r<   r%   r,   
dtype_listr-   rF   r.   r/   r   r   r   r   get_gemm_configs}   sT   
2"r[   c                 C   s  d}d}| du rt  } g }g }|D ]}|d |d g|d |d g|d d	t|d
   gf}|d |d |d |d |d
 f\}	}
}}}|d d dk}|rTd|d  n|d |d |d d g}t||	|
|tjtj}|j|j|d rwt	j
n|j||j|jdg}|d |rdnd |d |d f}t|d |d  |d |d  |d d |d  gdg d||||d |d |d fd}g }|r|tjtjg n	|tjtjg t| |g|g||tjtjgtjdD ]}|| || qq||fS )+  
  Generate CUTLASS operations based on the list of configs provided by the heuristic provider

  args:
    manifest: manifest argument to which to add operations, or None to just return the operations without a manifest (for pruning an existing manifest)
    cuda_version: Cuda compiler version for generating cutlass operations
    kernel_configs: list of configs generated by the heuristic
      
  returns:
    (configs, operations): a list of heuristic-provided kernel configs along with a one-to-one corresponding list of the generated operations
  d   e   Nlayout_ar.   layout_br/   layout_drG   r4   r2   r3   r8   r:   	cluster_mr   r   
cta_tile_m
cta_tile_n
cta_tile_k   r$   )a_typeb_typec_typed_typeacc_typeepi_typer   	cluster_n	cluster_k)rf   r   r   cluster_shape)tile_schedulers	gemm_kind)ManifestrT   MathInstructionOpcodeClassTensorOpMathOperationmultiply_add	element_a	element_br   voidelement_accumulatorTileDescriptionrU   KernelScheduleTypeTmaWarpSpecialized2SmSm100EpilogueScheduleTypeTmaWarpSpecialized2SmTmaWarpSpecialized1SmSm100TmaWarpSpecialized1SmCreateGemmUniversal3xOperatorTileSchedulerTypeDefaultStreamKGemmKindUniversal3x)manifestcuda_versionkernel_configsmin_ccmax_ccr   
operationsconfigr5   ry   rz   r|   	element_c	element_dis_2sminstruction_shapemath_instruction
data_typestile_multipliertile_description	schedulesor   r   r   &generate_sm100_from_heuristics_configs   s^   6,*	"(
r   c                 C   s  d\}}| du rt  } g }g }|D ]}|d t|d   dko*|d t|d   dk}|d |d g|d	 |d gtjd
gf}	|d |d |d |d |d f\}
}}}}g d}t||
||tjtj}t	|||d}|rtt
||	dd}	g d}t|d |d |d gd|||||d |d |d fd}t||||d|	tj|d d\}}t|rt| |	g|g||tjdD ]}|| || qt|rt| |	g|g||tjgdD ]}|| || qq||fS )r\   )Z   r   Nr.   r2   rG   r/   r3   r_   r`   r   r8   r:   r4   )r   r   r   )element_sourceelement_dest)alignment_bitsrc   rd   re   r   rb   rm   rn   ro   i(#  r%   )r   r   
is_alignedr   instantiation_levelr5   rr   enable_fp8_fast_acc)rr   )rq   )rs   rT   r   rC   rt   ru   rv   rw   rx   r   r   r}   r   r   r   rP   r   rU   r   r   )r   r   r   r   r   r   r   r   r   r5   ry   rz   r|   r   r   dummy_instr_shaper   r   dummy_warp_countr   r   stream_k_schedulesr   r   r   r   %generate_sm90_from_heuristics_configs  sh   0(,




r   c                    s  g }t |jd}t|}W d   n1 sw   Y  |jdks'|jdkr)dn|j}t|d}tdd |jdD rC|	d	 t
|||jd
}g }g }|D ]Y}	tdd |jdD rpt|jrfdn| |j|	d \}
}tdd |jdD rt|jrdn| |j|	d \}
}||7 }dd |	 D   fddt|
|D }||7 }qQ|D ]}| d|  d q|stdt||j |S )a  
  Prune a manifest according to heuristics suggestions from the problems file

  args:
    manifest: Cutlass manifest to prune
    args: generator.py args, requires:
      - args.heuristics_problems_file
      - args.heuristics_gpu
      - args.heuristics_testlist_file
      
  returns:
    A list of dictionaries, each of which has information about an operation and a problem from the input problems
  rNauto )gpuc                 s       | ]}d |v V  qdS )100Nr   r
   archr   r   r   r?   o  r@   z<filter_manifest_and_write_heuristics_file.<locals>.<genexpr>;@   )r0   r&   c                 s   r   )90Nr   r   r   r   r   r?   v  r@   r   c                 s   s     | ]}d |v pd|v V  qdS )r   101Nr   r   r   r   r   r?   x  s    c                 S   s   i | ]\}}|d kr||qS )r   r   r	   r   r   r   r   |  s    z=filter_manifest_and_write_heuristics_file.<locals>.<dictcomp>c                    s$   g | ]\}}d |  i |qS )operation_name)procedural_name)r
   r!   r   problem_without_configsr   r   
<listcomp>}  s   $ z=filter_manifest_and_write_heuristics_file.<locals>.<listcomp>^$z!No valid configurations generated)r   heuristics_problems_filer   loadheuristics_gpur'   anyarchitecturessplitset_cta_div_nr[   heuristics_configs_per_problemr   heuristics_restrict_kernelsr   r   r   zipadd_kernel_filterr   	Exceptionwrite_profiler_testlist_to_csvheuristics_testlist_file)r   argsheuristics_problemsr"   r   mmhr   all_configs_and_operationsr   rX   problem_configsproblem_operationswith_problem_sizer6   r   r   r   )filter_manifest_and_write_heuristics_file\  s4   

  
r   c           	      C   s   |   }|D ]"}| D ]\}}t|trt| ||< qt|tr't| ||< qqt|ddd}|d  }t	j
||d}|  || W d   dS 1 sRw   Y  dS )a(  
  Write a list of configs to a testlist to be consumed by cutlass_profiler

  args:
    configs_list: List of kernel configs along with runtime arguments and any other columns to include in the CSV, expressed as a list of dictionaries
    outfile_path: Outfile path
      
  returns:
    None
  r   r   )modenewliner   )
fieldnamesN)r   r   r   r   r   r   r   r   keyscsv
DictWriterwriteheader	writerows)	configs_listr   profiler_testlistr!   r   r   ofilek_nameswriterr   r   r   r     s   

"r   )FTr   N)Nr   )__doc__r   r   builtinshasattrr   ImportErrorcutlass_library.librarycutlass_library.generator#cutlass_library.heuristics_providerlibrary	generatorheuristics_provider
sm90_utilsr   r   r   logging	getLogger__name__rI   r   r   rE   r#   r1   r[   r   r   r   r   r   r   r   r   <module>   s:    


JHM+