o
    c۷iH                  	   @   sp  d dl mZmZmZ d dlmZ d dlmZ d dlZd dl	m
Z
 d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ G d	d
 d
eZG dd deZG dd deZe
jdede
jdedefddZeG dd deZeG dd deZ G dd dZ!e
jdedeeef fddZ"G dd de!Z#eG d d! d!eZ$G d"d# d#e!Z%dS )$    )
NamedTupleTupleOptional)	dataclass)IntEnumN)Int32Float32Boolean
const_expr)
FastDivmod)PipelineStateWAdvance)ArgumentsBase
ParamsBasemlir_namedtuplec                   @   s   e Zd ZdZdZdZdS )RasterOrderOptionr         N)__name__
__module____qualname__AlongMAlongN	Heuristic r   r   J/home/ubuntu/vllm_env/lib/python3.10/site-packages/quack/tile_scheduler.pyr      s    r   c                   @   s   e Zd ZdZdZdS )RasterOrderr   r   N)r   r   r   r   r   r   r   r   r   r      s    r   c                   @   s   e Zd ZdZdZdZdZdS )PersistenceModer   r   r      N)r   r   r   NONESTATICDYNAMICCLCr   r   r   r   r      s
    r   raster_order_optionproblem_shape_ncluster_mn
group_sizereturnc                 C   sX   | t jkrtjntj}| t jkr*t|d |}t|d |}||kr'tjntj}|S Nr   r   )r   r   r   r   r   cuteround_up)r"   r#   r$   raster_orderproblem_blocks_mproblem_blocks_nr   r   r   get_raster_order_from_option#   s   

r,   c                   @   s^   e Zd ZU eed< ejZej	e ed< edZ
eed< dZeej ed< dZeej ed< dS )TileSchedulerOptionsmax_active_clustersr)      max_swizzle_sizeNtile_count_semaphorebatch_idx_permute)r   r   r   r   __annotations__r   r   r)   cutlass	Constexprr0   r1   r   r'   Pointerr2   Tensorr   r   r   r   r-   6   s   
 r-   c                   @   sv   e Zd ZU ejed< eje ed< e	ed< ejej ed< dZ
eej ed< dZeej ed< ejZeje ed< dS )	TileSchedulerArgumentsproblem_shape_ntile_mnlr)   r$   cluster_shape_mnkNr1   r2   persistence_mode)r   r   r   r'   Shaper3   r4   r5   r   r   r1   r   r6   r2   r7   r   r   r;   r   r   r   r   r8   ?   s   
 
r8   c                   @   s  e Zd ZeG dd deZddddedededed	eej	 d
ee
jj dedefddZeddddedefddZeejdAdddd	eej	 ddfddZeejddddedeeeef deeee f fddZeej			dBdddded	eej	 d
ee
jj deeB dd f
ddZeddddededeeeef fddZejddddedeeef fd d!Zejdddd"d#ed$ed%edeeef fd&d'Zej		dCdddd"d(ed)ee d*ee d%ede
jjf
d+d,Zejdddde
jjfd-d.Zdddde
jjfd/d0Z ejddddeeeeef B fd1d2Z!ejdddd3e
jjfd4d5Z"ej	dDd6ddd7deeB d8e#fd9d:Z$d;d< Z%d=d> Z&d?d@ Z'dS )ETileSchedulerc                   @   s   e Zd ZU ejed< eed< eed< eed< eed< eed< eed< e	ej
 ed< e	ej ed	< ejej ed
< eje ed< eejddddedd fddZdS )zTileScheduler.Paramsproblem_shape_ncluster_mnlr)   num_clusters_per_problem_fddnum_groups_regulargroup_size_fddgroup_size_tail_fddnum_clusters_in_group_fddr1   r2   cluster_shape_mnr;   Nlocipargsr%   c                C   s>  | j d dks	J ttj| j ddgd}tj| jddgd}t||}|| jd f }t|}t| j|| j	}|t
jkrC|d n|d }	|t
jkrP|d n|d }
t| j	|	}|	| }|	| }||
 }t| jtjkru| jd usuJ t||t||t|t|dkr|ndt|t| jtjkr| jnd | j|| jS Nr   r   r   mode)r:   r
   r'   selectr9   ceil_divsizer,   r)   r$   r   r   minr;   r   r    r1   r=   Paramsr   r2   )rH   rF   rG   rD   problem_shape_ntile_mnr#   r>   num_clusters_per_problemr)   ncluster_fastncluster_slowr$   group_size_tailr@   num_clusters_in_groupr   r   r   createY   sP   



zTileScheduler.Params.create)r   r   r   r'   r<   r3   r   r   r   r   r6   r7   r4   r5   r   staticmethodjitr8   rW   r   r   r   r   rP   K   s   
 
"rP   NrE   current_work_idxnum_tiles_executedcurrent_batch_idxnum_work_idx_before_cur_batch
sched_smemscheduler_pipelinepipeline_stateparamsc	                C   @   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	d S N
_current_work_idxr[   _current_batch_idx_num_work_idx_before_cur_batch_sched_smem_scheduler_pipeline_pipeline_statera   _loc_ipselfrZ   r[   r\   r]   r^   r_   r`   ra   rF   rG   r   r   r   __init__      
zTileScheduler.__init__rH   r%   c                C      t jj| ||dS NrE   )r=   rP   rW   rH   rF   rG   r   r   r   to_underlying_arguments      z%TileScheduler.to_underlying_argumentsc                C   sv   t j| dgddksJ | d jd }t j  t j|d W d    n1 s*w   Y  t j  t j  d S )Nr   rJ      Nr   r/   r   )r'   rN   iteratorarch	elect_onembarrier_initmbarrier_init_fence	sync_warp)r^   rF   rG   clc_mbar_ptrr   r   r   _init_clc_mbarrier   s   
z TileScheduler._init_clc_mbarriercluster_idxc                C   sN   t | jtjtjfv rt|d }t|d }||fS t|d }d }||fS Nr   r   r
   r;   r   r   r!   r   ra   r   rF   rG   rZ   	batch_idxr   r   r   _cluster_idx_to_work_idx_batch   s   z,TileScheduler._cluster_idx_to_work_idx_batchFis_scheduler_warpc          	      C   s   t j| tj ||d\}}d}t| jtjtj	tj
fv r3|dus#J |dus)J ttj|dgd}t| jtj
krE|rEt j|||d t |tdtdtd||t|tdtdtd| ||d
S )zGis_scheduler_warp should only be true for one warp in the whole clusterrE   r   Nr   rJ   )r=   r   r'   ry   r   r
   r;   r   r   r    r!   rN   r   r   r   	ra   r^   r_   r   rF   rG   rZ   _stagesr   r   r   rW      s6   
zTileScheduler.creater.   c          	      C   s   t | jtjtjfv r#| jd t| jd d  | jd | jd fS tj| j||dt| j }tj| j||d}|| }t	
||}|| }g | j|R S )Nr   r   r   rE   )r
   r;   r   r   r!   rD   r'   rN   r>   r4   rO   )	ra   r.   rF   rG   num_ctas_in_problemnum_ctas_per_clusternum_ctas_per_wavenum_persistent_ctasnum_persistent_clustersr   r   r   get_grid_shape   s   
zTileScheduler.get_grid_shapecluster_id_in_problemc                C   s   | j }t||j\}}tdtd}}||jk r"t||j\}}nt||j\}}|d dkrF|jtj	kr;|j
d n|j
d }	|	d | }||jj | }
|
|}}|jtjkr^||
}}||fS Nr   r   r   )ra   divmodrC   r   r@   rA   rB   r)   r   r   r>   divisorr   )rn   r   rF   rG   ra   group_idid_in_groupcid_fast_in_groupcid_slowrT   cid_fastcid_mcid_nr   r   r   _swizzle_cta  s"   



zTileScheduler._swizzle_ctablock_zero_onlyrF   rG   r   r   r   c          	      C   s\   t |rtdtdf}ntj }|| jjd  |d  }|| jjd  |d  }||fS r&   )r
   r   r'   ry   block_in_cluster_idxra   rD   )	rn   r   r   r   rF   rG   bidx_in_clusterpid_mpid_nr   r   r   _cluster_id_to_cta_id  s   
z#TileScheduler._cluster_id_to_cta_idwork_idxbidzis_validc                C   s6  | j }t|d u r4t|jtjkr| jdk}nt|jtjkr,|t|j	d d k }n|t|j	k }t
dt
dt
d}}	}
|rt|jtjtjfv rZ|}tj \}}}nt||j\}}t|d urj|}| j|||d\}}| j|||||d\}}	t|jd u r|n|j| }
||	d |
f}tj||S Nr   r   rE   r   )ra   r
   r;   r   r   r[   r!   r'   rN   r>   r   ry   	block_idxr   r?   r   r   r2   r4   utilsWorkTileInforn   r   r   r   r   rF   rG   ra   r   r   r   r   r   bidz_r   r   tile_coord_mnklr   r   r   _delinearize_work_idx)  s2   

z#TileScheduler._delinearize_work_idxc          
         s    j }tdtdtdtdf\}}}}t|jtjkrnO j j	  fddt
dD \}}}}tt|jdkrCtj  tj  tj   j j	 W d    n1 s_w   Y   j	  t|}||d |f}	tj|	t|S )Nr   Fc                    s   g | ]} j | jjf qS r   )rh   rj   index).0irn   r   r   
<listcomp>[  s    z2TileScheduler.get_current_work.<locals>.<listcomp>   r   )ra   r   r	   r
   r;   r   r   ri   consumer_waitrj   ranger'   rN   rD   ry   fence_view_async_sharedr}   rz   consumer_releaseadvancer4   r   r   )
rn   rF   rG   ra   r   r   r   r   is_valid_i32r   r   r   r   get_current_workQ  s$   $



zTileScheduler.get_current_workc                C   s   | j | j||dS rr   )r   re   )rn   rF   rG   r   r   r   initial_work_tile_infok  s   z$TileScheduler.initial_work_tile_infoc                C   s  | j }ttj d }t|jtjkr| j	| S t|jtj
krUtd}tj dkrNt|jd durE|tt|jd |j }n	|td|j }tj|dS t|jtjkr| jd| jjf jd }| jd jd }tj  tj  tjj|d||d	 tj||||d	 W d   n1 sw   Y  tj  tjj|| jj||d	 tjj|||d	\}}	}
}tj  ||j d  |	|j d  |
f}t!| j"||||d	\}}||t#|fS tdS )
z+should only be called by the scheduler warpr   r   Nr   r   rw   r/      rE   )$ra   r   r'   ry   grid_dimr
   r;   r   r   re   r    lane_idxr>   r   atomic_inc_i32rN   r1   atomic_add_i32shuffle_syncr!   rh   rj   r   rx   r}   rz   mbarrier_arrive_and_expect_txissue_clc_query_nomulticastmbarrier_waitphaseclc_responser   rD   typer   r	   )rn   rF   rG   ra   r   next_work_linear_idxclc_response_ptrmbarrier_addrbidxbidyr   validr   r   r   r   r   _fetch_next_work_idxr  sJ   




z"TileScheduler._fetch_next_work_idxwork_tile_infoc             	   C   sV  | j }t| jd urt| jj| jj| jj| jjdA }| j	
| |jd |jd |jd t|jg}tj }|t|jk r| jj}tt|jdkrhtdD ]}	||	 | j|	|f< qS| j	| j d S |}
|
|jd  }|
|jd  }| j	| j}tj|d|
 tj|d | |d | |d |d | jd |f j||
d d S d S d S )Nr   r   r   r   r   r   )smem_ptrmbar_ptrpeer_cta_rank_in_cluster)ra   r
   rh   r   rj   r   countr   r   ri   producer_acquiretile_idxr   is_valid_tiler'   ry   r   rN   rD   r4   range_constexprproducer_commitproducer_get_barrierr   r   store_shared_remote_x4rx   )rn   r   rF   rG   ra   pipeline_state_producer
sched_datar   pipeline_idxr   r   r   bidy_in_clusterr   r   r   r   write_work_tile_to_smem  sJ   




z%TileScheduler.write_work_tile_to_smemr   )advance_countrF   rG   r   c          	      C   s   | j }|  jt|7  _t| jduo|dkr| j|d  t|jtjtj	fv rJ|rH| j
||d| _| j| jd||d}| j|||d dS dS t|jtjkru|rw| j
||d\| _}}| j| j||d||d}| j|||d dS dS dS )zis_scheduler_warp should only be true for one warp in the whole cluster.
        Moreover, we assume that only block zero in the cluster is calling this function.
        If calling with is_scheduler_warp = True, advance_count must be 1.
        Nr   rE   Tr   )ra   r[   r   r
   rj   advance_itersr;   r   r   r    r   re   r   r   r!   )	rn   r   r   rF   rG   ra   r   batchr   r   r   r   advance_to_next_work  s*   
z"TileScheduler.advance_to_next_workc                 C   sD   t | jd ur t| jj| jj| jj| jjdA }| j| d S d S )Nr   )	r
   ri   r   rj   r   r   r   r   producer_tail)rn   r   r   r   r   r     s   
zTileScheduler.producer_tailc                 C   s\   g g }| _ | j| j| j| j| j| j| j| jfD ]}t	
|}||7 }| j t| q|S rc   )_values_posre   r[   rf   rg   rh   ri   rj   ra   r4   extract_mlir_valuesappendlen)rn   valuesobj
obj_valuesr   r   r   __extract_mlir_values__  s   

z%TileScheduler.__extract_mlir_values__c              	   C   sv   g }t | j| j| j| j| j| j| j| jg| j	D ]\}}|
t||d |  ||d  }q| jt|d| jiS )NrF   )zipre   r[   rf   rg   rh   ri   rj   ra   r   r   r4   new_from_mlir_values	__class__tuplerk   )rn   r   obj_listr   n_itemsr   r   r   __new_from_mlir_values__  s    
z&TileScheduler.__new_from_mlir_values__rc   NNFNN)F)(r   r   r   r   r   rP   r   r   r'   r7   r4   pipelinePipelineAsyncr   ro   rX   r8   rt   rY   r   r   r   boolr	   rW   r   r   r   r   r   r   r   r   r   r   intr   r   r   r   r   r   r   r   r=   J   s    G
	
&
&

	'&/-!
r=   idxc                 C   s<   t t d|  d d d }| ||d  d  }||fS )z
    Convert a triangular index to 2D coordinates.
    This is used to convert the linear index to 2D coordinates for triangular matrices.
    r         @      ?r   )r   ceilsqrt)r   rowcolr   r   r   triangular_idx_to_coord!  s    r  c                   @   s   e Zd ZdZeG dd deZeddddedefdd	Z	ee
j			
dddddedee
j deejj deeB dd f
ddZeddddededeeeef fddZe
jddddedeeef fddZe
j		dd
ddddedee dee dedejjf
ddZdS ) TriangularTileSchedulerz[We assume the tile size per cluster is square (e.g., 128 x 256 per CTA, with cluster 2 x 1)c                   @   s   e Zd ZU ejed< eed< eed< eed< eed< eed< eed< eed< e	ej
 ed	< ejej ed
< eje ed< eejddddedd fddZdS )zTriangularTileScheduler.Paramsr>   r?   group_size_inv_f32r@   rA   rB   group_size_mul_group_size_fdd"group_size_tail_mul_group_size_fddr1   rD   r;   NrE   rH   r%   c                C   s  | j d dks	J ttj| j ddgd}tj| jddgd}t||}|| jd f }|d }||d  d }t| j|}	||	 }
||	 }t| jt	j
krV| jd usVJ t|t|td|	 |t|	t|
dkrl|
ndt|	|	 t|
dkrz|
nd|	 t| jt	j
kr| jnd || jS )Nr   r   r   rJ   g      ?)r:   r
   r'   rL   r9   rM   rO   r$   r;   r   r    r1   r  rP   r   r   )rH   rF   rG   rD   rQ   r#   r>   	cluster_mrR   r$   rU   r@   r   r   r   rW   =  s:   

z%TriangularTileScheduler.Params.create)r   r   r   r'   r<   r3   r   r   r   r   r6   r4   r5   r   rX   rY   r8   rW   r   r   r   r   rP   /  s(   
 
rP   NrE   rH   r%   c                C   rq   rr   )r  rP   rW   rs   r   r   r   rt   b  ru   z/TriangularTileScheduler.to_underlying_argumentsFra   r^   r_   r   c          	      C   s   t j| tj ||d\}}d}t| jtjtj	tj
fv r3|d us#J |d us)J ttj|dgd}t| jtj
krE|rEt j|||d t|tdtdtd||t|tdtdtd| ||d
S NrE   r   r   rJ   )r=   r   r'   ry   r   r
   r;   r   r   r    r!   rN   r   r  r   r   r   r   r   r   rW   f  6   
zTriangularTileScheduler.creater.   c                C   s   | j jdf}tdd t|| jD | jd f }t| jtj	tj
fv r&|S tj|||d}tj| j||d}|| }t||}	|	| }
g | j|
R S )Nr   c                 s   s    | ]	\}}|| V  qd S rc   r   )r   xyr   r   r   	<genexpr>  s    z9TriangularTileScheduler.get_grid_shape.<locals>.<genexpr>r   rE   )r?   r   r   r   rD   r>   r
   r;   r   r   r!   r'   rN   r4   rO   )ra   r.   rF   rG   clustersnum_ctas_mnlr   r   r   r   r   r   r   r   r     s   z&TriangularTileScheduler.get_grid_shaper   c                C   s  | j }|jj}ttd| d d |j d }|| }|||d  d  }||jk r/|n|jj}	t	dt	d}
}||jk rJt
||j\}
}nt
||j\}
}t	dt	d}}||	| | krjt|\}}n||jk rxt
||j\}}nt
||j\}}|| }|
| | }||fS )Nr   r   r   r   r   )ra   rA   r   r   r   r   r  r@   rB   r   r   r  r  r  )rn   r   rF   rG   ra   r$   r   cid_m_startr   group_size_actual	group_colgroup_remaindercid_m_in_groupcid_n_in_groupr   r   r   r   r   r     s:   


z$TriangularTileScheduler._swizzle_ctar   r   r   r   r   c                C   s  | j }t|d u r"t|jtjkr| jdk}n||jj|jd  k }t	dt	dt	d}}	}
|rtt|jtjtj
fv rH|}tj \}}}nt||j\}}t	|}t|d ur\|}| j|||d\}}| j|||||d\}}	|}
||	d |
f}tj||S r   )ra   r
   r;   r   r   r[   r?   r   r>   r   r!   r'   ry   r   r   r   r   r4   r   r   r   r   r   r   r     s4   

z-TriangularTileScheduler._delinearize_work_idxr   r   )r   r   r   __doc__r   r   rP   rX   r8   rt   r'   rY   r   r7   r4   r   r   r   r	   rW   r   r   r   r   r   r   r   r   r   r   r   r  ,  sx    2
%
&	r  c                   @   s   e Zd ZU ejed< eed< ejed< ej	e
 ed< eed< ej	ej ed< ej	ej ed< dZeej ed	< ejZej	e ed
< dS )VarlenMTileSchedulerArgumentsr9   total_mcu_seqlens_mr)   r$   tile_shape_mnr:   Nr1   r;   )r   r   r   r'   r<   r3   r   r7   r4   r5   r   r1   r   r6   r   r   r;   r   r   r   r   r    s   
 

r  c                   @   s  e Zd ZeG dd deZddddedededed	eej	 d
ee
jj dedefddZeddddedefddZeejddddedeeeef deeee f fddZeej			d-dddded	eej	 d
ee
jj deeB dd f
ddZeddddededeeeef fddZejddddededeeef fdd Zejd!ed"ed#e
je defd$d%Zej		d.dddd&d'ed(ee d)ee d*ede
jjf
d+d,ZdS )/VarlenMTileSchedulerc                   @   s   e Zd ZU ejed< eed< ejed< ej	e
 ed< eed< ee ed< ee ed< eed< ej	ej ed	< eej ed
< ej	ej ed< ej	e ed< eejddddedd fddZdS )zVarlenMTileScheduler.Paramsr>   r  r  r)   r$   rA   rB   rC   r  r1   rD   r;   NrE   rH   r%   c                C   s  | j d dks	J ttj| j ddgd}tj| jddgd}d t|d |d f}|| jd f }t| jtjkr=t	jnt	j
}||t	jkrIdnd }||t	jkrTdnd }	t|d urht| j|}
||
 }n| jd }
}d }t|	d urz|
|	 }t| jtjkr| jd usJ t|| j| j||
|d urt|
nd |d urt|dkr|ndnd |d urt|nd | jt| jtjkr| jnd || jS rI   )r:   r
   r'   rL   r9   rM   r)   r   r   r   r   rO   r$   r;   r   r    r1   r  rP   r  r  r   r  )rH   rF   rG   rD   rQ   r#   r>   r)   rS   rT   r$   rU   rV   r   r   r   rW     s^   
z"VarlenMTileScheduler.Params.create)r   r   r   r'   r<   r3   r   r7   r4   r5   r   r   r   r6   r   rX   rY   r8   rW   r   r   r   r   rP     s*   
 

rP   NrE   rZ   r[   r\   r]   r^   r_   r`   ra   c	                C   rb   rc   rd   rm   r   r   r   ro   M  rp   zVarlenMTileScheduler.__init__rH   r%   c                C   rq   rr   )r  rP   rW   rs   r   r   r   rt   f  ru   z,VarlenMTileScheduler.to_underlying_argumentsr   c                C   s<   t | jtjtjfv rt|d }nt|d }d }||fS r   r   r   r   r   r   r   j  s
   z3VarlenMTileScheduler._cluster_idx_to_work_idx_batchFr   c          	      C   s   t j| tj ||d\}}d}t| jtjtj	tj
fv r3|d us#J |d us)J ttj|dgd}t| jtj
krE|rEtj|||d t |tdtdtd||t|tdtdtd| ||d
S r  )r  r   r'   ry   r   r
   r;   r   r   r    r!   rN   r=   r   r   r   r   r   r   r   rW   v  r  zVarlenMTileScheduler.creater.   c          	      C   s   | j d | jd  }| jd }| j||d   | }|| jd  }t| jtjtjfv r9| jd | | jd dfS t	
||}g | j|R S r   )r  rD   r>   r  r
   r;   r   r   r!   r4   rO   )	ra   r.   rF   rG   
block_size	num_batchtotal_clusters_m_maxtotal_clusters_maxr   r   r   r   r     s   
z#VarlenMTileScheduler.get_grid_shaper   num_clusters_mc                C   sl  | j }t|jd urt||j\}}|jj}n|jtjksJ |j| }|| }|||  }t	dt	d}	}
t|j
d uoA|jd urd||jd  }|d | |kr[t||j
\}
}	n't||j\}
}	n|jtjkslJ t|j|||j  }|| }
||
|  }	|d dkr|jtjkr|jd n|}|d |
 }
||j |	 }||
}}|jtjkr|
|}}||fS )Nr   r   r   )ra   r
   rC   r   r   r)   r   r   r$   r   rA   rB   r>   r   r4   rO   )rn   r   r  rF   rG   ra   r   r   rV   r   r   num_clustersr  rT   r   r   r   r   r   r   r     s>   




z!VarlenMTileScheduler._swizzle_ctalane
bidb_startr  c           	      C   st   | j jd }|| }td}||kr| j j| }tjj|dd}|| }||k r6|tjjd k r6t||S tdS )Nr   r   r   )offset)	ra   r>   r   r  r'   ry   shuffle_sync_down	WARP_SIZErM   )	rn   r   r!  r  r  r   cur_cu_seqlennext_cu_seqlenseqlenr   r   r   _get_num_m_blocks  s   
z&VarlenMTileScheduler._get_num_m_blocksr   r   r   	is_valid_r   c                C   s  |d u sJ | j }tj }| j jd }	|jd |jd  }
| j}|}| j}t	dt	dt	d}}}d}t
|d ur>|}|r||kr| j|||
d}||jd  }t||}tj|tjjd }||7 }||krt|tjjd 7 }||	krt	|	}|d }||ksDnt	|	}||	k }t
|jtjkr|| jdkM }t	dt	d}}| j}|r|| }tjtj|| |k}||7 }|dkrdntj||d }tj||}|| }|| }| j||||d\}}| j|||||d\}}||d |f}|| _|| _tj||S )Nr   r   T)r!  r  r   rE   r   )ra   r'   ry   r   r>   r  rD   rf   rg   r   r
   r(  r   warp_prefix_sumr   r$  r;   r   r   r[   popcvote_ballot_syncr   r   r4   r   )rn   r   r   r)  r   rF   rG   ra   r   r  r  r   next_tile_idxproblems_end_tiler  num_clusters_cumulativeclusters_in_problemsr   r  r   r   r]   problems_start_tilebatch_idx_in_problemsnum_clusters_prev_laner   r   r   r   r   r   r   r     sv   



z*VarlenMTileScheduler._delinearize_work_idxr   r   ) r   r   r   r   r   rP   r   r   r'   r7   r4   r   r   r   ro   rX   r8   rt   rY   r   r   r   r	   rW   r   r   r5   r   r(  r   r   r   r   r   r   r   r    s    R
	


%
(	r  )&typingr   r   r   dataclassesr   enumr   r4   cutlass.cuter'   r   r   r	   r
   quack.utilsr   quack.fast_mathr   quack.pipeliner   quack.cute_dsl_utilsr   r   r   r   r   r   rY   r<   r,   r-   r8   r=   r  r  r  r  r   r   r   r   <module>   sL   
   Z
 L