o
    پi                  	   @   sX  d dl mZmZ d dlmZ d dlmZ d dlZd dlm	Z	 d dlm
Z
mZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZ G d	d
 d
eZG dd deZe	jdede	jde
defddZeG dd deZeG dd deZG dd dZe	jde
dee
e
f fddZG dd deZ eG dd deZ!G d d! d!eZ"dS )"    )TupleOptional)	dataclass)IntEnumN)Int32Float32Boolean
const_expr)
FastDivmod)PipelineStateWAdvance)ArgumentsBase
ParamsBasec                   @   s   e Zd ZdZdZdZdS )RasterOrderOptionr         N)__name__
__module____qualname__AlongMAlongN	Heuristic r   r   H/home/ubuntu/.local/lib/python3.10/site-packages/quack/tile_scheduler.pyr      s    r   c                   @   s   e Zd ZdZdZdS )RasterOrderr   r   N)r   r   r   r   r   r   r   r   r   r      s    r   raster_order_optionproblem_shape_ncluster_mn
group_sizereturnc                 C   sX   | t jkrtjntj}| t jkr*t|d |}t|d |}||kr'tjntj}|S )Nr   r   )r   r   r   r   r   cuteround_up)r   r   r   raster_orderproblem_blocks_mproblem_blocks_nr   r   r   get_raster_order_from_option   s   

r#   c                   @   s^   e Zd ZU eed< ejZej	e ed< edZ
eed< dZeej ed< dZeej ed< dS )TileSchedulerOptionsmax_active_clustersr       max_swizzle_sizeNtile_count_semaphorebatch_idx_permute)r   r   r   r   __annotations__r   r   r    cutlass	Constexprr'   r(   r   r   Pointerr)   Tensorr   r   r   r   r$   /   s   
 r$   c                   @   st   e Zd ZU ejed< eje ed< e	ed< ejej ed< dZ
eej ed< dZeej ed< dZeje ed	< dS )
TileSchedulerArgumentsproblem_shape_ntile_mnlr    r   cluster_shape_mnkNr(   r)   Fis_persistent)r   r   r   r   Shaper*   r+   r,   r   r   r(   r   r-   r)   r.   r2   boolr   r   r   r   r/   8   s   
 
r/   c                   @   s  e Zd ZeG dd deZddddededeej	 dee
jj d	ed
efddZeddddedefddZeej			d,dddd
edeej	 dee
jj deeB dd f
ddZedddd
ededeeeef fddZejddddedeeef fddZejdddde
jjfddZdddddZejd-ddddeeB fdd Zej	d-d!ddd"deeB d#efd$d%Zd&d' Z d(d) Z!d*d+ Z"dS ).TileSchedulerc                   @   s   e Zd ZU ejed< eed< eed< eed< eed< eed< eed< e	ej
 ed< e	ej ed	< ejej ed
< eje ed< eejddddedd fddZdS )zTileScheduler.Paramsproblem_shape_ncluster_mnlr    num_clusters_per_problem_divmodnum_groups_regulargroup_size_divmodgroup_size_tail_divmodnum_clusters_in_group_divmodr(   r)   cluster_shape_mnr2   Nlocipargsr   c                C   s"  | j d dks	J ttj| j ddgd}tj| jddgd}t||}|| jd f }t|}t| j|| j	}|t
jkrC|d n|d }	|t
jkrP|d n|d }
t| j	|	}|	| }|	| }||
 }t||t||t|t|dkr{|ndt|t| jr| jnd | j|| jS Nr   r   r   mode)r1   r	   r   selectr0   ceil_divsizer#   r    r   r   r   minr5   Paramsr
   creater2   r(   r)   )r@   r>   r?   r<   problem_shape_ntile_mnr   r6   num_clusters_per_problemr    ncluster_fastncluster_slowr   group_size_tailr8   num_clusters_in_groupr   r   r   rI   R   sH   



zTileScheduler.Params.create)r   r   r   r   r3   r*   r   r
   r   r   r-   r.   r+   r,   r4   staticmethodjitr/   rI   r   r   r   r   rH   D   s   
 
"rH   Nr=   current_work_linear_idxnum_tiles_executed
tile_countscheduler_pipelinepipeline_stateparamsc          	      C   s4   || _ || _|| _|| _|| _|| _|| _|| _d S N)_current_work_linear_idxrS   _tile_count_scheduler_pipeline_pipeline_staterW   _loc_ip)	selfrR   rS   rT   rU   rV   rW   r>   r?   r   r   r   __init__}   s   
zTileScheduler.__init__r@   r   c                C      t jj| ||dS Nr=   )r5   rH   rI   r@   r>   r?   r   r   r   to_underlying_arguments      z%TileScheduler.to_underlying_argumentsFis_scheduler_warpc                C   s   d}t | j r#tj \}}}	tj \}
}	}	|||
  }t|}n&tj \}	}	}t|}t | jdurI|dus<J |dusBJ t t	|}t
|td||t|tdtdt|r]dnd| ||dS )Gis_scheduler_warp should only be true for one warp in the whole clusterr   Nr   r=   )r	   r2   r   archcluster_idxcluster_dimr   	block_idxr(   rF   r5   r   )rW   rT   rU   rf   r>   r?   stagescidxcidy_cdimx
cluster_idrR   bidzr   r   r   rI      s,   
 zTileScheduler.creater%   c          
      C   s   t dd t| j| jD | jd f }t| j r|S tj|||d}tj| j||d}|| }t	||}|| }	g | j|	R S )Nc                 s       | ]	\}}|| V  qd S rX   r   .0xyr   r   r   	<genexpr>   s    
z/TileScheduler.get_grid_shape.<locals>.<genexpr>r   r=   )
tuplezipr6   r<   r	   r2   r   rF   r+   rG   )
rW   r%   r>   r?   num_ctas_mnlnum_ctas_in_problemnum_ctas_per_clusternum_ctas_per_wavenum_persistent_ctasnum_persistent_clustersr   r   r   get_grid_shape   s   
zTileScheduler.get_grid_shapecluster_id_in_problemc                C   s   | j }|j|\}}tdtd}}||jk r"|j|\}}n|j|\}}|d dkrF|jtj	kr;|j
d n|j
d }	|	d | }||jj | }
|
|}}|jtjkr^||
}}||fS Nr   r   r   )rW   r;   divmodr   r8   r9   r:   r    r   r   r6   divisorr   )r_   r   r>   r?   rW   group_idid_in_groupcid_fast_in_groupcid_slowrM   cid_fastcid_mcid_nr   r   r   _swizzle_cta   s"   



zTileScheduler._swizzle_ctac                C   s   | j }t|j r| j}tj \}}}n	|j| j\}}| j	|||d\}}tj
 }	||jd  |	d  }
||jd  |	d  }t|jd u rL|n|j| }|
|d |f}t|j rc| jdk}n	| jt|jk }tj||S )Nr=   r   r   )rW   r	   r2   rY   r   rh   rk   r7   r   r   block_in_cluster_idxr<   r)   rS   rF   r6   r+   utilsWorkTileInfo)r_   r>   r?   rW   r   ro   rr   r   r   bidx_in_clusterpid_mpid_n	batch_idxtile_coord_mnklis_validr   r   r   get_current_work   s$   
zTileScheduler.get_current_workc                C   s   | j ||dS rb   )r   )r_   r>   r?   r   r   r   initial_work_tile_info  s   z$TileScheduler.initial_work_tile_infoc                C   sz   | j }t|jo|jdur;| j}|r6tj dkr/tj d }|t	
t|jd |j }tj|d}|| _dS dS rg   Nr   r   r   )rW   r	   r2   r(   rY   r   rh   lane_idxgrid_dimr   atomic_inc_i32rF   r6   shuffle_syncr_   rf   r>   r?   rW   rR   r   r   r   r   fetch_next_work  s   
zTileScheduler.fetch_next_workr   )advance_countr>   r?   r   c                C   s  t j d }t j d }t j d }| j}t|jrt j d }	t|jd u r7|  j	|t
|	 7  _	nt|dkrE| j|d  | j	}
|r| j| j t j }|t |jk rtt |jdkrw|
| j| jj< | j| j nh|}| j| j}t j|d| tj|
| jj| jj ||d nG| j| j | j| jj }
tt |jdkrt jjt jjjt jjjd t j   t j!  | j"| j W d    n1 sw   Y  |
| _	| j#  |  j$t
|7  _$d S )Nr   r   r      )valsmem_ptrmbar_ptrpeer_cta_rank_in_cluster)space)%r   rh   
thread_idxrk   rW   r	   r2   r   r(   rY   r   r\   advance_itersr[   producer_acquirer   rF   r<   rZ   indexproducer_commitproducer_get_barriermbarrier_arrive_and_expect_txr   store_shared_remoteiteratorconsumer_waitfence_proxy	ProxyKindasync_sharedSharedSpace
shared_cta	sync_warp	elect_oneconsumer_releaseadvancerS   )r_   rf   r   r>   r?   tidxbidxrr   rW   r   rR   r   r   r   r   r   r   advance_to_next_work  sZ   	

	

z"TileScheduler.advance_to_next_workc                 C   s.   t | jjo
| jjd ur| j| j d S d S rX   )r	   rW   r2   r(   r[   producer_tailr\   )r_   r   r   r   r   Y  s   zTileScheduler.producer_tailc                 C   sT   g g }| _ | j| j| j| j| j| jfD ]}t|}||7 }| j 	t
| q|S rX   )_values_posrY   rS   rZ   r[   r\   rW   r+   extract_mlir_valuesappendlenr_   valuesobj
obj_valuesr   r   r   __extract_mlir_values__]  s   
z%TileScheduler.__extract_mlir_values__c              	   C   sn   g }t | j| j| j| j| j| jg| jD ]\}}|t	
||d |  ||d  }q| jt|d| jiS Nr>   )rz   rY   rS   rZ   r[   r\   rW   r   r   r+   new_from_mlir_values	__class__ry   r]   r_   r   obj_listr   n_itemsr   r   r   __new_from_mlir_values__l  s   z&TileScheduler.__new_from_mlir_values__NNFF)#r   r   r   r   r   rH   r   r   r   r.   r+   pipelinePipelineAsyncr   r`   rP   r/   rd   rQ   r4   r   rI   r   r   r   r   r   r   r   r   intr   r   r   r   r   r   r   r   r5   C   s    A


#
Ar5   idxc                 C   s<   t t d|  d d d }| ||d  d  }||fS )z
    Convert a triangular index to 2D coordinates.
    This is used to convert the linear index to 2D coordinates for triangular matrices.
    r         @      ?r   )r   ceilsqrt)r   rowcolr   r   r   triangular_idx_to_coord~  s    r   c                   @   s   e Zd ZdZeG dd deZeddddedefdd	Z	ee
j			
dddddedee
j deejj deeB dd f
ddZeddddededeeeef fddZe
jddddejjfddZdS )TriangularTileSchedulerz[We assume the tile size per cluster is square (e.g., 128 x 256 per CTA, with cluster 2 x 1)c                   @   s   e Zd ZU ejed< eed< eed< eed< eed< eed< eed< eed< e	ej
 ed	< ejej ed
< eje ed< eejddddedd fddZdS )zTriangularTileScheduler.Paramsr6   r7   group_size_inv_f32r8   r9   r:    group_size_mul_group_size_divmod%group_size_tail_mul_group_size_divmodr(   r<   r2   Nr=   r@   r   c                C   s  | j d dks	J ttj| j ddgd}tj| jddgd}t||}|| jd f }|d }||d  d }t| j|}	||	 }
||	 }t	|t
|td|	 |t
|	t
|
dkr`|
ndt
|	|	 t
|
dkrp|
nd|	 t| jr|| jnd || jS )Nr   r   r   rB   g      ?)r1   r	   r   rD   r0   rE   rG   r   r   rH   r
   rI   r   r2   r(   )r@   r>   r?   r<   rJ   r   r6   	cluster_mrK   r   rN   r8   r   r   r   rI     s2   
z%TriangularTileScheduler.Params.create)r   r   r   r   r3   r*   r
   r   r   r   r-   r+   r,   r4   rP   rQ   r/   rI   r   r   r   r   rH     s(   
 
rH   Nr=   r@   r   c                C   ra   rb   )r   rH   rI   rc   r   r   r   rd     re   z/TriangularTileScheduler.to_underlying_argumentsFrW   rT   rU   rf   c                C   s   d}t | j rtj \}}}t|}	n&tj \}}}
t|
}	t | jd ur;|d us.J |d us4J t t|}t	|	td||t
|tdtdt|rOdnd| ||dS Nr   r   r=   )r	   r2   r   rh   ri   r   rk   r(   rF   r   r   )rW   rT   rU   rf   r>   r?   rl   rq   ro   rR   rr   r   r   r   rI     s(   
 zTriangularTileScheduler.creater%   c                C   s   | j jd| jd f}tdd t|| jD | jd f }t| j r%|S tj	|||d}tj	| j||d}|| }t
||}	|	| }
g | j|
R S )Nr   r   c                 s   rs   rX   r   rt   r   r   r   rx     s    z9TriangularTileScheduler.get_grid_shape.<locals>.<genexpr>r=   )r7   r   r6   ry   rz   r<   r	   r2   r   rF   r+   rG   )rW   r%   r>   r?   clustersr{   r|   r}   r~   r   r   r   r   r   r     s   	z&TriangularTileScheduler.get_grid_shapec                C   s  | j }t|j r| j}tj \}}}n	|j| j\}}|j	j
}ttd| d d |j d }|| }	||	|	d  d  }
||jk rJ|n|jj
}tdtd}}||jk re|j|
\}}n|j|
\}}tdtd}}|
|| | krt|\}}n||jk r|j	|\}}n|j|\}}|	| }|| | }tj }||jd  |d  }||jd  |d  }||d |f}t|j r| jdk}n| j|jj
|jd  k }tj||S )Nr   r   r   r   r   )rW   r	   r2   rY   r   rh   rk   r7   r   r9   r   r   r   r   r   r8   r:   r   r   r   r   r   r<   rS   r6   r+   r   )r_   r>   r?   rW   r   ro   rr   r   r   cid_m_startr   group_size_actual	group_colgroup_remaindercid_m_in_groupcid_n_in_groupr   r   r   r   r   r   r   r   r   r   r     sb   




z(TriangularTileScheduler.get_current_workr   )r   r   r   __doc__r   r   rH   rP   r/   rd   r   rQ   r   r.   r+   r   r   r4   r   rI   r   r   r   r   r   r   r   r   r   r   r     sJ    .
  r   c                   @   s   e Zd ZU ejed< eed< ejed< ej	e
 ed< eed< ej	ej ed< ej	ej ed< dZeej ed	< d
Zej	e ed< dS )VarlenMTileSchedulerArgumentsr0   total_mcu_seqlens_mr    r   tile_shape_mnr1   Nr(   Fr2   )r   r   r   r   r3   r*   r   r.   r+   r,   r   r(   r   r-   r2   r4   r   r   r   r   r   >  s   
 

r   c                   @   s  e Zd ZeG dd deZddddedededed	eej	 d
ee
jj dedefddZeddddedefddZeej			d+dddded	eej	 d
ee
jj deeB dd f
ddZeddddededeeeef fddZejdedede
je defddZejdddded edeeef fd!d"Zejdddde
jjfd#d$Zejd,ddddeeB fd%d&Zd'd( Z d)d* Z!dS )-VarlenMTileSchedulerc                   @   s   e Zd ZU ejed< eed< ejed< ej	e
 ed< eed< ee ed< ee ed< eed< ej	ej ed	< eej ed
< ej	ej ed< ej	e ed< eejddddedd fddZdS )zVarlenMTileScheduler.Paramsr6   r   r   r    r   r9   r:   r;   r   r(   r<   r2   Nr=   r@   r   c                C   sx  | j d dks	J ttj| j ddgd}tj| jddgd}d t|d |d f}|| jd f }t| jtjkr=t	jnt	j
}|t	jkrJ|d n|d }|t	jkrW|d n|d }	t|d urlt| j|}
||
 }n| jd }
}t|	d ur}|
|	 }nd }t|| j| j||
|d urt|
nd |d urt|dkr|ndnd |d urt|nd | jt| jr| jnd || jS rA   )r1   r	   r   rD   r0   rE   r    r   r   r   r   rG   r   r   rH   r   r   r
   rI   r   r2   r(   )r@   r>   r?   r<   rJ   r   r6   r    rL   rM   r   rN   rO   r   r   r   rI   [  s^   




z"VarlenMTileScheduler.Params.create)r   r   r   r   r3   r*   r   r.   r+   r,   r   r   r
   r-   r4   rP   rQ   r/   rI   r   r   r   r   rH   L  s*   
 

rH   Nr=   rR   rS   current_batch_idxnum_work_idx_before_cur_batchrT   rU   rV   rW   c	                C   s@   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	d S rX   )
rY   rS   _current_batch_idx_num_work_idx_before_cur_batchrZ   r[   r\   rW   r]   r^   )r_   rR   rS   r   r   rT   rU   rV   rW   r>   r?   r   r   r   r`     s   
zVarlenMTileScheduler.__init__r@   r   c                C   ra   rb   )r   rH   rI   rc   r   r   r   rd     re   z,VarlenMTileScheduler.to_underlying_argumentsFrf   c          
      C   s   d}t j \}}}t|}	t| jd ur(|d usJ |d us!J tt |}t|	tdtdtd||t|tdtdt|rBdnd| ||d
S r   )	r   rh   rk   r   r	   r(   rF   r   r   )
rW   rT   rU   rf   r>   r?   rl   ro   rr   rR   r   r   r   rI     s&    zVarlenMTileScheduler.creater%   c          	      C   sz   | j d | jd  }| jd }| j||d   | }|| jd  }t| j r/g | j|R S t||}g | j|R S r   )r   r<   r6   r   r	   r2   r+   rG   )	rW   r%   r>   r?   
block_size	num_batchtotal_clusters_m_maxtotal_clusters_maxr   r   r   r   r     s   
z#VarlenMTileScheduler.get_grid_shapelane
bidb_startr   c           	      C   st   | j jd }|| }td}||kr| j j| }tjj|dd}|| }||k r6|tjjd k r6t||S tdS )Nr   r   r   )offset)	rW   r6   r   r   r   rh   shuffle_sync_down	WARP_SIZErE   )	r_   r   r   r   r   r   cur_cu_seqlennext_cu_seqlenseqlenr   r   r   _get_num_m_blocks  s   
z&VarlenMTileScheduler._get_num_m_blocksr   num_clusters_mc                C   sl  | j }t|jd ur|j|\}}|jj}n|jtjksJ |j| }|| }|||  }t	dt	d}	}
t|j
d uoA|jd urd||jd  }|d | |kr[|j
|\}
}	n'|j|\}
}	n|jtjkslJ t|j|||j  }|| }
||
|  }	|d dkr|jtjkr|jd n|}|d |
 }
||j |	 }||
}}|jtjkr|
|}}||fS )Nr   r   r   )rW   r	   r;   r   r   r    r   r   r   r   r9   r:   r6   r   r+   rG   )r_   r   r   r>   r?   rW   r   r   rO   r   r   num_clustersr   rM   r   r   r   r   r   r   r     sF   




z!VarlenMTileScheduler._swizzle_ctac                C   sN  | j }tj }| j jd }|jd |jd  }| j}| j|||d}||jd  }	t	
|	|}
tj|
tjjd }| j| }tdtd}}| j}||kr|tjjd 7 }||kret|}|d }n$| j|||d}||jd  }	t	
|	|}
tj|
tjjd }||7 }||ksP|| }||krtdtdt|}}}n?|| }tjtj||
 |k}||7 }|dkrdntj|
|d }tj||}|| }|| }| j||||d\}}|| _|| _tj }||jd  |d  }||jd  |d  }||d |f}t|j r| jdko||k }n||k }tj	||S )Nr   r   )r   r   r   r=   )rW   r   rh   r   r6   r   r<   r   r   r   warp_prefix_sumr   r   r   r   rY   popcvote_ballot_syncr   r   r	   r2   rS   r+   r   )r_   r>   r?   rW   r   r   r   r   r   r   num_clusters_cumulativeclusters_in_problemsproblems_end_tiler   r   next_tile_idxr   problems_start_tilebatch_idx_in_problemsnum_clusters_prev_laner   r   r   r   r   r   r   r   r   r   $  sv   




z%VarlenMTileScheduler.get_current_workc                C   sj   t | jjdur3| j}| j}|r.tj dkr'tj d }|t	d|j }tj
|d}|| _dS dS r   )r	   rW   r(   rY   r   rh   r   r   r   atomic_add_i32r   r   r   r   r   r   p  s   
z$VarlenMTileScheduler.fetch_next_workc                 C   s\   g g }| _ | j| j| j| j| j| j| j| jfD ]}t	
|}||7 }| j t| q|S rX   )r   rY   rS   r   r   rZ   r[   r\   rW   r+   r   r   r   r   r   r   r   r     s   

z,VarlenMTileScheduler.__extract_mlir_values__c              	   C   sv   g }t | j| j| j| j| j| j| j| jg| j	D ]\}}|
t||d |  ||d  }q| jt|d| jiS r   )rz   rY   rS   r   r   rZ   r[   r\   rW   r   r   r+   r   r   ry   r]   r   r   r   r   r     s    
z-VarlenMTileScheduler.__new_from_mlir_values__r   r   )"r   r   r   r   r   rH   r   r   r   r.   r+   r   r   r   r`   rP   r/   rd   rQ   r4   r   rI   r   r   r,   r   r   r   r   r   r   r   r   r   r   r   r   r   r   K  s    U
	


,Kr   )#typingr   r   dataclassesr   enumr   r+   cutlass.cuter   r   r   r   r	   quack.utilsr   quack.fast_mathr
   quack.pipeliner   quack.cute_dsl_utilsr   r   r   r   rQ   r3   r#   r$   r/   r5   r   r   r   r   r   r   r   r   <module>   sH   
  =
 6