o
    پiy                     @   s.  d dl mZmZ d dlmZmZ zd dl mZ W n ey'   d dlmZ Y nw d dl	Z	d dl
mZ d dlmZ d dl	mZmZ d dlm  mZ d dlmZ d dlmZ G d	d
 d
e	jjZeG dd dZeG dd deZG dd dZG dd dZG dd dZG dd dZG dd dZdS )    )OptionalTuple)	dataclassfields)overrideN)ir)Int32
const_exprclz)FastDivmodDivisorc                   @   s,   e Zd ZdZedeej dd fddZdS )WorkTileInfozJAltered WorkTileInfo which includes four axes: (block, head, batch, split)valuesreturnc                 C   sD   t |dksJ t| j|d d }t| j|d g}t||S )N   )lencutlassnew_from_mlir_values	_tile_idx_is_valid_tiler   )selfr   new_tile_idxnew_is_valid_tile r   Y/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/tile_scheduler.py__new_from_mlir_values__   s   
z%WorkTileInfo.__new_from_mlir_values__N)	__name__
__module____qualname____doc__r   listr   Valuer   r   r   r   r   r      s    r   c                   @   s   e Zd Zdd Zdd ZdS )
ParamsBasec                    s`    fddt  D }dd |D }g g } _|D ]}t|}||7 } jt| q|S )Nc                    s   g | ]}t  |jqS r   )getattrname.0fieldr   r   r   
<listcomp>#   s    z6ParamsBase.__extract_mlir_values__.<locals>.<listcomp>c                 S   s   g | ]
}t |tjs|qS r   
isinstancer   	Constexpr)r'   fr   r   r   r*   $       )r   _values_posr   extract_mlir_valuesappendr   )r   
all_fieldsnon_constexpr_fieldsr   obj
obj_valuesr   r)   r   __extract_mlir_values__"   s   
z"ParamsBase.__extract_mlir_values__c                    s    fddt  D }dd | D }dd | D }t|  jD ]\\}}}t||d | ||< ||d  }q% jdi ||S )Nc                    s   i | ]
}|j t |j qS r   )r%   r$   r&   r)   r   r   
<dictcomp>-   r/   z7ParamsBase.__new_from_mlir_values__.<locals>.<dictcomp>c                 S   s"   i | ]\}}t |tjr||qS r   r+   r'   nr.   r   r   r   r8   .   s   " c                 S   s"   i | ]\}}t |tjs||qS r   r+   r9   r   r   r   r8   /   s
    r   )r   itemszipr0   r   r   	__class__)r   r   r3   constexpr_fieldsr4   r%   r(   n_itemsr   r)   r   r   ,   s   z#ParamsBase.__new_from_mlir_values__N)r   r   r   r7   r   r   r   r   r   r#       s    
r#   c                   @   s  e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< ejeeef  ed	< d
Z	ejeeef  ed< dZ
eej ed< dZeej ed< dZeje ed< dZeje ed< dZeje ed< dZeje ed< dZeje ed< dZeje ed< dS )TileSchedulerArguments	num_blocknum_head	num_batch
num_splitsseqlen_kheaddim	headdim_vtotal_qtile_shape_mn   rK   cluster_shape_mnNmCuSeqlensQ	mSeqUsedQrK   qhead_per_kvhead_packgqa   element_sizeFis_persistentlptis_split_kvhead_swizzle)r   r   r   r   __annotations__r   r-   r   intrL   rM   r   cuteTensorrN   rO   rQ   rR   boolrS   rT   rU   r   r   r   r   r@   8   s&   
 r@   c                	   @   s   e Zd ZeG dd deZddddedejfddZe	dddd	e
d
efddZe	dddded
d fddZe	dddded
eeeef fddZdddd
efddZdddddZdddddZdddddZdd Zdd ZdS )SingleTileSchedulerc                   @   s   e Zd ZU eed< eed< eed< eed< eed< dZeje	 ed< dZ
ejeeef  ed	< ed
d
ddedd fddZd
S )zSingleTileScheduler.ParamsrA   rB   rC   rD   num_splits_divmodFrT   rJ   rL   Nlocipargsr   c             	   C   s(   t | j| j| j| jt| j| j| jS N)	r[   ParamsrA   rB   rC   rD   r   rT   rL   r`   r^   r_   r   r   r   createY   s   z!SingleTileScheduler.Params.create)r   r   r   r   rV   r   rT   r   r-   rZ   rL   r   rW   staticmethodr@   rd   r   r   r   r   rb   O   s   
 rb   Nr]   params	blk_coordc                C   s"   || _ || _d| _|| _|| _d S NT)rf   
_blk_coord_is_first_block_loc_ip)r   rf   rg   r^   r_   r   r   r   __init__g   
   
zSingleTileScheduler.__init__r`   r   c                C      t jj| ||dS Nr]   )r[   rb   rd   rc   r   r   r   to_underlying_argumentsn      z+SingleTileScheduler.to_underlying_argumentsc                C   s   t j }t| |||dS rp   )rX   arch	block_idxr[   )rf   r^   r_   rg   r   r   r   rd   r   s   
zSingleTileScheduler.createc                C   s:   | j d dksJ dt| j| j d | j| j | jfS )NrK   z*Only cluster_shape_mn[1] == 1 is supportedr   )rL   rX   round_uprA   rB   rD   rC   rf   r^   r_   r   r   r   get_grid_shapex   s
   
z"SingleTileScheduler.get_grid_shapec                C   sH   | j \}}}t| jjrt|| jj\}}ntd}t||||f| jS Nr   )	ri   r	   rf   rT   divmodr\   r   r   rj   )r   r^   r_   rt   head_idx	batch_idx	split_idxr   r   r   get_current_work   s   
z$SingleTileScheduler.get_current_workc                C      | j ||dS rp   r}   r   r^   r_   r   r   r   initial_work_tile_info      z*SingleTileScheduler.initial_work_tile_infoc                C      d S ra   r   r   r   r   r   prefetch_next_work      z&SingleTileScheduler.prefetch_next_workc                C   
   d| _ d S NFrj   r   r   r   r   advance_to_next_work   s   
z(SingleTileScheduler.advance_to_next_workc                 C   D   g g }| _ | j| jfD ]}t|}||7 }| j t| q|S ra   )r0   rf   ri   r   r1   r2   r   r   r   r5   r6   r   r   r   r7         
z+SingleTileScheduler.__extract_mlir_values__c              	   C   \   g }t | j| jg| jD ]\}}|t||d |  ||d  }qtt|d| j	iS Nr^   )
r<   rf   ri   r0   r2   r   r   r[   tuplerk   r   r   obj_listr5   r?   r   r   r   r      s
   z,SingleTileScheduler.__new_from_mlir_values__)r   r   r   r   r#   rb   rX   Coordrm   re   r@   rq   rd   r   r   rw   r   r}   r   r   r   r7   r   r   r   r   r   r[   N   s,    r[   c                	   @   s   e Zd ZeG dd deZddddedefddZedddd	e	d
efddZ
edddded
d fddZedddded
eeeef fddZdddd
efddZdddddZdddddZdddddZdd Zdd ZdS )StaticPersistentTileSchedulerc                   @   sD   e Zd ZU eed< eed< eed< eddddedd fdd	ZdS )
z$StaticPersistentTileScheduler.Paramsnum_block_divmodnum_head_divmodtotal_blocksNr]   r`   r   c                C   s,   | j | j | j }tt| j t| j|S ra   )rA   rB   rC   r   rb   r   )r`   r^   r_   r   r   r   r   rd      s   z+StaticPersistentTileScheduler.Params.create)	r   r   r   r   rV   r   re   r@   rd   r   r   r   r   rb      s   
 rb   Nr]   rf   tile_idxc                C      || _ || _|| _|| _d S ra   rf   r   rk   rl   r   rf   r   r^   r_   r   r   r   rm         
z&StaticPersistentTileScheduler.__init__r`   r   c                C   ro   rp   )r   rb   rd   rc   r   r   r   rq      rr   z5StaticPersistentTileScheduler.to_underlying_argumentsc                C      t j d }t| |||dS Nr   r]   )rX   rs   rt   r   rf   r^   r_   r   r   r   r   rd      s   z$StaticPersistentTileScheduler.createc                C   s.   t j }| }t || jtdtdfS NrK   )r   utilsHardwareInfoget_device_multiprocessor_countminr   r   )rf   r^   r_   hardware_infosm_countr   r   r   rw      s   
z,StaticPersistentTileScheduler.get_grid_shapec                C   sV   t | j| jj\}}t || jj\}}| j| jjk }tt|t|t|tdf|S rx   )ry   r   rf   r   r   r   r   r   )r   r^   r_   hn_idxrt   r{   rz   is_validr   r   r   r}      s   z.StaticPersistentTileScheduler.get_current_workc                C   r~   rp   r   r   r   r   r   r      r   z4StaticPersistentTileScheduler.initial_work_tile_infoc                C   r   ra   r   r   r   r   r   r      r   z0StaticPersistentTileScheduler.prefetch_next_workc                C   s   |  j tj d 7  _ d S rx   )r   rX   rs   grid_dimr   r   r   r   r      s   z2StaticPersistentTileScheduler.advance_to_next_workc                 C   r   ra   r0   rf   r   r   r1   r2   r   r   r   r   r   r7      r   z5StaticPersistentTileScheduler.__extract_mlir_values__c              	   C   r   r   )
r<   rf   r   r0   r2   r   r   r   r   rk   r   r   r   r   r      s   
z6StaticPersistentTileScheduler.__new_from_mlir_values__)r   r   r   r   r#   rb   r   rm   re   r@   rq   rd   r   rw   r   r}   r   r   r   r7   r   r   r   r   r   r      s,    
r   c                	   @   s  e Zd ZeG dd deZddddededefdd	Zedddd
e	defddZ
eejddddedd fddZeddddedeeeef fddZejddddefddZdddddZdddddZdddddZdd Zdd ZdS )SingleTileLPTSchedulerc                   @   s   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< eed
< dZeje	 ed< e
ejddddedd fddZdS )zSingleTileLPTScheduler.Paramsr   rD   rA   l2_minorr   r   l2_minor_divmodl2_major_divmodl2_minor_residual_divmodnum_hb_quotientFrT   Nr]   r`   r   c          
      C   s   | j | j| j  | j }|}d}dd }||k rdnd||| > }| j| j | }| j| j | }	tj| j| j | j | jt	|t
| jt
| jt
|t
|| j t
t|	dt	|| j| jdS )N   c                 S      dt |  S N   r
   r:   r   r   r   <lambda>      z6SingleTileLPTScheduler.Params.create.<locals>.<lambda>rK   )r   rA   r   r   r   r   r   r   r   rD   rT   )rE   rF   rG   rQ   rB   rC   r   rb   rA   r   r   maxrD   rT   )
r`   r^   r_   size_one_kv_headsize_one_headsize_l2
log2_floorswizzler   num_hb_remainderr   r   r   rd     s,   z$SingleTileLPTScheduler.Params.create)r   r   r   r   rV   r   rT   r   r-   rZ   re   rX   jitr@   rd   r   r   r   r   rb      s(   
 rb   Nr]   rf   r   r|   c                C   s"   || _ || _|| _|| _|| _d S ra   )rf   r   
_split_idxrk   rl   r   rf   r   r|   r^   r_   r   r   r   rm   1  rn   zSingleTileLPTScheduler.__init__r`   r   c                C   ro   rp   )r   rb   rd   rc   r   r   r   rq   8  rr   z.SingleTileLPTScheduler.to_underlying_argumentsc                C   "   t j \}}}t| ||||dS rp   )rX   rs   rt   r   rf   r^   r_   r   r|   _r   r   r   rd   <  s   zSingleTileLPTScheduler.createc                C   s   | j | jtdfS r   )r   rD   r   rv   r   r   r   rw   C  s   z%SingleTileLPTScheduler.get_grid_shapec                C   s   | j }t| j|j\}}d\}}||jk rt||j\}}nt||j\}}||j | }t||j\}	}
|j	d | }| j|j
k }tt|t|
t|	t| jf|S )Nr   r   rK   )rf   ry   r   r   r   r   r   r   r   rA   r   r   r   r   )r   r^   r_   rf   bidhbl2_modblockbidhb_residualbidhb_actualr{   rz   r   r   r   r   r}   L  s   
z'SingleTileLPTScheduler.get_current_workc                C   r~   rp   r   r   r   r   r   r   a  r   z-SingleTileLPTScheduler.initial_work_tile_infoc                C   r   ra   r   r   r   r   r   r   d  r   z)SingleTileLPTScheduler.prefetch_next_workc                C      | j j| _d S ra   rf   r   r   r   r   r   r   r   g     z+SingleTileLPTScheduler.advance_to_next_workc                 C   H   g g }| _ | j| j| jfD ]}t|}||7 }| j t| q|S ra   r0   rf   r   r   r   r1   r2   r   r   r   r   r   r7   k     
z.SingleTileLPTScheduler.__extract_mlir_values__c              	   C   sb   g }t | j| j| jg| jD ]\}}|t||d |  ||d  }q| jt	|d| j
iS r   )r<   rf   r   r   r0   r2   r   r   r=   r   rk   r   r   r   r   r   s  s
    z/SingleTileLPTScheduler.__new_from_mlir_values__)r   r   r   r   r#   rb   r   rm   re   r@   rq   rX   r   rd   r   rw   r   r}   r   r   r   r7   r   r   r   r   r   r      s0    2r   c                	   @   s  e Zd ZeG dd deZddddedefddZedddd	e	d
efddZ
eejdddded
d fddZedddded
eeeef fddZejdddd
ejjfddZdddddZdddddZdddddZdd Zdd ZdS )SingleTileLPTBwdSchedulerc                   @   s   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< d	Zeje	e
e
f  ed
< dZeje ed< eejddddedd fddZdS )z SingleTileLPTBwdScheduler.Paramsr   rA   r   r   r   r   r   r   rJ   rL   TsptNr]   r`   r   c                C   s   d}| j | j| j  | j }d}|| }dd }||k rdnd||| > }| j| j | }	| j| j | }
t| j| j	d }t
j|| j	d  | j | j |t|t| jt|t|| tt|
dt|	| j	| jd
S )Nr   r   c                 S   r   r   r
   r   r   r   r   r     r   z9SingleTileLPTBwdScheduler.Params.create.<locals>.<lambda>rK   )
r   rA   r   r   r   r   r   r   rL   r   )rE   rF   rG   rQ   rB   rC   rX   ceil_divrA   rL   r   rb   r   r   r   rS   )r`   r^   r_   r   size_one_qdo_headsize_one_dqaccum_headr   r   r   r   r   rA   r   r   r   rd     s6   
z'SingleTileLPTBwdScheduler.Params.create)r   r   r   r   rV   r   rL   r   r-   r   rW   r   rZ   re   rX   r   r@   rd   r   r   r   r   rb   |  s&   
 rb   Nr]   rf   r   c                C   r   ra   r   r   r   r   r   rm     r   z"SingleTileLPTBwdScheduler.__init__r`   r   c                C   ro   rp   )r   rb   rd   rc   r   r   r   rq     rr   z1SingleTileLPTBwdScheduler.to_underlying_argumentsc                C   r   r   )rX   rs   rt   r   r   r   r   r   rd     s   z SingleTileLPTBwdScheduler.createc                C   s   | j tdtdfS r   )r   r   rv   r   r   r   rw     s   z(SingleTileLPTBwdScheduler.get_grid_shapec                C   s   | j | jjd  }| j}t||j\}}d\}}||jk r&t||j\}}nt||j\}}||j | }	t|	|j	\}
}| j |j
k }tj }||jd  |d  }t|jr`|jd | }tt|t|t|
tdf|S )Nr   r   rK   )r   rf   rL   ry   r   r   r   r   r   r   r   rX   rs   block_in_cluster_idxr   r	   r   rA   r   r   )r   r^   r_   cluster_idxrf   r   r   r   r   r   r{   rz   r   bidx_in_clusterr   r   r   r}     s   

"z*SingleTileLPTBwdScheduler.get_current_workc                C   r~   rp   r   r   r   r   r   r     r   z0SingleTileLPTBwdScheduler.initial_work_tile_infoc                C   r   ra   r   r   r   r   r   r     r   z,SingleTileLPTBwdScheduler.prefetch_next_workc                C   r   ra   r   r   r   r   r   r     r   z.SingleTileLPTBwdScheduler.advance_to_next_workc                 C   r   ra   r   r   r   r   r   r7     r   z1SingleTileLPTBwdScheduler.__extract_mlir_values__c              	   C   s^   g }t | j| jg| jD ]\}}|t||d |  ||d  }q| jt|d| j	iS r   )
r<   rf   r   r0   r2   r   r   r=   r   rk   r   r   r   r   r     s
   z2SingleTileLPTBwdScheduler.__new_from_mlir_values__)r   r   r   r   r#   rb   r   rm   re   r@   rq   rX   r   rd   r   rw   r   r   r   r}   r   r   r   r7   r   r   r   r   r   r   {  s0    /r   c                	   @   s  e Zd ZeG dd deZddddededefdd	Zedddd
e	defddZ
eddddedd fddZeddddedeeeef fddZejdededefddZejddddefddZdddddZdddddZdddddZdd Zd d! ZdS )"SingleTileVarlenSchedulerc                   @   s   e Zd ZU eed< eed< eed< eed< eed< ejeeef  ed< dZ	e
ej ed< dZe
ej ed	< d
Zeje ed< dZeje ed< dZeje ed< dZeje ed< eejddddedd fddZdS )z SingleTileVarlenScheduler.ParamsrB   rC   rH   rD   max_kvblock_in_l2rI   NrM   rN   rK   rO   FrS   rT   rU   r]   r`   r   c                C   sx   d}|| j | j | j | jd   }| jd us | jd us J dtj| j| j	| j
| j|| j| j| j| j| j| j| jdS )Nr   rK   z9At least one of mCuSeqlensQ or mSeqUsedQ must be provided)rB   rC   rH   rD   r   rI   rM   rN   rO   rS   rT   rU   )rF   rG   rQ   rI   rM   rN   r   rb   rB   rC   rH   rD   rO   rS   rT   rU   )r`   r^   r_   r   r   r   r   r   rd     s*   z'SingleTileVarlenScheduler.Params.create)r   r   r   r   rV   r   r-   r   rW   rM   r   rX   rY   rN   rO   rS   rZ   rT   rU   re   r   r@   rd   r   r   r   r   rb     s*   
 rb   Nr]   rf   r   r|   c                C   s(   || _ || _|| _d| _|| _|| _d S rh   )rf   r   r   rj   rk   rl   r   r   r   r   rm   !  s   
z"SingleTileVarlenScheduler.__init__r`   r   c                C   ro   rp   )r   rb   rd   rc   r   r   r   rq   )  rr   z1SingleTileVarlenScheduler.to_underlying_argumentsc                C   r   rp   )rX   rs   rt   r   r   r   r   r   rd   -  s   z SingleTileVarlenScheduler.createc                C   s:   | j | j| jd d   | jd  }|| j | jtdfS )Nr   rK   )rH   rC   rI   rB   rD   r   )rf   r^   r_   total_blocks_maxr   r   r   rw   3  s   z(SingleTileVarlenScheduler.get_grid_shapelane
bidb_startc                 C   s   | j }|| }t|jd urtd}||jk r|j| }n!|jd us%J td}||jkr3|j| }tjj	|dd}|| }t|j
dkrL||j
9 }||jk rb|tjjd k rbt||jd S tdS )Nr   rK   )offset)rf   r   r	   rN   r   rC   rM   rX   rs   shuffle_sync_downrO   	WARP_SIZEr   rI   )r   r   r   rf   r{   seqlencur_cu_seqlennext_cu_seqlenr   r   r   _get_num_m_blocks?  s(   




z+SingleTileVarlenScheduler._get_num_m_blocksc                C   s  | j }tj }| j|dd}t||}tj|tjjd }||j	 }t
dt
dt
d}	}
}| j}||krt|tjjd 7 }||jkrQt
|j}|d }n| j||d}t||}tj|tjjd }|||j	 7 }||ks:d}||jkrt
dt
dt
|j}	}
}n|||j	  }tjtj|||j	  |k}||7 }|dkrdntj||d }tj||}|| ||j	  }t|jp|jrK||jd  |j |jd  }|d |jkrdn|d |jkrdn|d |jkrdn|d |jkrdnd}t||j	}|| }|| }|||  }||d  |j	kr&|n|j	||  }|| }	||	|  }|| | }
t|jrJ|d |	 }	n
|| }
||
|  }	| jo]||jk }t|jrg| jnt
d}tt
|	t
|
t
||f|S )	Nr   )r   rK   F         rP   )rf   rX   rs   lane_idxr   r   warp_prefix_sumshuffle_syncr   rB   r   r   rC   popcvote_ballot_syncr   r	   rS   rU   rI   rO   r   r   rj   rT   r   r   )r   r^   r_   rf   r   num_m_blocksnum_m_blocks_cumulativem_blocks_in_groupgroup_end_tiler   rz   r{   next_tile_idxr   group_start_tilebatch_idx_in_groupnum_m_blocks_prev_lanemh_blocknum_n_blocksnheads_in_l2mh_in_l2section_idxr   nheads_in_this_sectionhead_idx_residualr|   r   r   r   r}   V  s   





 
z*SingleTileVarlenScheduler.get_current_workc                C   r~   rp   r   r   r   r   r   r     r   z0SingleTileVarlenScheduler.initial_work_tile_infoc                C   r   ra   r   r   r   r   r   r     r   z,SingleTileVarlenScheduler.prefetch_next_workc                C   r   r   r   r   r   r   r   r     s   
z.SingleTileVarlenScheduler.advance_to_next_workc                 C   r   ra   r   r   r   r   r   r7     r   z1SingleTileVarlenScheduler.__extract_mlir_values__c              	   C   s`   g }t | j| j| jg| jD ]\}}|t||d |  ||d  }qtt	|d| j
iS r   )r<   rf   r   r   r0   r2   r   r   r   r   rk   r   r   r   r   r     s   z2SingleTileVarlenScheduler.__new_from_mlir_values__)r   r   r   r   r#   rb   r   rm   re   r@   rq   rd   r   rw   rX   r   r   r   r}   r   r   r   r7   r   r   r   r   r   r     s2    )^r   )typingr   r   dataclassesr   r   r   ImportErrortyping_extensionsr   cutlass._mlirr   cutlass.cuterX   r   r	   flash_attn_origin.cute.utilsr    flash_attn_origin.cute.fast_mathr   r   r   r#   r@   r[   r   r   r   r   r   r   r   r   <module>   s0   ]R~{