o
    iFd                     @   s   d dl mZmZ d dlmZmZ d dlZd dlmZ d dlm	Z	 d dl
m  mZ d dlmZmZ eG dd dZeG dd	 d	eZG d
d dZG dd dZG dd dZG dd dZdS )    )OptionalTuple)	dataclassfieldsN)Int32)
FastDivmodclzc                   @   s   e Zd Zdd Zdd ZdS )
ParamsBasec                    s`    fddt  D }dd |D }g g } _|D ]}t|}||7 } jt| q|S )Nc                    s   g | ]}t  |jqS  )getattrname.0fieldselfr
   T/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/cute/tile_scheduler.py
<listcomp>   s    z6ParamsBase.__extract_mlir_values__.<locals>.<listcomp>c                 S   s   g | ]
}t |tjs|qS r
   
isinstancecutlass	Constexpr)r   fr
   r
   r   r          )r   _values_posr   extract_mlir_valuesappendlen)r   
all_fieldsnon_constexpr_fieldsvaluesobj
obj_valuesr
   r   r   __extract_mlir_values__   s   
z"ParamsBase.__extract_mlir_values__c                    s    fddt  D }dd | D }dd | D }t|  jD ]\\}}}t||d | ||< ||d  }q% jdi ||S )Nc                    s   i | ]
}|j t |j qS r
   )r   r   r   r   r
   r   
<dictcomp>   r   z7ParamsBase.__new_from_mlir_values__.<locals>.<dictcomp>c                 S   s"   i | ]\}}t |tjr||qS r
   r   r   nr   r
   r
   r   r$      s   " c                 S   s"   i | ]\}}t |tjs||qS r
   r   r%   r
   r
   r   r$      s
    r
   )r   itemszipr   r   new_from_mlir_values	__class__)r   r    r   constexpr_fieldsr   r   r   n_itemsr
   r   r   __new_from_mlir_values__   s   z#ParamsBase.__new_from_mlir_values__N)__name__
__module____qualname__r#   r-   r
   r
   r
   r   r	      s    
r	   c                   @   s   e Zd ZU eed< eed< eed< eed< eed< eed< eed< ejeeef  ed< d	Z	e
ej ed
< d	Ze
ej ed< dZeje ed< dZeje ed< dZeje ed< dZeje ed< d	S )TileSchedulerArguments	num_blocknum_head	num_batchseqlen_kheaddim	headdim_vtotal_qtile_shape_mnNmCuSeqlensQ	mSeqUsedQ   qhead_per_kvhead_packgqa   element_sizeFis_persistentlpt)r.   r/   r0   r   __annotations__r   r   r   intr:   r   cuteTensorr;   r=   r?   r@   boolrA   r
   r
   r
   r   r1   &   s   
 r1   c                	   @   s   e Zd ZeG dd deZddddejfddZe	dddde
d	efd
dZe	dddded	d fddZe	dddded	eeeef fddZdddd	ejjfddZdddddZdddddZdddddZdd Zdd ZdS )SingleTileSchedulerc                   @   sD   e Zd ZU eed< eed< eed< eddddedd fdd	ZdS )
zSingleTileScheduler.Paramsr2   r3   r4   Nlocipargsreturnc                C   s   t | j| j| jS N)rG   Paramsr2   r3   r4   rK   rI   rJ   r
   r
   r   create?   s   z!SingleTileScheduler.Params.create)r.   r/   r0   r   rB   staticmethodr1   rP   r
   r
   r
   r   rN   9      
 rN   NrH   	blk_coordc                C   s   || _ d| _|| _|| _d S )NT)
_blk_coord_is_first_block_loc_ip)r   rS   rI   rJ   r
   r
   r   __init__E   s   
zSingleTileScheduler.__init__rK   rL   c                C      t jj| ||dS NrH   )rG   rN   rP   rO   r
   r
   r   to_underlying_argumentsK      z+SingleTileScheduler.to_underlying_argumentsparamsc                C   s   t j }t|||dS rZ   )rD   arch	block_idxrG   )r]   rI   rJ   rS   r
   r
   r   rP   O   s   
zSingleTileScheduler.createc                C   s   | j | j| jfS rM   )r2   r3   r4   r]   rI   rJ   r
   r
   r   get_grid_shapeU   s   z"SingleTileScheduler.get_grid_shapec                C   s   t j| j| jS rM   )r   utilsWorkTileInforT   rU   r   rI   rJ   r
   r
   r   get_current_work^   s   z$SingleTileScheduler.get_current_workc                C      | j ||dS rZ   re   rd   r
   r
   r   initial_work_tile_infoa      z*SingleTileScheduler.initial_work_tile_infoc                C      d S rM   r
   rd   r
   r
   r   prefetch_next_workd      z&SingleTileScheduler.prefetch_next_workc                C   
   d| _ d S NFrU   rd   r
   r
   r   advance_to_next_workg   s   
z(SingleTileScheduler.advance_to_next_workc                 C   s@   g g }| _ | jfD ]}t|}||7 }| j t| q
|S rM   )r   rT   r   r   r   r   r   r    r!   r"   r
   r
   r   r#   j   s   
z+SingleTileScheduler.__extract_mlir_values__c              	   C   sX   g }t | jg| jD ]\}}|t||d |  ||d  }q
tt|d| jiS NrI   )	r(   rT   r   r   r   r)   rG   tuplerV   r   r    obj_listr!   r,   r
   r
   r   r-   r   s
   z,SingleTileScheduler.__new_from_mlir_values__)r.   r/   r0   r   r	   rN   rD   CoordrX   rQ   r1   r[   rP   r   r   ra   r   rb   rc   re   rh   rk   rp   r#   r-   r
   r
   r
   r   rG   8   s,    rG   c                	   @   s   e Zd ZeG dd deZddddedededefd	d
Ze	dddde
defddZe	ddddedd fddZe	ddddedeeeef fddZddddejjfddZdddddZdddddZdddddZdd Zdd ZdS ) StaticPersistentTileSchedulerc                   @   sD   e Zd ZU eed< eed< eed< eddddedd fdd	ZdS )
z$StaticPersistentTileScheduler.Paramsnum_block_divmodnum_head_divmodtotal_blocksNrH   rK   rL   c                C   s0   | j | j | j }tt| j t| j|S rM   )r2   r3   r4   rw   rN   r   rP   )rK   rI   rJ   rz   r
   r
   r   rP      s   z+StaticPersistentTileScheduler.Params.create)	r.   r/   r0   r   rB   r   rQ   r1   rP   r
   r
   r
   r   rN   {   rR   rN   NrH   rx   ry   rz   tile_idxc                C   s(   || _ || _|| _|| _|| _|| _d S rM   )rx   ry   rz   	_tile_idxrV   rW   )r   rx   ry   rz   r{   rI   rJ   r
   r
   r   rX      s   

z&StaticPersistentTileScheduler.__init__rK   rL   c                C   rY   rZ   )rw   rN   rP   rO   r
   r
   r   r[      r\   z5StaticPersistentTileScheduler.to_underlying_argumentsr]   c                C   s(   t j d }t| j| j| j|||dS Nr   rH   )rD   r^   r_   rw   rx   ry   rz   r]   rI   rJ   r{   r
   r
   r   rP      s   z$StaticPersistentTileScheduler.createc                C   s.   t j }| }t || jtdtdfS Nr<   )r   rb   HardwareInfoget_device_multiprocessor_countminrz   r   )r]   rI   rJ   hardware_infosm_countr
   r
   r   ra      s   
z,StaticPersistentTileScheduler.get_grid_shapec                C   sN   | j | j\}}| j|\}}| j| jk }tjt|t|t|f|S rM   )	rx   divmodr|   ry   rz   r   rb   rc   r   )r   rI   rJ   hn_idxr_   	batch_idxhead_idxis_validr
   r
   r   re      s   z.StaticPersistentTileScheduler.get_current_workc                C   rf   rZ   rg   rd   r
   r
   r   rh      ri   z4StaticPersistentTileScheduler.initial_work_tile_infoc                C   rj   rM   r
   rd   r
   r
   r   rk      rl   z0StaticPersistentTileScheduler.prefetch_next_workc                C   s   |  j tj d 7  _ d S )Nr   )r|   rD   r^   grid_dimrd   r
   r
   r   rp      s   z2StaticPersistentTileScheduler.advance_to_next_workc                 C   sL   g g }| _ | j| j| j| jfD ]}t|}||7 }| j t| q|S rM   )	r   rx   ry   rz   r|   r   r   r   r   rq   r
   r
   r   r#      s   
z5StaticPersistentTileScheduler.__extract_mlir_values__c              	   C   sd   g }t | j| j| j| jg| jD ]\}}|t||d |  ||d  }qt	t
|d| jiS rr   )r(   rx   ry   rz   r|   r   r   r   r)   rw   rs   rV   rt   r
   r
   r   r-      s   z6StaticPersistentTileScheduler.__new_from_mlir_values__)r.   r/   r0   r   r	   rN   r   r   rX   rQ   r1   r[   rP   r   ra   r   rb   rc   re   rh   rk   rp   r#   r-   r
   r
   r
   r   rw   z   s@    

rw   c                   @   s  e Zd ZeG dd deZddddedededed	ed
ededefddZe	dddde
defddZe	ejddddedd fddZe	ddddedeeeef fddZejddddejjfddZdddddZdddddZdddddZd d! Zd"d# ZdS )$SingleTileLPTSchedulerc                   @   sj   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eejddd	d
e	dd fddZ
dS )zSingleTileLPTScheduler.Paramsrz   rx   ry   l2_minor_divmodl2_major_divmodl2_minor_residual_divmodnum_hb_quotientNrH   rK   rL   c          
      C   s   | j | j| j  | j }|}d}dd }||k rdnd||| > }| j| j | }| j| j | }	tj| j| j | j t	
| jt	
| jt	
|t	
|| j t	
t|	dt|dS )N   c                 S   s   dt |  S )N   )r   )r&   r
   r
   r   <lambda>   s    z6SingleTileLPTScheduler.Params.create.<locals>.<lambda>r<   )rz   rx   ry   r   r   r   r   )r5   r6   r7   r?   r3   r4   r   rN   r2   r   rP   maxr   )
rK   rI   rJ   size_one_kv_headsize_one_headsize_l2
log2_floorswizzler   num_hb_remainderr
   r
   r   rP      s$   

z$SingleTileLPTScheduler.Params.create)r.   r/   r0   r   rB   r   rQ   rD   jitr1   rP   r
   r
   r
   r   rN      s    
 rN   NrH   rz   rx   ry   r   r   r   r   r{   c	                C   s@   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	d S rM   )
rz   rx   ry   r   r   r   r   r|   rV   rW   )r   rz   rx   ry   r   r   r   r   r{   rI   rJ   r
   r
   r   rX   
  s   
zSingleTileLPTScheduler.__init__rK   rL   c                C   rY   rZ   )r   rN   rP   rO   r
   r
   r   r[   #  r\   z.SingleTileLPTScheduler.to_underlying_argumentsr]   c                C   s8   t j d }t| j| j| j| j| j| j	| j
|||d
S r}   )rD   r^   r_   r   rz   rx   ry   r   r   r   r   r~   r
   r
   r   rP   '  s   zSingleTileLPTScheduler.createc                C   s   | j tdtdfS r   )rz   r   r`   r
   r
   r   ra   9  s   z%SingleTileLPTScheduler.get_grid_shapec                C   s   | j | j\}}d\}}|| jk r| j|\}}n| j|\}}|| jj | }| j|\}}	| jjd | }| j| j	k }
t
jt|t|	t|f|
S )N)r   r   r<   )r   r   r|   r   r   r   divisorry   rx   rz   r   rb   rc   r   )r   rI   rJ   bidhbl2_modblockbidhb_residualbidhb_actualr   r   r   r
   r
   r   re   B  s   
z'SingleTileLPTScheduler.get_current_workc                C   rf   rZ   rg   rd   r
   r
   r   rh   V  ri   z-SingleTileLPTScheduler.initial_work_tile_infoc                C   rj   rM   r
   rd   r
   r
   r   rk   Y  rl   z)SingleTileLPTScheduler.prefetch_next_workc                C   s   | j | _d S rM   )rz   r|   rd   r
   r
   r   rp   \  s   z+SingleTileLPTScheduler.advance_to_next_workc                 C   s\   g g }| _ | j| j| j| j| j| j| j| jfD ]}t	
|}||7 }| j t| q|S rM   )r   rz   rx   ry   r   r   r   r   r|   r   r   r   r   rq   r
   r
   r   r#   `  s   

z.SingleTileLPTScheduler.__extract_mlir_values__c              	   C   st   g }t | j| j| j| j| j| j| j| jg| j	D ]\}}|
t||d |  ||d  }qtt|d| jiS rr   )r(   rz   rx   ry   r   r   r   r   r|   r   r   r   r)   r   rs   rV   rt   r
   r
   r   r-   q  s    
z/SingleTileLPTScheduler.__new_from_mlir_values__)r.   r/   r0   r   r	   rN   r   r   rX   rQ   r1   r[   rD   r   rP   r   ra   r   rb   rc   re   rh   rk   rp   r#   r-   r
   r
   r
   r   r      sT    5	
r   c                   @   sd  e Zd ZeG dd deZ					d,dddded	ed
ededeej	 deej	 de
jeeg de
je de
je fddZeddddedefddZeddddedd fddZeddddedeeeef fddZejdededefddZejdddde
jjfd d!Zdddd"d#Zdddd$d%Zdddd&d'Zd(d) Zd*d+ ZdS )-SingleTileVarlenSchedulerc                   @   s   e Zd ZU eed< eed< eed< eed< ejeeef  ed< dZ	e
ej ed< dZe
ej ed< d	Zeje ed
< dZeje ed< eejddddedd fddZdS )z SingleTileVarlenScheduler.Paramsr3   r4   r8   max_kvblock_in_l2r9   Nr:   r;   r<   r=   FrA   rH   rK   rL   c                C   sP   d}|| j | j | j | jd   }tj| j| j| j|| j| j	| j
| j| jd	S )Nr   r<   )	r3   r4   r8   r   r9   r:   r;   r=   rA   )r6   r7   r?   r9   r   rN   r3   r4   r8   r:   r;   r=   rA   )rK   rI   rJ   r   r   r
   r
   r   rP     s    z'SingleTileVarlenScheduler.Params.create)r.   r/   r0   r   rB   r   r   r   rC   r:   r   rD   rE   r;   r=   rA   rF   rQ   r   r1   rP   r
   r
   r
   r   rN     s$   
 rN   N   r   r<   FrH   r3   r4   r   r{   r:   r;   r9   r=   rA   c
                C   sh   || _ || _|| _|| _|| _| jd us| jd usJ d|| _|| _|	| _|| _d| _	|
| _
|| _d S )Nz9At least one of mCuSeqlensQ or mSeqUsedQ must be providedT)r3   r4   r   r:   r;   r9   r=   rA   r|   rU   rV   rW   )r   r3   r4   r   r{   r:   r;   r9   r=   rA   rI   rJ   r
   r
   r   rX     s   
z"SingleTileVarlenScheduler.__init__rK   rL   c                C   rY   rZ   )r   rN   rP   rO   r
   r
   r   r[     r\   z1SingleTileVarlenScheduler.to_underlying_argumentsr]   c                C   s<   t j d }t| j| j| j|| j| j| j	| j
| j||dS )Nr   )r:   r;   r9   r=   rA   rI   rJ   )rD   r^   r_   r   r3   r4   r   r:   r;   r9   r=   rA   r~   r
   r
   r   rP     s   z SingleTileVarlenScheduler.createc                C   s<   | j | j| jd d   | jd  }|| j tdtdfS )Nr   r<   )r8   r4   r9   r3   r   )r]   rI   rJ   total_blocks_maxr
   r
   r   ra     s   z(SingleTileVarlenScheduler.get_grid_shapelane
bidb_startc                 C   s   || }t | jd urtd}|| jk r| j| }n!| jd us"J td}|| jkr0| j| }tjj|dd}|| }t | j	dkrI|| j	9 }|| jk r_|tjj
d k r_t|| jd S tdS )Nr   r<   )offset)r   
const_exprr;   r   r4   r:   rD   r^   shuffle_sync_downr=   	WARP_SIZEceil_divr9   )r   r   r   r   seqlencur_cu_seqlennext_cu_seqlenr
   r
   r   _get_num_m_blocks  s&   




z+SingleTileVarlenScheduler._get_num_m_blocksc                C   s  t j }| j|dd}t||}t j|t jjd }|| j }t	dt	dt	d}}	}
| j
}||krq|
t jjd 7 }
|
| jkrNt	| j}
|d }n| j||
d}t||}t j|t jjd }||| j 7 }||ks7d}|
| jkrt	dt	dt	| j}}	}
n||| j  }t jt j||| j  |k}|
|7 }
|dkrdnt j||d }t j||}|| || j  }t| jr:|| jd  | jd  }|d | jkrdn|d | jkrdn|d | jkrdn
|d | jkrdnd}t|| j}|| }|| }|||  }||d  | jkr|n| j||  }|| }|||  }|| | }	|d | }n
|| }	||	|  }| joL|
| jk }tjt	|t	|	t	|
f|S )	Nr   )r   r<   F         r>   )rD   r^   lane_idxr   rb   warp_prefix_sumshuffle_syncr   r3   r   r|   r4   popcvote_ballot_syncr   r   rA   r9   r   r   rU   rc   )r   rI   rJ   r   num_m_blocksnum_m_blocks_cumulativem_blocks_in_groupgroup_end_tiler   r   r   next_tile_idxr   group_start_tilebatch_idx_in_groupnum_m_blocks_prev_lanemh_blocknum_n_blocksnheads_in_l2mh_in_l2section_idxr   nheads_in_this_sectionhead_idx_residualr
   r
   r   re     sj   





 L&z*SingleTileVarlenScheduler.get_current_workc                C   rf   rZ   rg   rd   r
   r
   r   rh   E  ri   z0SingleTileVarlenScheduler.initial_work_tile_infoc                C   rj   rM   r
   rd   r
   r
   r   rk   H  rl   z,SingleTileVarlenScheduler.prefetch_next_workc                C   rm   rn   ro   rd   r
   r
   r   rp   K  s   
z.SingleTileVarlenScheduler.advance_to_next_workc                 C   sT   g g }| _ | j| j| j| j| j| jfD ]}t|}||7 }| j 	t
| q|S rM   )r   r3   r4   r   r|   r:   r;   r   r   r   r   rq   r
   r
   r   r#   O  s   
z1SingleTileVarlenScheduler.__extract_mlir_values__c              	   C   sx   g }t | j| j| j| j| j| jg| jD ]\}}|t	
||d |  ||d  }qtt|| j| j| j| jdS )N)r9   r=   rA   rI   )r(   r3   r4   r   r|   r:   r;   r   r   r   r)   r   rs   r9   r=   rA   rV   rt   r
   r
   r   r-   ^  s(   z2SingleTileVarlenScheduler.__new_from_mlir_values__)NNr   r<   F)r.   r/   r0   r   r	   rN   r   r   rD   rE   r   r   rC   rF   rX   rQ   r1   r[   rP   r   ra   r   r   rb   rc   re   rh   rk   rp   r#   r-   r
   r
   r
   r   r     sf    $	

Hr   )typingr   r   dataclassesr   r   r   cutlass.cuterD   r   flash_attn.cute.utilsrb   flash_attn.cute.fast_mathr   r   r	   r1   rG   rw   r   r   r
   r
   r
   r   <module>   s    Bd (