o
    i                     @   s   d dl mZ d dlmZ d dlZd dlmZ d dlmZm	Z	m
Z
 d dlmZmZmZmZ d dlmZmZ G dd dZd	ed
efddZeddG dd deZdS )    )Optional)	dataclassN)BooleanInt32if_generate)PipelineAsyncPipelineStateCooperativeGrouppipeline_init_wait)PipelineUserType
PipelineOpc                   @   sz   e Zd ZdZdedefddZdddZedefd	d
Z	edefddZ
edefddZdd Zdd Zdd ZdS )PipelineStateSimplea#  
    Pipeline state contains an index and phase bit corresponding to the current position in the circular buffer.
    Use a single Int32 to store both the index and phase bit, then we use divmod to get the
    index and phase. If stages is a power of 2, divmod turns into bit twiddling.
    stagesphase_indexc                 C   s   || _ || _d S N)_stages_phase_index)selfr   r    r   N/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/cute/pipeline.py__init__   s   
zPipelineStateSimple.__init__returnc                 C   s   t | j| jS r   )r   r   r   r   r   r   r   clone   s   zPipelineStateSimple.clonec                 C   s   | j S r   )r   r   r   r   r   r      s   zPipelineStateSimple.stagesc                 C   s   | j | j S r   r   r   r   r   r   r   index$   s   zPipelineStateSimple.indexc                 C   s   | j | j S r   r   r   r   r   r   phase*   s   zPipelineStateSimple.phasec                 C   s   |  j d7  _ d S )N   )r   r   r   r   r   advance3   s   zPipelineStateSimple.advancec                 C   s   | j }| gS r   )r   ir_value)r   r   r   r   r   __extract_mlir_values__E   s   
z+PipelineStateSimple.__extract_mlir_values__c                 C   s   t | jt|d S )Nr   )r   r   r   )r   valuesr   r   r   __new_from_mlir_values__I   s   z,PipelineStateSimple.__new_from_mlir_values__N)r   r   )__name__
__module____qualname____doc__intr   r   r   propertyr   r   r   r   r    r"   r   r   r   r   r      s    
r   typer   c                 C   s8   | t ju rt|t|S | t ju rt|tdS J d)zz
    Creates a pipeline state. Producers are assumed to start with an empty buffer and have a flipped phase bit of 1.
    r   FzBError: invalid PipelineUserType specified for make_pipeline_state.)r   Producerr   r   Consumer)r)   r   r   r   r   make_pipeline_stateM   s
   

r,   T)frozenc                   @   sv   e Zd ZdZe	ddejdededede	de
je fd	d
Zddedee fddZdefddZdefddZdS )PipelineTmaAsyncNoClustera  
    If size(ClusterShape) == 1, PipelineTmaAsync has all threads
    signaling the barrier during consumer_release. This causes a perf regression in FA3
    forward pass (especially hdim 128 causal). We instead implement a version of
    PipelineTmaAsync where only 1 out of 128 threads signals the barrier.

    Assumptions:
    (1) num_consumers % NumThreadsPerWarpGroup == 0
    (2) all 128 threads in the warp group are sync'ed right before calling consumer_release
    Tbarrier_storage
num_stagesproducer_groupconsumer_grouptx_count	init_waitc                 C   sv   t j}t j}||f}||f}	t| jdd|||}
t| jdd| ||	}d}d}t|r3t  t	|
||||S )a  
        This helper function computes any necessary attributes and returns an instance of PipelineTmaAsync.
        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
        :type barrier_storage: cute.Pointer
        :param num_stages: Number of buffer stages for this pipeline
        :type num_stages: Int32
        :param producer_group: CooperativeGroup for the producer agent
        :type producer_group: CooperativeGroup
        :param consumer_group: CooperativeGroup for the consumer agent
        :type consumer_group: CooperativeGroup
        :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
        :type tx_count: int
           )	min_alignN)
r   TmaLoadAsyncThreadr   _make_sync_objectaligncutlass
const_exprr
   r.   )r/   r0   r1   r2   r3   r4   producer_typeconsumer_typeproducerconsumersync_object_fullsync_object_emptydst_rankproducer_maskr   r   r   createg   s*   
z PipelineTmaAsyncNoCluster.createNstatetry_acquire_tokenc                    s6   t |du p|dk fdd  jj j dS )z
        TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks.
        Nr   c                      s    j jjS r   )rB   waitr   r   r   r   rF   r   r   <lambda>       z<PipelineTmaAsyncNoCluster.producer_acquire.<locals>.<lambda>)r   rA   arriver   rD   )r   rF   rG   r   rI   r   producer_acquire   s
   z*PipelineTmaAsyncNoCluster.producer_acquirec                 C   s   dS )zv
        TMA producer commit is a NOP. The transaction barrier signals the commit upon completion of the TMA.
        Nr   rI   r   r   r   producer_commit   s   z)PipelineTmaAsyncNoCluster.producer_commitc                    s*   t tj d d dk fdd dS )z^
        TMA consumer release conditionally signals the empty buffer to the producer.
        r      c                      s    j j jS r   )rB   rL   r   consumer_maskr   rI   r   r   rJ      rK   z<PipelineTmaAsyncNoCluster.consumer_release.<locals>.<lambda>N)r   cutearch
thread_idxrI   r   rI   r   consumer_release   s   z*PipelineTmaAsyncNoCluster.consumer_release)Tr   )r#   r$   r%   r&   staticmethodrQ   Pointerr   r	   r'   r;   	ConstexprboolrE   r   r   r   rM   rN   rT   r   r   r   r   r.   Z   s(    +
r.   )typingr   dataclassesr   r;   cutlass.cuterQ   cutlass.cutlass_dslr   r   r   cutlass.pipeliner   r   r	   r
   r   r   r   r'   r,   r.   r   r   r   r   <module>   s   ?