o
    پi4'                     @   s  d dl mZ d dlmZ d dlZd dlmZ d dlmZmZm	Z	 d dl
mZ d dlmZmZmZmZ d dlmZmZ d dlmZ d d	lmZ dd
eej fddZdefddZG dd dZdedefddZeddG dd deZeddG dd deZdS )    )Optional)	dataclassN)BooleanInt32
const_expr)if_generate)PipelineAsyncPipelineStateAgentCooperativeGroup)PipelineUserType
PipelineOp)PipelineTmaAsync)PipelineTmaUmmacta_layout_vmnkc                 C   s<   t j  | du st | dkrttj dS ttj dS )zG
    Fences the mbarrier init and syncs the threadblock or cluster
    N   )cutearchmbarrier_init_fencesize_syncr
   ThreadBlockThreadBlockCluster)r    r   S/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/pipeline.pypipeline_init_wait   s   
r   groupc                 C   sT   | t ju r	td| t ju rtj  dS | t ju r&tj  tj	  dS J d)z,
    Syncs all threads within an agent.
    zError: Not supported.Fz[Error: No explicit sync instruction exists. Please use barriers (named / mbarrier) instead.N)
r
   ThreadNotImplementedErrorr   r   r   sync_threadsr   cluster_arrive_relaxedcluster_wait)r   r   r   r   r       s   



r   c                   @   sz   e Zd ZdZdedefddZdddZedefd	d
Z	edefddZ
edefddZdd Zdd Zdd ZdS )PipelineStateSimplea#  
    Pipeline state contains an index and phase bit corresponding to the current position in the circular buffer.
    Use a single Int32 to store both the index and phase bit, then we use divmod to get the
    index and phase. If stages is a power of 2, divmod turns into bit twiddling.
    stagesphase_indexc                 C   s   || _ || _d S N)_stages_phase_index)selfr#   r$   r   r   r   __init__8   s   
zPipelineStateSimple.__init__returnc                 C   s   t | j| jS r%   )r"   r#   r'   r(   r   r   r   clone?   s   zPipelineStateSimple.clonec                 C   s   | j S r%   )r&   r+   r   r   r   r#   B   s   zPipelineStateSimple.stagesc                 C   s"   t | jdkrtdS | j| j S )Nr   r   )r   r&   r   r'   r+   r   r   r   indexG   s   zPipelineStateSimple.indexc                 C   s    t | jdkr
| jS | j| j S Nr   r   r&   r'   r+   r   r   r   phaseP   s   zPipelineStateSimple.phasec                 C   s2   t | jdkr|  jdN  _d S |  jd7  _d S r.   r/   r+   r   r   r   advance\   s   zPipelineStateSimple.advancec                 C   s   | j }| gS r%   )r'   ir_value)r(   r$   r   r   r   __extract_mlir_values__q   s   
z+PipelineStateSimple.__extract_mlir_values__c                 C   s   t | jt|d S Nr   )r"   r#   r   )r(   valuesr   r   r   __new_from_mlir_values__u   s   z,PipelineStateSimple.__new_from_mlir_values__N)r*   r"   )__name__
__module____qualname____doc__intr   r)   r,   propertyr#   r-   r0   r1   r3   r6   r   r   r   r   r"   1   s    
r"   typer#   c                 C   s8   | t ju rt|t|S | t ju rt|tdS J d)zz
    Creates a pipeline state. Producers are assumed to start with an empty buffer and have a flipped phase bit of 1.
    r   FzBError: invalid PipelineUserType specified for make_pipeline_state.)r   Producerr"   r   Consumer)r=   r#   r   r   r   make_pipeline_statey   s
   

r@   T)frozenc                   @   s<   e Zd ZdZedd Z		ddedee de	fd	d
Z
dS )r   zH
    Override producer_acquire to take in extra_tx_count parameter.
    c                  O   s"   t j| i |}t|dt |S )N	__class__)PipelineTmaAsyncOgcreateobject__setattr__r   )argskwargsobjr   r   r   rD      s   zPipelineTmaAsync.createNr   statetry_acquire_tokenextra_tx_countc                    sb   t |du p|dk fdd t|dkr! jj j dS  jj| } jj| dS )
        TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks.
        Nr   c                          j jjS r%   sync_object_emptywaitr-   r0   r   r(   rJ   r   r   <lambda>       z3PipelineTmaAsync.producer_acquire.<locals>.<lambda>)r   r   sync_object_fullarriver-   producer_masktx_countarrive_and_expect_tx)r(   rJ   rK   rL   rX   r   rR   r   producer_acquire   s   	z!PipelineTmaAsync.producer_acquirer4   )r7   r8   r9   r:   staticmethodrD   r	   r   r   r;   rZ   r   r   r   r   r      s    

r   c                   @   s|   e Zd Zeddddddedededed	ejd
eej	 de
eef deje fddZ		ddedee defddZdS )r   N)r   r   T)barrier_storager   mcast_mode_mn	init_wait
num_stagesproducer_groupconsumer_grouprX   r\   r   r]   r^   c                 C   s   t |tjstdt| tj}tj}	||f}
|	|f}t	|j
dd| |
|}t	|j
dd|  | |}|du sAt|dkrFd}d}nt||}t|}|du s_tj|dgddkretjjjjntjjjj}|}t|rut| t||| ||||S )	a  
        This helper function computes any necessary attributes and returns an instance of PipelineTmaUmma.
        :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers
        :type barrier_storage: cute.Pointer
        :param num_stages: Number of buffer stages for this pipeline
        :type num_stages: Int32
        :param producer_group: `CooperativeGroup` for the producer agent
        :type producer_group: CooperativeGroup
        :param consumer_group: `CooperativeGroup` for the consumer agent
        :type consumer_group: CooperativeGroup
        :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage
        :type tx_count: int
        :param cta_layout_vmnk: Layout of the cluster shape
        :type cta_layout_vmnk: cute.Layout | None
        :param mcast_mode_mn: Tuple of two integers, specifying whether mcast is enabled for the m and n modes. At least one of the two integers must be 1.
        :type mcast_mode_mn: tuple[int, int]
        z7Expected barrier_storage to be a cute.Pointer, but got    )	min_alignNr   Tr   )mode)
isinstancer   Pointer
ValueErrorr=   r   TmaLoad
TCGen05Mmar   _make_sync_objectalignr   r   _compute_mcast_arrival_mask_compute_is_leader_ctanvgputcgen05CtaGroupONETWOr   r   )r_   r`   ra   rX   r\   r   r]   r^   producer_typeconsumer_typeproducerconsumerrU   rP   rW   is_leader_cta	cta_groupconsumer_maskr   r   r   rD      sJ   

zPipelineTmaUmma.creater   rJ   rK   rL   c                    sn   t |du p|dk fdd t|dkr#t  j fdd dS  jj| t  j fdd dS )rM   Nr   c                      rN   r%   rO   r   rR   r   r   rS     rT   z2PipelineTmaUmma.producer_acquire.<locals>.<lambda>c                      s    j j jS r%   )rU   rV   r-   rW   r   rR   r   r   rS   	  rT   c                      s    j jS r%   )rU   rY   r-   r   r(   rJ   rX   r   r   rS     s    )r   r   rw   rU   rX   )r(   rJ   rK   rL   r   rz   r   rZ      s   	z PipelineTmaUmma.producer_acquirer4   )r7   r8   r9   r[   r;   r   r   rf   r   Layouttuplecutlass	ConstexprboolrD   r	   r   rZ   r   r   r   r   r      s@    
	Qr   r%   )typingr   dataclassesr   r}   cutlass.cuter   r   r   r   cutlass.cutlass_dslr   cutlass.pipeliner   r	   r
   r   r   r   r   rC   r   PipelineTmaUmmaOgr{   r   r   r"   r;   r@   r   r   r   r   <module>   s$   H!