o
    	iX                     @   sr  d dl mZ d dlmZmZ d dlmZmZmZ d dl	m
Z
 dd ZG dd dZed	d
G dd deZed	d
G dd deZed	d
G dd deZed	d
G dd deZed	d
G dd deZed	d	dG dd deZG dd dZe
dd Zed	d
G dd deZed	d	dG dd deZed	d	dG d d! d!eZd'd"d#Zd$d% Zd&S )(    )	dataclass)ListOptional)_unwrap_if_constexpr_unwrap_shapeconstexpr_type)constexpr_functionc                 C   sd   | j pdg| }| jpdg| }| jpttt|}t| d| t| d| t| d| d S )N   ctas_per_cgacta_split_num	cta_order)r
   r   r   listreversedrangeobject__setattr__)layoutrankr
   r   r    r   a/home/ubuntu/vllm_env/lib/python3.10/site-packages/triton/experimental/gluon/language/_layouts.py_realize_cta_layout   s   r   c                   @      e Zd ZdZedd ZdS )DistributedLayoutz@
    Base class for distributed memory layouts in Gluon IR.
    c                 C      t | S Nr   selfr   r   r   type      zDistributedLayout.typeN__name__
__module____qualname____doc__propertyr   r   r   r   r   r          r   T)frozenc                   @   s   e Zd Zdd Zdd ZdS )
AutoLayoutc                 C   s   |  S r   )get_auto_layoutr   builderr   r   r   _to_ir   s   zAutoLayout._to_irc                 C   s   dS )NALr   r   r   r   r   mangle    s   zAutoLayout.mangleN)r!   r"   r#   r,   r.   r   r   r   r   r(      s    r(   c                       s   e Zd ZU dZee ed< ee ed< ee ed< ee ed< dZeee  ed< dZ	eee  ed< dZ
eee  ed	<  fd
dZdd ZdefddZdd Z  ZS )BlockedLayouta`  
    Represents a blocked layout, partitioning a tensor across threads, warps, and CTAs.

    Args:
        size_per_thread (List[int]): Number of elements per thread per dimension.
        threads_per_warp (List[int]): Number of threads per warp per dimension.
        warps_per_cta (List[int]): Number of warps per CTA per dimension.
        order (List[int]): The ordering of dimensions for partitioning.
        ctas_per_cga (Optional[List[int]]): CTAs per CGA grouping.
        cta_split_num (Optional[List[int]]): Split factors for CTAs.
        cta_order (Optional[List[int]]): Ordering for CTAs.
    size_per_threadthreads_per_warpwarps_per_ctaorderNr
   r   r   c                    s  t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j	 t
| j}t| | t
| j|ksYJ t
| j|ksbJ t
| j|kskJ t
| j|kstJ t
| j|ks}J t
| j	|ksJ d S )Nr0   r1   r2   r3   r
   r   r   )superr   r   r0   r1   r2   r3   r
   r   r   lenr   r   r   	__class__r   r   __post_init__:   s   

zBlockedLayout.__post_init__c              	   C   $   | | j| j| j| j| j| j| jS r   )get_blocked_layoutr0   r1   r2   r3   r
   r   r   r*   r   r   r   r,   L      zBlockedLayout._to_irreturnc           	      C   s~   dd }|| j }|| j}|| j}|| j}|| j}|| j}|| j}d| d| d| d| d| d| d| dS )Nc                 S      | d u rdS d tt| S N _joinmapstrxr   r   r   	stringifyY      z'BlockedLayout.mangle.<locals>.stringifyB)r0   r1   r2   r3   r
   r   r   )	r   rH   r0   r1   r2   r3   r
   r   r   r   r   r   r.   W   s   






0zBlockedLayout.manglec              	   C   sb   t t| jt| jt| jt| j| jrt| jnd | jr"t| jnd | jr-t| jfS d fS r   )	hashtupler0   r1   r2   r3   r
   r   r   r   r   r   r   __hash__g   s   zBlockedLayout.__hash__r!   r"   r#   r$   r   int__annotations__r
   r   r   r   r9   r,   rE   r.   rM   __classcell__r   r   r7   r   r/   $   s   
 r/   c                       sP   e Zd ZU dZeed< eed<  fddZdd Zde	fd	d
Z
dd Z  ZS )SliceLayoutz
    Represents a layout corresponding to slicing a distributed tensor along one dimension.

    Args:
        dim (int): The dimension index to slice.
        parent (DistributedLayout): The parent layout before slicing.
    dimparentc                    s,   t  dt| j t  dt| j d S )NrS   rT   )r4   r   r   rS   rT   r   r7   r   r   r9      s   zSliceLayout.__post_init__c                 C   s   | | j| j|S r   )get_slice_layoutrS   rT   r,   r*   r   r   r   r,      s   
zSliceLayout._to_irr=   c                 C   s   d| j  d| j  dS )NSLrA   )rS   rT   r.   r   r   r   r   r.         zSliceLayout.manglec                 C   s   t | j| jfS r   )rK   rS   rT   r   r   r   r   rM      s   zSliceLayout.__hash__r!   r"   r#   r$   rO   rP   r   r9   r,   rE   r.   rM   rQ   r   r   r7   r   rR   s   s   
 rR   c                       s   e Zd ZU dZeee  ed< eee  ed< eee  ed< eee  ed< ee ed<  fddZd	d
 Zdd Z	dd Z
  ZS )DistributedLinearLayouta  
    Represents a linear distributed layout with explicit bases at register, lane, warp, and block levels.
    See: https://arxiv.org/abs/2505.23819 for reference.

    Args:
        reg_bases (List[List[int]]): Bases for register-level distribution.
        lane_bases (List[List[int]]): Bases for lane-level distribution.
        warp_bases (List[List[int]]): Bases for warp-level distribution.
        block_bases (List[List[int]]): Bases for block-level distribution.
        shape (List[int]): The tensor global shape.
    	reg_bases
lane_bases
warp_basesblock_basesshapec                    s   t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j t| j}| jD ]
}t||ksDJ q:| jD ]
}t||ksRJ qH| jD ]
}t||ks`J qV| jD ]
}t||ksnJ qdd S )NrZ   r[   r\   r]   r^   )	r4   r   r   rZ   r[   r\   r]   r^   r5   )r   r   basisr7   r   r   r9      s   




z%DistributedLinearLayout.__post_init__c                 C   s   | | j| j| j| j| jS r   )get_distributed_linear_layoutrZ   r[   r\   r]   r^   r*   r   r   r   r,      s   zDistributedLinearLayout._to_irc                 C   s.   d| j  d| j d| j d| j d| j dS )NDLLrA   )rZ   r[   r\   r]   r^   r   r   r   r   r.      s   .zDistributedLinearLayout.manglec                 C   sH   t ttt| jttt| jttt| jttt| jt| jfS r   )rK   rL   rD   rZ   r[   r\   r]   r^   r   r   r   r   rM      s   z DistributedLinearLayout.__hash__)r!   r"   r#   r$   r   rO   rP   r9   r,   r.   rM   rQ   r   r   r7   r   rY      s   
 rY   c                       sX   e Zd ZU dZeed< eed< eed<  fddZdd Zd	e	fd
dZ
dd Z  ZS )DotOperandLayouta
  
    Represents a layout for a dot operand.

    Args:
        operand_index (int): 0 for LHS and 1 for RHS of the dot operation.
        parent (DistributedLayout): The parent layout, representing the MMA.
        k_width (int): Number of elements per 32-bits.
    operand_indexrT   k_widthc                    s@   t  dt| j t  dt| j t  dt| j d S )Nrc   rT   rd   )r4   r   r   rc   rT   rd   r   r7   r   r   r9      s   zDotOperandLayout.__post_init__c                 C   s   | | j| j|| jS r   )get_dot_operand_layoutrc   rT   r,   rd   r*   r   r   r   r,      rW   zDotOperandLayout._to_irr=   c                 C   s"   d| j  d| j  d| j dS )NDOrA   )rc   rT   r.   rd   r   r   r   r   r.      s   "zDotOperandLayout.manglec                 C   s   t | j| j| jfS r   )rK   rc   rT   rd   r   r   r   r   rM      s   zDotOperandLayout.__hash__rX   r   r   r7   r   rb      s   
 rb   )r'   eqc                       s   e Zd ZU dZee ed< ee ed< ee ed< dZeee  ed< dZ	eee  ed< dZ
eee  ed<  fd	d
Zdd ZdefddZdd Z  ZS )NVMMADistributedLayouta  
    Represents a layout for NVIDIA MMA (tensor core) operations.

    Args:
        version (List[int]): Version identifier for the MMA instruction.
        warps_per_cta (List[int]): Number of warps per CTA.
        instr_shape (List[int]): Instruction shape for MMA.
        ctas_per_cga (Optional[List[int]]): CTAs per CGA grouping.
        cta_split_num (Optional[List[int]]): Split factors for CTAs.
        cta_order (Optional[List[int]]): CTA ordering.
    versionr2   instr_shapeNr
   r   r   c                    s   t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j t	| j}t
| | t	| j|ksOJ t	| j|ksXJ t	| j|ksaJ d S )Nri   r2   rj   r
   r   r   )r4   r   r   ri   r2   rj   r
   r   r   r5   r   r6   r7   r   r   r9      s   

z$NVMMADistributedLayout.__post_init__c                 C   s    | | j| j| j| j| j| jS r   )get_mma_layoutri   r2   r
   r   r   rj   r*   r   r   r   r,     s   zNVMMADistributedLayout._to_irr=   c                 C   s6   d| j  d| j d| j d| j d| j d| j dS )NMMA_rA   _MMA)ri   r2   rj   r
   r   r   r   r   r   r   r.   	  s   6zNVMMADistributedLayout.manglec                 C   sZ   t t| jt| jt| j| jrt| jnd | jrt| jnd | jr)t| jfS d fS r   )rK   rL   ri   r2   rj   r
   r   r   r   r   r   r   rM     s   zNVMMADistributedLayout.__hash__rN   r   r   r7   r   rh      s   
 rh   c                   @   r   )SharedLayoutz;
    Base class for shared memory layouts in Gluon IR.
    c                 C   r   r   r   r   r   r   r   r     r   zSharedLayout.typeNr    r   r   r   r   rn     r&   rn   c                 C   sJ   | }|d ur#t |t | ksJ tt |D ]}||  ||   < q|S r   )r5   r   )r^   r   shape_per_ctarS   r   r   r   _get_shape_per_cta  s   rp   c                       s   e Zd ZU dZeed< eed< eed< dZeed< dZeed< dZ	e
ee  ed	< dZe
ee  ed
< dZe
ee  ed<  fddZdd Zee		dddZdefddZdd Z  ZS )NVMMASharedLayouta4  
    Represents a layout for shared memory suitable for NVIDIA MMA operations.

    Args:
        swizzle_byte_width (int): Width in bytes for swizzling.
        element_bitwidth (int): Bitwidth of element type.
        rank (int): Rank of the tensor.
        transposed (bool): Whether the layout is transposed.
        fp4_padded (bool): Whether FP4 padding is used.
        ctas_per_cga (Optional[List[int]]): CTAs per CGA grouping.
        cta_split_num (Optional[List[int]]): Split factors for CTAs.
        cta_order (Optional[List[int]]): CTA ordering.
    swizzle_byte_widthelement_bitwidthr   F
transposed
fp4_paddedNr
   r   r   c                    s  t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j	 t  dt| j
 | jd	v sWJ | jd
v s^J | j}t| | t| j|ksoJ t| j	|ksxJ t| j
|ksJ d S )Nrr   rs   r   rt   ru   r
   r   r   )          @   )r   rx   ry      )r4   r   r   rr   rs   r   rt   ru   r
   r   r   r   r5   r6   r7   r   r   r9   ?  s   
zNVMMASharedLayout.__post_init__c              	   C   r:   r   )get_nvmma_shared_layoutrr   rs   rt   ru   r
   r   r   r*   r   r   r   r,   Q  r<   zNVMMASharedLayout._to_irc              
   C   s   |rdnd}t | |}t| }	|r|dd |dd  }|d | }
|
|j d }|dkr7|d dkr7d}n|dkrD|d dkrDd}n|d	krQ|d	 dkrQd	}nd}d}|dd D ]}||9 }q[t| dk sl|dk rnd}t||j|	|||||d
S )zReturns an NVMMASharedLayout with default swizzling for a given shape.

        This picks the largest swizzle pattern compatible with the shape, which
        allows emitting the fewest TMA or MMA messages.
           r	   Nrv   rz   r   ry   rx   )rr   rs   r   rt   ru   r
   r   r   )rp   r5   primitive_bitwidthrq   )block_shapedtypert   ru   r
   r   r   packing_factorro   r   contig_dim_sizecontig_dim_bytesrr   flatten_outer_dimsizer   r   r   get_default_for\  s:   	

z!NVMMASharedLayout.get_default_forr=   c              	   C   s&   d| j  d| j d| j d| j d	S )NNVMMA_rA   _NVMMA)rr   rs   rt   ru   r   r   r   r   r.     s   &zNVMMASharedLayout.manglec              
   C   sV   t | j| j| j| j| j| jrt| jnd | jrt| jnd | j	r't| j	fS d fS r   )
rK   rr   rs   r   rt   ru   r
   rL   r   r   r   r   r   r   rM     s   zNVMMASharedLayout.__hash__)FFNNN)r!   r"   r#   r$   rO   rP   rt   boolru   r
   r   r   r   r   r9   r,   staticmethodr   r   rE   r.   rM   rQ   r   r   r7   r   rq   '  s&   
 (rq   c                       s   e Zd ZU dZeed< eed< eed< ee ed< dZeee  ed< dZ	eee  ed< dZ
eee  ed	<  fd
dZdd ZdefddZdd Z  ZS )SwizzledSharedLayouta  
    Represents a generic swizzled shared memory layout.

    Args:
        vec (int): Vector width for swizzling.
        per_phase (int): Elements per swizzle phase.
        max_phase (int): Maximum number of swizzle phases.
        order (List[int]): Dimension ordering for swizzling.
        ctas_per_cga (Optional[List[int]]): CTAs per CGA grouping.
        cta_split_num (Optional[List[int]]): Split factors for CTAs.
        cta_order (Optional[List[int]]): CTA ordering.
    vec	per_phase	max_phaser3   Nr
   r   r   c                    s   t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j	 t
| j}t| | t
| j|ksYJ t
| j|ksbJ t
| j	|kskJ d S )Nr   r   r   r3   r
   r   r   )r4   r   r   r   r   r   r3   r
   r   r   r5   r   r6   r7   r   r   r9     s   

z"SwizzledSharedLayout.__post_init__c              	   C   r:   r   )get_swizzled_shared_layoutr   r   r   r3   r
   r   r   r*   r   r   r   r,     r<   zSwizzledSharedLayout._to_irr=   c                 C   sV   dd }d| j  d| j d| j d|| j d|| j d|| j d|| j dS )Nc                 S   r>   r?   rB   rF   r   r   r   rH     rI   z.SwizzledSharedLayout.mangle.<locals>.stringifySSS_rA   _SSS)r   r   r   r3   r
   r   r   r   rH   r   r   r   r.     s   NzSwizzledSharedLayout.manglec              	   C   sV   t | j| j| jt| j| jrt| jnd | jrt| jnd | jr't| jfS d fS r   )	rK   r   r   r   rL   r3   r
   r   r   r   r   r   r   rM     s   zSwizzledSharedLayout.__hash__)r!   r"   r#   r$   rO   rP   r   r
   r   r   r   r9   r,   rE   r.   rM   rQ   r   r   r7   r   r     s   
 	r   c                       s   e Zd ZU dZeee  ed< ee ed< dZeee  ed< dZ	eee  ed< dZ
eee  ed<  fdd	Zd
d ZdefddZdd Zdd Z  ZS )PaddedSharedLayouta  
    Represents a layout for the access to shared memory. Compared to SwizzledSharedLayout,
    it uses padding to avoid shared memory bank conflicts. After every interval tensor elements,
    the corresponding number of padding elements are inserted.
    If a position corresponds to multiple intervals, the padding amounts are summed.

    In the following example of a tensor,
    `eM` represents original elements in the and `pN` represents padded element.

    Before padding, the shared memory looks like:
    [e0, e1,
     e2, e3,
     e4, e5,
     e6, e7,
     ...]

    After padding with interval-padding list [[2, 1], [4, 2]],
    the shared memory will be
    [e0, e1, p0,
     e2, e3, p1, p2, p3,
     e4, e5, p4,
     e6, e7, p5, p6, p7,
     ...]

    Args:
        interval_padding_pairs (List[int]): List of [interval, padding] pair and both interval and padding must be powers of 2.
        order (List[int]): Order of logical tensor dimensions; fastest-varying first.
        ctas_per_cga (Optional[List[int]]): CTAs per CGA grouping.
        cta_split_num (Optional[List[int]]): Split factors for CTAs.
        cta_order (Optional[List[int]]): CTA ordering.
    interval_padding_pairsr3   Nr
   r   r   c                    sp   t  dt| j t  dt| j t  dt| j t  dt| j t  dt| j | 	  d S )Nr   r3   r
   r   r   )
r4   r   r   r   r   r3   r
   r   r   verifyr   r7   r   r   r9     s   z PaddedSharedLayout.__post_init__c                 C   s*   t | j \}}|||| j| j| j| jS r   )zipr   get_padded_shared_layoutr3   r
   r   r   )r   r+   	intervalspaddingsr   r   r   r,      s   zPaddedSharedLayout._to_irr=   c                 C   sJ   dd }d|| j  d|| j d|| j d|| j d|| j dS )Nc                 S   r>   r?   rB   rF   r   r   r   rH     rI   z,PaddedSharedLayout.mangle.<locals>.stringifyPaddedShared_rA   _PaddedShared)r   r3   r
   r   r   r   r   r   r   r.     s   BzPaddedSharedLayout.manglec                    s   | j }t|dksJ dtdd |D sJ t| \}}tt|}t|t|ks.J dd  t fdd|D sAJ dt fd	d|D sPJ d
t| j}|dks]J dt| | t| j|kskJ t| j	|kstJ t| j
|ks}J d S )Nr   zVPaddedSharedLayout interval_padding_pairs must have at least one interval-padding pairc                 s   s    | ]	}t |d kV  qdS )r|   N)r5   ).0pairr   r   r   	<genexpr>  s    z,PaddedSharedLayout.verify.<locals>.<genexpr>c                 S   s   | dko| | d @ dkS )Nr   r	   r   )nr   r   r   <lambda>  s    z+PaddedSharedLayout.verify.<locals>.<lambda>c                 3       | ]} |V  qd S r   r   r   r   is_power_of_2r   r   r         z;PaddedSharedLayout interval values must all be power of twoc                 3   r   r   r   r   r   r   r   r     r   z:PaddedSharedLayout padding values must all be power of twoz*PaddedSharedLayout order must not be empty)r   r5   allr   r   setr3   r   r
   r   r   )r   pairsr   r   unique_intervalsr   r   r   r   r     s   

zPaddedSharedLayout.verifyc                 C   sX   t ttt| jt| j| jrt| jnd | jrt| jnd | jr(t| jfS d fS r   )rK   rL   rD   r   r3   r
   r   r   r   r   r   r   rM   #  s   zPaddedSharedLayout.__hash__)r!   r"   r#   r$   r   rO   rP   r
   r   r   r   r9   r,   rE   r.   r   rM   rQ   r   r   r7   r   r     s   
 		r   c                 C   sz   dg| }| s	|S d }| D ]-}t dd t|D d }|d ur*|}||  d9  < q|s:|d us2J ||  d9  < q|S )Nr	   c                 s   s     | ]\}}|d kr|V  qdS )r   Nr   )r   ivr   r   r   r   5  s    z bases_per_dim.<locals>.<genexpr>r|   )next	enumerate)basesr   skip_broadcastresultnon_zero_idxr_   idxr   r   r   bases_per_dim+  s   
r   c                 C   s:   t | trt| jt|S t | ttfrt| j|S | jS r   )	
isinstancerY   r   r\   r5   rR   rb   r2   rT   )r   r^   r   r   r   r2   A  s
   
r2   N)T)dataclassesr   typingr   r   triton.language.corer   r   r   triton.runtime.jitr   r   r   r(   r/   rR   rY   rb   rh   rn   rp   rq   r   r   r   r2   r   r   r   r   <module>   s:    	
	N5
/

	
h
?
Z