o
    پi.                  
   @   s  d dl Z d dlmZ d dl mZmZ d dlmZ dejdejfddZdejde	e
 dejfd	d
Zdejde
dee
B dejfddZejdejddfddZejdejddfddZejdejddfddZejdejdejfddZdejdejfddZdejdejfddZejdejdejfdd Zd!ejejB d"ejdejfd#d$Zd%ejd&ejjd'e
d(edejf
d)d*Zd%ejd&ejjd'e
d(edejf
d+d,ZdS )-    N)Int32
const_expr)prmtareturnc                 C   sT   | j d | j d g| j dd R }ddgtdt| R }t| tj||dS )z7Transpose the first two dimensions of a tensor on smem.   r      N)order)shaperangecuterankcompositionmake_ordered_layout)r   r
   r	    r   F/home/ubuntu/.local/lib/python3.10/site-packages/quack/layout_utils.pytranspose_view   s   $r   modec                 C   s   t | jt | j|S N)r   make_tensoriteratorselectlayout)r   r   r   r   r   r      s   r   dimsizec                 C   sh   g | j d | || j |d  R }g | jjd | d| jj|d  R }t| jtj||dS )Nr   stride)r
   r   r   r   r   r   make_layout)r   r   r   r
   r   r   r   r   expand   s   &*r   tc                 C   s  | j jdksJ t| jd dksJ dt| t}tj d }|dkp*|dk}|r1tdntd}|r;tdntd	}|d
 dkrI|d
 nd|d
  }|dA }d}tjj	| }	tjj	d }
|	d> |
B }t
jt|jd
 ddD ]I}||d
 d  ||d
 d  }}|r|n|}|r|n|}tjj|||d}tjj|||d}t|||||d
 d < t|||||d
 d < qtd S )N      r   z7Tensor size must be a multiple of 4 for b16 permutation   iT  iT  i2v  iv2  r   r      Tunroll_fulloffsetmask_and_clamp)element_typewidthr   r   r
   recast_tensorr   archlane_idx	WARP_SIZEcutlassr   shuffle_syncr   )r   t_u32quad_idxlane_03selector_upperselector_lower	upper_idx	lower_idxr*   maskclampr(   iupperlowerupper0lower0r   r   r   permute_gated_Cregs_b16   s,    "r?   c                 C   s  | j jdksJ t| jd dksJ dtj d }|d dkr'|d nd|d  }|dA }d}tjj| }tjjd }|d> |B }tj	t| jd dd	D ]}tj	ddd	D ]e}	| |d |	d  d  | |d |	d  d  }
}|dk r||
n|}|dk r|n|
}tjj
|||d
}tjj
|||d
}|d dkr|n|| |d |	d  d < |d dkr|n|| |d |	d  d < q[| |d d  | |d d  | |d d < | |d d < qRdS )a4  Permute and shuffle within 4 threads to change the layout from
     T0 | T1  | T2  | T3
    a b | c d | e f | g h
    to
    T0 | T1 | T2 | T3 | T0 | T1 | T2 | T3
    a  | b  | c  | d  | e  | f  | g  | h
    This is so that we can use STSM (instead of STS.64) to store C registers without bank conflict.
        r!   r   7Tensor size must be a multiple of 4 for b32 permutationr   r   r#   Tr$   r&   Nr)   r*   r   r   r
   r,   r-   r.   r/   r   r0   r   r2   left_idx	right_idxr*   r8   r9   r(   r:   rleftrightleft0right0r   r   r   permute_Cregs_b32_for_stsm>   s(    2(*<rK   c                 C   s  | j jdksJ t| jd dksJ dtj d }|d dkr'|d nd|d  }|dA }d}tjj| }tjjd }|d> |B }tj	t| jd dd	D ]}| |d d  | |d d  | |d d < | |d d < tj	ddd	D ]e}	| |d |	d  d  | |d |	d  d  }
}|d dkr|
n|}|d dkr|n|
}tjj
|||d
}tjj
|||d
}|dk r|n|| |d |	d  d < |dk r|n|| |d |	d  d < qxqRdS )a4  Permute and shuffle within 4 threads to change the layout from
    T0 | T1 | T2 | T3 | T0 | T1 | T2 | T3
    a  | b  | c  | d  | e  | f  | g  | h
    to
     T0 | T1  | T2  | T3
    a b | c d | e f | g h
    This is so that we can use LDSM (instead of LDS.64) to store C registers without bank conflict.
    r@   r!   r   rA   r   r   r#   Tr$   r&   NrB   rC   r   r   r   permute_Cregs_b32_for_ldsmi   s*    :2$&rL   layoutsc                  G   s*   t jtdd | D tdd | D dS )Nc                 s       | ]}|j V  qd S r   r
   .0lr   r   r   	<genexpr>       z concat_layout.<locals>.<genexpr>c                 s   rN   r   r   rP   r   r   r   rS      rT   r   )r   r   tuple)rM   r   r   r   concat_layout   s   rV   
acc_layoutc                 C   s   t | j}t j|jd d |jd f|jd d g|jd dd |jd R g|jdd R |jd d |jd f|jd d g|jd dd |jd R g|jdd R d}t | |S )z
    For Sm80, convert ((2, 2), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, MMA_N), ...).
    For Sm90, convert ((2, 2, V), MMA_M, MMA_N, ...) to ((2, MMA_M), (2, V, MMA_N), ...).
    r   r   r   Nr"   r   )r   r   r
   r   r   )rW   acc_layout_col_majoracc_layout_mnr   r   r   convert_layout_acc_mn   s0   
rZ   accc                 C   s   t | jt| jS r   )r   r   r   rZ   r   )r[   r   r   r   make_acc_tensor_mn_view   s   r\   c                 C   s@  t t| jd dkrgt| d}tj|jd d |jd d |jd d d f|jd |jd d d |jd ff|jd d |jd d |jd d d f|jd |jd d d |jd ffd}|S t| d}tj|jd |jd d f|jd |jd d f|jd |jd d f|jd |jd d fd}|S )Nr   r"   )NNr   NNr   r   r   r]   )r   r   r   r
   logical_divider   r   )rW   rR   rA_mma_viewr   r   r   convert_layout_acc_frgA   s6   **r`   input
ref_layoutc                    s  t t| tjr| jn| }t| t|fddtt D }fddtt D }t|dkrDt	 fdd|D ndt	 fdd|D f}t|dkrbt	 fd	d|D nd
t	 fdd|D f}tj
||d}t t| tjrt| j|S |S )Nc                    s   g | ]} | j d kr|qS r   r   rQ   r:   ref_layout_flatr   r   
<listcomp>       z.convert_layout_zero_stride.<locals>.<listcomp>c                    s   g | ]} | j d kr|qS rc   r   rd   re   r   r   rg      rh   r   c                 3       | ]} | j V  qd S r   rO   rd   layout_flatr   r   rS          z-convert_layout_zero_stride.<locals>.<genexpr>)r   c                 3   ri   r   rO   rd   rj   r   r   rS      rl   c                 3   ri   r   r   rd   rj   r   r   rS      rl   rc   c                 3   ri   r   r   rd   rj   r   r   rS      rl   r   )r   
isinstancer   Tensorr   flattenr   r   lenrU   r   r   r   )ra   rb   r   nonzero_modes
zero_modes	new_shape
new_stride
out_layoutr   )rk   rf   r   convert_layout_zero_stride   s   

$$rv   sVecthr_mmaexpand_shape	is_colvecc           	      C      t | dks	J | jd dksJ | jd }t|r#| jd ||fn|| jd |f}t|r7dd| jd fndd| jd f}t | jt j||d}t|	|}t|rZ|d S |d S Nr   r   r   r   )Nr   N)r   NN)
r   r   r   r
   r   r   r   r   r\   partition_C	rw   rx   ry   rz   stager
   r   sVec_mmatC_sVecr   r   r   mma_partition_C_vec      
(r   c           	      C   r{   r|   )
r   r   r   r
   r   r   r   r   r\   partition_Ar~   r   r   r   mma_partition_A_vec  r   r   )r/   cutlass.cuter   r   r   quack.utilsr   rn   r   listintr   r   jitr?   rK   rL   LayoutrV   rZ   r\   r`   rv   coreThrMmaboolr   r   r   r   r   r   <module>   s`   " *+'


