o
     i1                     @   s\  d dl Z d dlmZmZ d dlZddlmZ dejdejde	fdd	Z
d
eeej  deeej  deee ee ee f fddZdeeej  dee dee ddfddZd
eeej  deeej  deeej  ddfddZdeeej  dededeej fddZdeej dededeeej  fddZdeej dededeej fddZejjdd d!d"d#eej d$eej dee dee d%ee deej fd&d'Zejdd#eej d$eej dee dee d%ee deej fd(d)Zd*d+ Zd,d- Zejjdeed. d
eeej  deeej  deeej  fd/d0ZdS )1    N)ListTuple   )_is_triton_availabledevicedtypereturnc                 C   s:   t tdds
dS t sdS tj| }|dk rdS dS )N#XFORMERS_TILED_MATMUL_ENABLE_TRITON1F)   r   T)intosgetenvr   torchcudaget_device_capability)r   r   device_capability r   M/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/tiled_matmul.py_should_use_triton   s   r   abc                    s  t  dkrt  d dkrt fdd D sJ dt dkr6t d dkr6tfddD s:J dt  }t  d }t |ksVJ d| d	t  t d } fd
dt|D }fddt|D } fddt|D }fddt|D }t|D ]U}	t|D ]N}
 |	 |
 jd ||	 ksJ d|	 d||	  d |	 |
 jd  d|
  |	 |
 jd ||
 ksJ d|
 d||
  d |	 |
 jd  d|	 qqt|D ]W}t|D ]P}
|
 | jd ||
 ksJ d|
 d||
  d|
 | jd  d| |
 | jd || ks<J d| d||  d|
 | jd  d|
 qqt|D ] }
||
 ||
 ksaJ d|
 d|
 d||
  d	||
  qB|}|||fS )N   r   c                 3   $    | ]}t |t  d  kV  qdS r   Nlen.0rowr   r   r   	<genexpr>$      " zcheck_inputs.<locals>.<genexpr>zUthe first operand must be a non-empty two-dimensional regular list of lists of tenorsc                 3   r   r   r   r   r   r   r   r!   (   r"   zVthe second operand must be a non-empty two-dimensional regular list of lists of tenorszYthe first operand's inner dimension must match the second operand's outer dimension, got  and c                       g | ]} | d  j d  qS r   shaper   tile_mr    r   r   
<listcomp>5       z check_inputs.<locals>.<listcomp>c                       g | ]} d  | j d qS r   r   r'   r   tile_nr#   r   r   r+   6   r,   c                    r-   r.   r'   r   tile_kr    r   r   r+   7   r,   c                    r%   r&   r'   r1   r#   r   r   r+   8   r,   the tensors on row zM of the first operand must all have the same size along the m dimension, got  at position 0 and  at position the tensors on column zM of the first operand must all have the same size along the k dimension, got zN of the second operand must all have the same size along the k dimension, got zN of the second operand must all have the same size along the n dimension, got z' of the first operand and those on row zJ of the second operand must have the same size along the k dimension, got r   allranger(   )r   r   m_tilesk_tilesn_tilesmsnsaksbksr*   r2   r0   ksr   )r   r   r   check_inputs    s   44
rB   outr=   r>   c           	         s  t |t |}}t  dkr"t  d dkr"t fdd D s&J dt  |ks.J t  d |ks8J  fddt|D } fddt|D }t|D ]U}t|D ]N} | | jd || ksJ d	| d
||  d | | jd  d|  | | jd || ksJ d| d||  d | | jd  d| qXqRt|D ]}|| || ksJ d	| d| d||  d||  qt|D ]}|| || ksJ d| d| d||  d||  qd S )Nr   r   c                 3   r   r   r   r   rC   r   r   r!   d   r"   zcheck_output.<locals>.<genexpr>zGout must be a non-empty two-dimensional regular list of lists of tenorsc                    r%   r&   r'   r)   rD   r   r   r+   h   r,   z check_output.<locals>.<listcomp>c                    r-   r.   r'   r/   rD   r   r   r+   i   r,   r3   z? of out must all have the same size along the m dimension, got r4   r5   r6   z? of out must all have the same size along the k dimension, got z of out and those on row zI of the first operand must have the same size along the m dimension, got r$   z of out and those on column zJ of the second operand must have the same size along the n dimension, got r7   )	rC   r=   r>   r:   r<   cmscnsr*   r0   r   rD   r   check_output_   sb   rG   c           
      C   s  t | |\}}}t||| t|dkr@t|dkr@t|dkr@t| d d j| d d jr@ddlm} || ||||| d S tt|D ];}tt|D ]2}t	j
| | d |d | || | d tdt|D ]}	|| | | | |	 ||	 |  qkqNqFd S )N   r   r   )_launch_triton_matmulrD   )rB   rG   r   r   r   r   _triton.tiled_matmul_kernelsrI   r9   r   mmaddmm_)
r   r   rC   r=   r>   rA   rI   r*   r0   r2   r   r   r   tiled_matmul_out   s"   (&rM   xrowscolsc                    sP   t | |ksJ t fdd| D sJ dd | D }t ||  ks&J |S )Nc                 3       | ]	}t | kV  qd S Nr   r   rP   r   r   r!          z_flatten.<locals>.<genexpr>c                 S   s   g | ]	}|D ]}|qqS r   r   )r   r   elemr   r   r   r+          z_flatten.<locals>.<listcomp>)r   r8   )rN   rO   rP   flat_xr   rS   r   _flatten   s
   rX   rW   c                    sb   t  | ks
J  fddtd|   D }t ||ks"J t fdd|D s/J |S )Nc                    s   g | ]
}||   qS r   r   )r   
row_offsetrP   rW   r   r   r+      s    z_unflatten.<locals>.<listcomp>r   c                 3   rQ   rR   r   r   rS   r   r   r!      rT   z_unflatten.<locals>.<genexpr>)r   r9   r8   )rW   rO   rP   rN   r   rZ   r   
_unflatten   s   r[   c                 C   s.   t | ||}dd t| D }t|||}|S )Nc                 S   s   g | ]	}d d |D qS )c                 S   s   g | ]}|  qS r   )t)r   rU   r   r   r   r+      s    z3_flattened_transpose.<locals>.<listcomp>.<listcomp>r   )r   colr   r   r   r+      rV   z(_flattened_transpose.<locals>.<listcomp>)r[   ziprX   )rW   rO   rP   rN   transposed_xflat_transposed_xr   r   r   _flattened_transpose   s   ra   z!xformers_python::tiled_matmul_fwdr   r   )mutates_argsdevice_typesflat_aflat_brA   c                    s^   t | t|t| t |t|t} fdd|D }t ||d t|t|tS )Nc                        g | ]  fd dD qS )c                    "   g | ]} d  d   |fqS r&   	new_emptyr   n)r   mr   r   r+         " z/tiled_matmul_fwd.<locals>.<listcomp>.<listcomp>r   r   r   r>   rl   r   r+           z$tiled_matmul_fwd.<locals>.<listcomp>rD   )r[   r   rM   rX   )rd   re   r=   r>   rA   r   cr   ro   r   tiled_matmul_fwd   s
   rs   c                    s(    fdd|D }t |t|t|S )Nc                    rf   )c                    rg   r&   rh   rj   )rd   rl   r   r   r+      rm   z4tiled_matmul_fwd_fake.<locals>.<listcomp>.<listcomp>r   rn   rd   r>   rp   r   r+      rq   z)tiled_matmul_fwd_fake.<locals>.<listcomp>)rX   r   )rd   re   r=   r>   rA   rr   r   rt   r   tiled_matmul_fwd_fake   s   ru   c                 C   s,   |\}}| _ | _| _| jg ||R   d S rR   )r=   r>   rA   save_for_backward)ctxinputsoutputrd   re   r   r   r   tiled_matmul_setup_context   s   rz   c                 C   s   t | jt | jt | j t | jt | j  ksJ | jd t | jt | j  }| jt | j t | j d  }t|t | jt | j}t|t | jt | j}t||| j| j| j}t||| j| j| j}||d d d fS rR   )r   saved_tensorsr=   rA   r>   ra   rs   )rw   flat_grad_crd   re   flat_transposed_aflat_transposed_bflat_grad_aflat_grad_br   r   r   tiled_matmul_bwd   s   $ r   )setup_contextc           	      C   s`   t | |\}}}t| t|t|}t|t|t|}t|||||}t|t|t|}|S )ax  Multiply two matrices given as grids of tiles

    It performs the matmul between A and B, which are given as two-dimensional
    grids of tiles (i.e., blocks), represented as lists of lists of tensors.
    The output will itself be a matrix in such a form. Formally:

        out[m][n] = sum(a[m][k] @ b[k][n] for k in range(...))

    with the obvious constraints needed to make it work, in terms of number of
    tiles and sizes of each tile.

    The interest of this operator is to improve performance by avoding wave
    quantization effects when doing independent matrix multiplications in
    series. Sometimes, when these matmuls have one operand in common, this can
    also be addressed by concatenating the other operands into a single matrix,
    and issuing a single matmul. However this isn't always possible (e.g., might
    break the checkpoint format) and it's an anti-pattern, as it obscures the
    logic (e.g., changing the modelling code out of performance reasons). This
    tiled matmul performs the same computation as if the matrices were merged,
    without merging them, simply through a smarter memory addressing scheme.

    The tiled matmul is less generic than a grouped matmul, which can also help
    with wave quantization, and doesn't need the matmuls to have the same lhs
    or rhs operand. However, a grouped matmul will write the result of each
    matmul to a separate output matrix, whereas the tiled matmul allows to add
    them together into a single output. This is needed during the backward pass
    of a linear layer, and it's the reason we wrote this instead of using a
    grouped matmul.

    The tiled matmul is implemented using a custom Triton kernel, which puts
    constraints on the strides of the tiles. All rows of A must have the same
    K stride, all columns of A must have the same M stride, and so on.

    Currently the tiled matmul supports at most three tiles on each dimension,
    although fewer can also be given. This is because we needed it to fuse the
    query, key and value weights of an attention layer. This limit can be
    increased if needed.

    This operator is differentiable.

    )rB   rX   r   rs   r[   )	r   r   r=   r>   rA   rd   re   flat_crr   r   r   r   tiled_matmul
  s   /r   )r   typingr   r   r    r   r   r   boolr   Tensorr   rB   rG   rM   rX   r[   ra   library	custom_oprs   register_fakeru   rz   r   register_autogradr   r   r   r   r   <module>   s   
,?*
*


