o
    i                     @   s   d dl Z d dlZd dlmZ d dlmZ ejdejdejfddZ	ejdejdejfddZ
G d	d
 d
eZdddZdd ZdS )    N)Functioniters
BLOCK_SIZEc	                 C   s  t d}	t d|}
t d|}|
d d d f |k |d d d f |k @ }| |	|  }t j||
d d d f |  |d d d f |  |dd}t |D ]|}t jt ||ddd}t ||d d d f  }t ||d}|t t j	|dd }||d d d f  }t ||d}t jt ||ddd}t ||d d d f  }t ||d}|t t j	|dd }||d d d f  }t ||d}qLt |}t ||d}||	|  }t j
||
d d d f |  |d d d f |  ||d d S )Nr   g    _©maskotheraxis           r   )tl
program_idarangeloadstatic_rangemaxwhereexplogsumstore)	input_ptr
output_ptrMNstride_bstride_mstride_nr   r   pid_boffs_moffs_nr   curr_input_ptr	log_alpha_col_maxexp_weights_colcol_lserow_maxexp_weights_rowrow_lseresult_alphacurr_output_ptr r-   U/home/ubuntu/.local/lib/python3.10/site-packages/hyper_connections/triton_sinkhorn.pysinkhorn_kernel_forward_log   s.   
	(8
<r/   c
                 C   s  t d}
t d|	}t d|	}|d d d f |k |d d d f |k @ }||
|  }| |
|  }t j||d d d f |  |d d d f |  |dd}t j||d d d f |  |d d d f |  |dd}t ||d}t ||d}t |D ]@}t jt ||| ddd}||d d d f  }t ||d}t jt ||| ddd}||d d d f  }t ||d}q||| }||
|  }t j||d d d f |  |d d d f |  ||d d S )Nr   r
   r   r   r   r   )r   r   r   r   r   r   r   r   )grad_output_ptrr   grad_input_ptrr   r   r   r   r   r   r   r   r    r!   r   r,   curr_grad_output_ptralpha
grad_alphar$   row_sum_grad_alphacol_sum_grad_alpha
grad_inputcurr_grad_input_ptrr-   r-   r.   sinkhorn_kernel_backward_log0   s(   

(88<r9   c                   @   s&   e Zd ZedddZedd ZdS )TritonSinkhornFunction   c                 C   s   |j dd  \}}t||dkrddlm} |||S |j d d }|d|| }|j d }t|}	tdt	t||}
t
|f ||	|||d|d|d||
d	d

 | |	 || _|	jg |||R  S )N   r   )log_domain_sinkhorn_knopps    r         r   r   	num_warps)shaper   hyper_connections.mHCv2r>   view
contiguoustorch
empty_liketritonnext_power_of_2r/   stridesave_for_backwardr   )ctxr#   r   r   r   r>   batch_shapelog_alpha_flatBoutputr   r-   r-   r.   forward[   s*   




zTritonSinkhornFunction.forwardc           	      C   s   | j \}| j}|j\}}}tdtt||}| }t|}t	|f |
||||||||d|d|d||dd ||d fS )Nr@   r   r   rA   rB   rC   )saved_tensorsr   rE   r   rK   rL   rH   rI   rJ   r9   rG   rM   view_as)	rO   grad_outputrS   r   rR   r   r   r   r7   r-   r-   r.   backwardx   s"   
zTritonSinkhornFunction.backwardNr;   )__name__
__module____qualname__staticmethodrT   rX   r-   r-   r-   r.   r:   Z   s
    r:   r;   c                 C   s@   | j rzt| |W S  ty   Y nw ddlm} || |dS )Nr   )sinkhorn_knopps)r   )is_cudar:   apply	ExceptionrF   r^   )r#   r   r^   r-   r-   r.   triton_sinkhorn   s   rb   c                  C   s*   z
dd l } tj W S  ty   Y dS w )Nr   F)rK   rI   cudais_availableImportError)rK   r-   r-   r.   is_triton_available   s   rf   rY   )rI   rK   triton.languagelanguager   torch.autogradr   jit	constexprr/   r9   r:   rb   rf   r-   r-   r-   r.   <module>   s&    ))
6