o
     i/                     @   s  d dl mZmZmZ d dlZd dlmZ ddlmZm	Z	m
Z
mZ ddlmZmZmZmZ ddlmZmZ ejjdd	d
ddejdeej dededeej f
ddZejddejdeej dededeej f
ddZejjdd	d
ddejdeej deej dededeejeej f fddZejddejdeej deej dededeejeej f fddZdd Zdd Zejjdeed d ejd!eej ded"ej j!deej f
d#d$Z"ejjd%d	d
dd&ejd'ejdededejf
d(d)Z#ejd%d&ejd'ejdededejf
d*d+Z$ejjd,d	d
dd&ejd'ejd-ejdededeejejf fd.d/Z%ejd,d&ejd'ejd-ejdededeejejf fd0d1Z&d2d3 Z'd4d5 Z(ejjd%e(e'd d ejd6ejded"ej j!dejf
d7d8Z)dS )9    )CallableListTupleN)_resolve_process_group   )gather_along_first_dimgather_along_first_dim_asyncreduce_scatter_along_first_dim$reduce_scatter_along_first_dim_async)fused_allgather_and_anythingfused_allgather_and_linear fused_anything_and_reducescatterfused_linear_and_reducescatter)tiled_matmultiled_matmul_outz5xformers_python::sequence_parallel_leading_matmul_fwd cuda)mutates_argsdevice_typesscattered_inputweightsfuseprocess_group_namereturnc                 C   sT   t |}|rt| dd |D |d}|S t| |d}t|ggdd |D g\}|S )Nc                 S   s   g | ]}|  qS r   t.0wr   r   G/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/seqpar.py
<listcomp>*       z8sequence_parallel_leading_matmul_fwd.<locals>.<listcomp>groupprocess_groupc                 S      g | ]}|qS r   r   r   r   r   r   r    2       )r   r   r   r   )r   r   r   r   r%   gathered_outputsgathered_inputr   r   r   $sequence_parallel_leading_matmul_fwd   s   r*   c                    s    t |   fdd|D S )Nc                    s*   g | ]} jd    |jd fqS )r   r   )	new_emptyshaper   mp_sizer   r   r   r    ?   s    z=sequence_parallel_leading_matmul_fwd_fake.<locals>.<listcomp>)r   size)r   r   r   r   r   r-   r   )sequence_parallel_leading_matmul_fwd_fake7   s   r0   z5xformers_python::sequence_parallel_leading_matmul_bwdgrad_gathered_outputsc                    sz  t |}| dd |D }|rtt| }dd D fdd|D dttj dtdtg tjj	f dd ffd	d
}t
||g|d dd D  dttj dtdtg tjj	f dd f fdd}t| g||d |fS t| |d\}	}
tdd |D gdd D \\}|
d ur|
  t||d\}}
tdd |D |	gg}|
d ur|
  dd |D |fS )Nc                 S   s.   g | ]}t d d | D r| n|qS )c                 s       | ]}|d kV  qdS r   Nr   r   sr   r   r   	<genexpr>W       zBsequence_parallel_leading_matmul_bwd.<locals>.<listcomp>.<genexpr>)anystridecloner   grad_gor   r   r   r    V   s     z8sequence_parallel_leading_matmul_bwd.<locals>.<listcomp>c                 S      g | ]}t |qS r   )torch
zeros_liker   r   r   r   r    ]       c                    s   g | ]	}|j  d dqS )r   dim)tensor_splitr;   )r.   r   r   r    _   s    grad_gathered_inputsdst_rankstream_factoryr   c                    sf   | \}t j|  t fddD gdd D |ggd W d    d S 1 s,w   Y  d S )Nc                    s   g | ]}|  qS r   r   )r   grad_gosrE   r   r   r    k   r!   zNsequence_parallel_leading_matmul_bwd.<locals>.my_si_matmul.<locals>.<listcomp>c                 S      g | ]}|  gqS r   r   r   r   r   r   r    l   r@   out)r>   r   streamr   )rD   rE   rF   grad_gi)grad_gathered_outputssr   rH   r   my_si_matmulc   s   "z:sequence_parallel_leading_matmul_bwd.<locals>.my_si_matmulr"   c                 S   s   g | ]}t j qS r   )r>   r   Event)r   _r   r   r   r    y   r@   gathered_inputs_shardsrc_rankc              	      sx   | \}t  D ]0\}}}tj|  |  | ||  | |  W d    n1 s4w   Y  q	d S N)zipr>   r   rL   waitr   addmm_record)rR   rS   rF   gi_shardrG   grad_wevent)eventsrN   grad_weightsr   r   my_w_matmul{   s   
z9sequence_parallel_leading_matmul_bwd.<locals>.my_w_matmulr$   c                 S   r&   r   r   r;   r   r   r   r       r'   c                 S   rI   r   r   r   r   r   r   r       r@   c                 S   rI   r   r   r;   r   r   r   r       r@   c                 S   s   g | ]\}|  qS r   r   )r   rZ   r   r   r   r       r@   )r   r/   r>   
empty_liker   Tensorintr   r   Streamr   r   r   r   rV   r
   )r   r   r1   r   r   r%   grad_scattered_inputrO   r^   r)   handlegrad_gathered_inputgrad_weights_tuplesr   )r\   rN   r]   r.   r   r   $sequence_parallel_leading_matmul_bwdE   sz   

	

rg   c                 C   s   t | dd |D fS )Nc                 S   r=   r   r>   r_   r   r   r   r   r       r@   z=sequence_parallel_leading_matmul_bwd_fake.<locals>.<listcomp>rh   )r   r   r1   r   r   r   r   r   )sequence_parallel_leading_matmul_bwd_fake   s   ri   c                 C   s.   |\}}}}| j |g|R   || _|| _d S rT   save_for_backwardr   r   )ctxinputsoutputr   r   r   r   r   r   r   .sequence_parallel_leading_matmul_setup_context   s   
ro   c                 C   s6   | j ^}}t|t|t|| j| j\}}||d d fS rT   )saved_tensorsrg   listr   r   )rl   r1   r   r   rc   r]   r   r   r   +sequence_parallel_leading_matmul_bwd_bridge   s   
rr   )setup_contextxwsr%   c                   s0   t  dd|||j} fddt||D S )Nr   c                    s6   g | ]\}}|j d g jdd  |jd R  qS )r   )viewr,   )r   or   rt   r   r   r       s   6 z4sequence_parallel_leading_matmul.<locals>.<listcomp>)r*   flatten
group_namerU   )rt   ru   r   r%   osr   rz   r    sequence_parallel_leading_matmul   s   r~   z6xformers_python::sequence_parallel_trailing_matmul_fwdr)   weightc                 C   s>   t |}|rt| | |d}|S t| |}t||d}|S )Nr"   r$   )r   r   r   r>   matmulr	   )r)   r   r   r   r%   scattered_outputgathered_outputr   r   r   %sequence_parallel_trailing_matmul_fwd   s   
r   c                 C   s*   t | }| | jd | |jd fS )Nr   r   )r   r/   r+   r,   )r)   r   r   r   r.   r   r   r   *sequence_parallel_trailing_matmul_fwd_fake   s   r   z6xformers_python::sequence_parallel_trailing_matmul_bwdgrad_scattered_outputc           
         s   t |}| }tdd | D r| }|rXt| }t| j|dd |j|dddt	tj
 dtdtg tjjf dd f fd	d
}t|g||d |fS t||d}	t|	 }t|	 |  |fS )Nc                 s   r2   r3   r   r4   r   r   r   r6     r7   z8sequence_parallel_trailing_matmul_bwd.<locals>.<genexpr>r   rA   grad_gathered_outputs_shardrS   rF   r   c                    s   | \}t j|  t j| | d W d    n1 s!w   Y  t j|   |  |  W d    d S 1 sEw   Y  d S )NrJ   )r>   r   rL   r   r   rW   )r   rS   rF   grad_go_shardgathered_inputsrD   grad_weightr   r   r   my_gi_and_w_matmul  s   "zAsequence_parallel_trailing_matmul_bwd.<locals>.my_gi_and_w_matmulr"   r$   )r   r/   r8   r9   r:   r>   r_   r?   rC   r   r`   ra   r   r   rb   r   r   r   r   )
r)   r   r   r   r   r%   r.   re   r   grad_gathered_outputr   r   r   %sequence_parallel_trailing_matmul_bwd  s<   

r   c                 C   s   t | t |fS rT   rh   )r)   r   r   r   r   r   r   r   *sequence_parallel_trailing_matmul_bwd_fake8  s   r   c                 C   s(   |\}}}}|  || || _|| _d S rT   rj   )rl   rm   rn   r)   r   r   r   r   r   r   /sequence_parallel_trailing_matmul_setup_contextC  s   
r   c                 C   s.   | j \}}t|||| j| j\}}||d d fS rT   )rp   r   r   r   )rl   r   r)   r   re   r   r   r   r   ,sequence_parallel_trailing_matmul_bwd_bridgeJ  s   
r   r   c                C   s>   t | dd|||j}|jdg| jdd |jd R  S )Nr   rv   rw   r   )r   r{   r|   rx   r,   )rt   r   r   r%   ry   r   r   r   !sequence_parallel_trailing_matmul]  s   &r   )*typingr   r   r   r>   "torch.distributed.distributed_c10dr   differentiable_collectivesr   r   r	   r
   sequence_parallel_fused_opsr   r   r   r   r   r   library	custom_opr`   boolstrr*   register_faker0   rg   ri   ro   rr   register_autograddistributedProcessGroupr~   r   r   r   r   r   r   r   r   r   r   r   <module>   sD  

_



2
