o
     ╟╧iГ  у                   @   s0  U d dl Z d dlmZmZmZmZmZmZmZ d dl	Z	d dl
mZ d dlZ	d dlmZ d ZdZdZde	jfddДZG d	d
Д d
ГZi Zeeee f ed< dejdefddДZde	jdejdee fddДZde	jjfddДZ eddddddЬde	j!de	j!dejdee	j! dedee	j! deee	j!ee	j! f  dee	j de	j!fddДГZ"eddddddЬde	j!dee	j! dejdeee	j!  dedee	j! deee	j!ee	j! f  dee	j dee	j! fd dДГZ"ddddddЬde	j!dee	j!ee	j! f dejdeee	j!ee	j! f  dedee	j! deee	j!ee	j! f  dee	j dee	j!ee	j! f fd!dДZ"e	j#j$d"d#hd$d%Нde	j!d&ee	j! d'e%d#ee	j! ded(ed)ede	j!d*eee	j!  ddfd+d,ДГZ&dd-Ьd.ee	j! d/eee	j! eeg e	jjf gdf dejdeddf
d0d1ДZ'edddddd2Ьd3e	j!de	j!dejdee	j! ded4ee	j! deee	j!ee	j! f  dee	j de	j!fd5d6ДГZ(edddddd2Ьd3e	j!dee	j! dejdeee	j!  ded4ee	j! deee	j!ee	j! f  dee	j dee	j! fd7d6ДГZ(dddddd2Ьd3e	j!dee	j!ee	j! f dejdeee	j!ee	j! f  ded4ee	j! deee	j!ee	j! f  dee	j dee	j!ee	j! f fd8d6ДZ(e	j#j$d9d:hd$d%Нd3e	j!d&ee	j! d'e%d:ee	j! ded(ed)ed4e	j!d*eee	j!  ddfd;d<ДГZ)dd-Ьd/eee	j! eeg e	jjf gdf d:ee	j! dejdeddf
d=d>ДZ*dS )?щ    N)┌Callable┌Dict┌List┌Optional┌Sequence┌Union┌overload)┌get_symm_mem_workspaceщ   iш  ┌dtc                 C   s   | j o
tа| бjdkS йNщ   )┌is_floating_point┌torch┌finfo┌bits)r   й r   ·\/home/ubuntu/.local/lib/python3.10/site-packages/xformers/ops/sequence_parallel_fused_ops.py┌_is_fp8_dtype   s   r   c                   @   sц   e Zd ZdZdejdejfddДZdej	j
deg ej	j
f fdd	ДZ	
	
ddeej deeej eeg ej	j
f gdf dededef
ddДZ	
	
ddeeej eeg ej	j
f gdf deej deej dededefddДZdS )┌_FusedSequenceParallelaI	  Set up a communication ring and perform fused ops on it

    Stores the persistent state needed to support a ring of connections between
    processes, and the logic that can do fused comms + matmuls on it.

    We want to achieve overlap between:
    - a computation which reads from the data we received from a remote GPU
    - and the communication where we send some data to another GPU
    And in order to do that we need some staging buffers and a way to
    synchronize access to them across processes.

    To perform the communication over NVLink we make the processes exchange
    their staging buffers using IPC (Inter-Process Communication) handles, which
    "mounts"/"mmaps" an allocation on one GPU into the virtual address space of
    another GPU: the memory remains backed by the original GPU but the other GPU
    can access it as if it were local. We exchange these IPC handles using
    multiprocessing Connections (and the "reductions" provided by PyTorch),
    which we establish over UNIX domain sockets, whose addresses we exchange by
    using a ProcessGroup.

    To synchronize accesses we use a set of counters/sequence numbers that are
    also allocated in memory shared over IPC handles. Processes signal that they
    completed an operation by launching a kernel that increases that value, and
    they wait for anoher process to complete an operation by launching a kernel
    that busy-waits for that value to increase. Currently we implement these
    kernels manually, but on recent CUDA drivers (515.43.04+, corresponding to
    CUDA 11.7) we could use standard stream memory operations (see
    https://docs.nvidia.com/cuda/archive/11.7.0/cuda-driver-api/group__CUDA__MEMOP.html).

    We prefer to use these kernels (or the stream memory ops) over IPC events
    because IPC events require signaling between processes at launch time to
    ensure that the wait on one process occurs after the record on another
    process. This signaling means that _launching_ our fused operation becomes a
    synchronization barrier, which can increase the launch overhead. It would
    also behave differently from NCCL, where launching is async and all the
    synchronization happens on device in the kernels. A previous version of this
    code which uses IPC events can be found here:
    https://github.com/fairinternal/xformers/pull/504.

    ┌device┌groupc                 C   sf   || _ |аб | _|аб | _|| _tjаб | _	tjjddН| _
tjjddН| _tjjddН| _d| _d S )Nщ    )┌priorityr   )┌	my_device┌rank┌my_rank┌size┌
world_sizer   r   ┌cuda┌Stream┌second_stream┌memcpy_stream┌compute_wait_stream┌memcpy_wait_stream┌next_stream_idx)┌selfr   r   r   r   r   ┌__init__D   s   


z_FusedSequenceParallel.__init__┌current_stream┌returnc                    s   З ЗfddД}|S )Nc                     s0   И Иj gИj } И jd7  _И jd;  _| S )Nr
   щ   )r!   r%   )┌streamйr(   r&   r   r   ┌result\   s   z:_FusedSequenceParallel.make_stream_factory.<locals>.resultr   )r&   r(   r-   r   r,   r   ┌make_stream_factoryY   s   z*_FusedSequenceParallel.make_stream_factoryT┌scattered_inputs┌	my_matmulN┌	timeout_s┌_wait┌_memcpyc              	      sT  Иd j Й tЗfddДИD ГГsJ ВtЗ fddДИD ГГsJ ВddД ИD ГЙtИГЙtjаИjбП& tИjj	Иj
И И j ГЙЗ ЗЗЗЗЗfddДtИj
ГD Г}W d  Г n1 sZw   Y  tjаб }td	Иj
ГD ](}Иj| Иj
 Й|rТtjа|бП ИаИtб W d  Г n1 sНw   Y  qjИjа|б Иjа|б Иjа|б Иа|б}	td	Иj
ГD ]Р}Иj| Иj
 }
|r▌tjаИjбП Иj|
t|t d
Н W d  Г n1 s╪w   Y  ИjаИjб |РrtjаИjбП t||
 ИГD ]\}}|Иj а|б qЎW d  Г n	1 Рsw   Y  |Рr@tjаИjбП ИjИа|
бИj
t Иj d	d	dН W d  Г n	1 Рs;w   Y  q░|ИИj|	Г td	Иj
ГD ]M}Иj| Иj
 Й|Рr}tjаИjбП ИjИt|t d
Н W d  Г n	1 Рsxw   Y  |аИjб ИjаИjб |ЗfddД|Иj D ГИ|	Г РqN|аИjб |аИjб dS )z5Perform a fused all-gather followed by a linear layerr   c                 3   є   Б | ]	}|j И jkV  qd S йNйr   r   й┌.0┌siйr&   r   r   ┌	<genexpr>q   є   А z>_FusedSequenceParallel.allgather_and_linear.<locals>.<genexpr>c                 3   є   Б | ]}|j И kV  qd S r5   й┌dtyper7   r>   r   r   r;   r   є   А c                 S   є   g | ]}|а б СqS r   й┌numelr7   r   r   r   ┌
<listcomp>t   є    z?_FusedSequenceParallel.allgather_and_linear.<locals>.<listcomp>c              	      є>   g | ]}Зfd dДt Иа|ИjИgИ бjИddНИГD ГСqS )c                    є$   g | ]\}}|а И jf|j бСqS r   й┌viewr   ┌shape)r8   ┌sr9   r:   r   r   rD   ~   є      zJ_FusedSequenceParallel.allgather_and_linear.<locals>.<listcomp>.<listcomp>r   й┌dimй┌zip┌
get_bufferr   ┌splitйr8   r   )r?   ┌scattered_input_numelsr/   r&   ┌symm_mem┌total_scattered_input_numelr   r   rD   }   є    

ў ■№■ Nr
   й┌
timeout_msй┌val┌countc                    є   g | ]}|И  СqS r   r   йr8   rK   й┌src_rankr   r   rD   ╦   rE   )r?   ┌all┌sumr   r   r   r   r	   r   ┌
group_namer   ┌itemsize┌ranger(   r   r+   ┌
put_signal┌OP_FINISHED_CHANNELr!   ┌wait_streamr#   r$   r.   ┌wait_signal┌MS_IN_Sr"   rP   ┌copy_┌memset32┌get_signal_pad┌COMMS_READY_CHANNEL)r&   r/   r0   r1   r2   r3   ┌buffersr(   ┌iter_┌stream_factory┌dst_rank┌bsr9   r   )r?   rT   r/   r&   r`   rU   rV   r   ┌allgather_and_lineard   sО   
■
Ў·
 А
¤   № А¤  z+_FusedSequenceParallel.allgather_and_linear┌gathered_outputs┌scattered_outputsc              	      sю  |d j ЙtЗfddД|D ГГsJ ВtЗfddД|D ГГsJ ВtЗfddДИD ГГs,J ВtЗfddДИD ГГs9J ВddД ИD ГЙtИГЙtjаИjбП& tИjj	Иj
И Иj ГЙЗЗЗЗЗЗfd	dДtИj
ГD Г}W d
  Г n1 stw   Y  tjаб }tdИj
ГD ](}	Иj|	 Иj
 }
|rмtjа|бП Иа|
tб W d
  Г n1 sзw   Y  qДИjа|б Иjа|б Иjа|б Иа|б}tdИj
ГD ]Н}	Иj|	 Иj
 Й |rўtjаИjбП ИjИ t|t dН W d
  Г n1 sЄw   Y  |аИjб ИjаИjб |З fddД|Иj D ГИ |Г |ИjgИjd d  }|а|б |аИjб |РrWtjа|бП ИjИаИ бИj
t Иj dddН W d
  Г n	1 РsRw   Y  q╩|ЗfddД|D ГИj|Г tdИj
ГD ]i}	Иj|	 Иj
 }
|РrЫtjаИjбП Иj|
t|t dН W d
  Г n	1 РsЦw   Y  ИjаИjб |Рr╘tjаИjбП t|||
 ГD ]\}}||
 а|Иj б Рq┤W d
  Г n	1 Рs╧w   Y  Рql|аИjб |аИjб t|ИГD ]\}}tj|d|dН Рqчd
S )z9Perform a fused linear layer followed by a reduce-scatterr   c                 3   r4   r5   r6   йr8   ┌gor:   r   r   r;   ▀   r<   zB_FusedSequenceParallel.linear_and_reducescatter.<locals>.<genexpr>c                 3   r=   r5   r>   rw   r>   r   r   r;   р   r@   c                 3   r4   r5   r6   йr8   ┌sor:   r   r   r;   с   r<   c                 3   r=   r5   r>   ry   r>   r   r   r;   т   r@   c                 S   rA   r   rB   ry   r   r   r   rD   ф   rE   zC_FusedSequenceParallel.linear_and_reducescatter.<locals>.<listcomp>c              	      rF   )c                    rG   r   rH   )r8   rK   rz   r:   r   r   rD   ю   rL   zN_FusedSequenceParallel.linear_and_reducescatter.<locals>.<listcomp>.<listcomp>r   rM   rO   rS   )r?   ┌scattered_output_numelsrv   r&   rU   ┌total_scattered_output_numelr   r   rD   э   rW   Nr
   rX   c                    r]   r   r   r^   йrr   r   r   rD     rE   r*   rZ   c                    s   g | ]}|И j  СqS r   )r   )r8   ┌or:   r   r   rD   0  s    )rN   ┌out) r?   ra   rb   r   r   r   r   r	   r   rc   r   rd   re   r(   r   r+   rf   rg   r!   rh   r#   r$   r.   ri   rj   r%   rl   rm   rn   r"   rP   rk   )r&   r0   ru   rv   r1   r2   r3   ro   r(   rp   r`   rq   ┌final_streamrx   rs   rz   r   )rr   r?   r{   rv   r&   rU   r|   r   ┌linear_and_reducescatter╤   sм   
■
Ў·
 А
¤   
№ А¤¤   А z/_FusedSequenceParallel.linear_and_reducescatter)TT)┌__name__┌
__module__┌__qualname__┌__doc__r   r   ┌dist┌ProcessGroupr'   r   r    r   r.   r   ┌Tensor┌int┌boolrt   rБ   r   r   r   r   r      sX    )■
¤ 
■°■ ¤·∙
°uў ■√·∙°	ўr   ┌CACHEr   r)   c                 C   s   | а б dkS r   )r   )r   r   r   r   ┌-_can_ranks_communicate_all_to_all_over_nvlinkV  s   
rМ   r   c                 C   sz   |а б }z	tt|Г }W |S  ty<   ttjаddбГr d }n|dkr'd }nt|Гs.d }nt	| |Г}|tt|Г< Y |S w )N┌DISABLE_FUSED_SEQUENCE_PARALLEL┌0r
   )
r   rЛ   ┌id┌KeyErrorrЙ   ┌os┌environ┌getrМ   r   )r   r   r   ┌objr   r   r   ┌
_lazy_initc  s   Ў
ЎrХ   c                   C   s
   t jаб S r5   )r   r   r(   r   r   r   r   ┌_default_stream_factoryv  s   
rЦ   i  )r   r1   ┌scale_scattered_input┌scale_weight┌	out_dtype┌scattered_input┌weightr   r1   rЧ   rШ   rЩ   c          	      K   є   d S r5   r   й	rЪ   rЫ   r   r   r1   rЧ   rШ   rЩ   ┌private_args_DO_NOT_USEr   r   r   ┌fused_allgather_and_linearz  є   rЯ   c          	      K   rЬ   r5   r   rЭ   r   r   r   rЯ   К  rа   c                   sJ  |а б }	t|tГr|n|g}
|du |du ksJ В|durWt|tГt|tГks(J Вt|tГr/|n|g}t|
Гt|Гks<J ВtИjГsCJ ВtddД |
D ГГsNJ ВИdusVJ dГВndgt|
Г }tddД |
D ГГsiJ ВИjdkspJ ВtЗfddД|
D ГГs}J ВИаб sГJ В|	fИj	 Й З fdd	Д|
D Г}|durэt|tГt|tГksвJ Вt|tГrй|n|g}t|Гt|Гks╢J Вtd
dД t
||ГD ГГs─J ВtddД |D ГГs╧J ВИdurьt|tГrх|D ]	}|jИksуJ Вq┌n|jИksьJ Вn
ЗЗfdd	Д|D Г}tjjjИ|
|j|||аddб|аddб||dН	 t|tГРrdd	Д |D ГS |d аddбS )aГ  Performs a fused all-gather followed by a linear op

    It is equivalent to the following plain PyTorch code:

    # like scattered_input but with first dim multiplied by group's world size
    gathered_input = scattered_input.new_empty(...)
    dist.all_gather_into_tensor(gathered_input, scattered_input, group=group)
    return torch.nn.functional.linear(gathered_input, weight)

    It achieves this by breaking down the matmul into smaller partial ops (as
    many as the world size), each needing as input a different "contribution"
    to the all-gather (by a different rank), and writing to a different chunk of
    the output. Then, on one stream, it sends the local contribution to all
    other ranks (first one rank over, then two, ...) while, on another stream,
    it launches the sub-matmuls in the order in which the remote contributions
    (which are the sub-matmuls' inputs) are supposed to arrive, so that ideally
    none of the sub-matmuls will ever have to wait.

    The idea comes from this paper: https://arxiv.org/abs/2302.05442

    This method uses a staging buffer, which persists across calls, of the same
    size as the all-gathered input tensor (i.e., the input's size times the
    world size). If multiple inputs of multiple sizes are used, the staging
    buffer will be the maximum needed by any of them. Each call, when it starts,
    must first wait for the previous call to finish using the staging buffer. In
    normal conditions, where there's some other operation between two calls,
    this isn't an issue.

    Supports FP8 gemm for tensor-wise quantized weight and input tensors.
    To enable FP8 gemm:
    1. pass scattered_input and weight as quantized FP8 datatype
    2. pass scale_scattered_input and scale_weight, the scales used to
    quantize input and weight, respectively.
    3. set out_dtype, if not specified, will be inferred from scattered_input type.

    Nc                 s   є   Б | ]}t |jГV  qd S r5   йr   r?   йr8   ┌wr   r   r   r;   ╘  r@   z-fused_allgather_and_linear.<locals>.<genexpr>·!output_dtype is required with FP8c                 s   є   Б | ]}|j d kV  qdS йr*   Nй┌ndimrг   r   r   r   r;   ╪  r@   r*   c                 3   є$   Б | ]}И j d  |j d  kV  qdS йr   NйrJ   rг   )rЪ   r   r   r;   ┌  є   А" c                    s&   g | ]}И d dЕ |j d dЕ  СqS йNr   rм   rг   )┌gathered_input_shaper   r   rD   ▌  s   & z.fused_allgather_and_linear.<locals>.<listcomp>c                 s   є   Б | ]
\}}|j |kV  qd S r5   rм   )r8   rx   ┌gosr   r   r   r;   т  s   А 
 c                 s   є   Б | ]}|а б V  qd S r5   й┌is_contiguousrw   r   r   r   r;   х  є   А c                    s(   g | ]}Иj |И d urИ nИjdНСqS йNr>   й┌	new_emptyr?   йr8   r▒   )rЩ   rЪ   r   r   rD   э  є    №■ r2   Tr3   )r1   r2   r3   rЧ   ┌scales_weightsc                 S   s   g | ]}|а d dбСqS )r   r
   )┌flattenrw   r   r   r   rD     є    r   r
   )r   ┌
isinstance┌list┌lenr   r?   ra   rй   r┤   rJ   rP   r   ┌ops┌xformers_python┌ _fused_allgather_and_linear_implrc   rУ   r╝   )rЪ   rЫ   r   r   r1   rЧ   rШ   rЩ   rЮ   r   ┌weightsr╗   ┌gathered_output_shapesru   r~   r   )rп   rЩ   rЪ   r   rЯ   Ъ  sd   0  
 А√

ўz1xformers_python::_fused_allgather_and_linear_implru   r   )┌mutates_args┌device_typesr─   ┌process_group_namer2   r3   r╗   c	                    s\   t jа|б}	dttj dtdtg tjj	f dd fЗ ЗЗЗfddД}
t
| g|
|	|||dН d S )N┌inputsr`   rq   r)   c              
      sа   t ИИИ ГD ]G\}}}tjа|Г бП3 Иd ur0|d ur0tj| d |аб || jИ||| dН ntj| d |аб || dН W d   Г n1 sHw   Y  qd S )Nr   йrЩ   ┌scale_a┌scale_br   йr   йrP   r   r   r+   ┌
_scaled_mm┌tr?   ┌matmul)r╔   r`   rq   rд   rШ   rx   йru   rЧ   r╗   r─   r   r   r0     s    ·	АїА z8_fused_allgather_and_linear_custom_op.<locals>.my_matmulйr   r1   r2   r3   )rЖ   ┌distributed_c10d┌_resolve_process_groupr   r   rИ   rЙ   r   r   r    ┌fused_allgather_and_anything)rЪ   r─   r╚   ru   r1   r2   r3   rЧ   r╗   ┌process_groupr0   r   r╥   r   ┌%_fused_allgather_and_linear_custom_op  s$    ■¤№
·r╪   )r1   r/   r0   c          
   	      s^  |а б ЙtИ ГdkrtИГD ]Й|g ИtГ qd S tddД И D ГГs$J ВtЗ fddДИ D ГГs1J ВtЗ fddДИ D ГГs>J ВЗfddДИ D Г}tИ d j|Г}Иdkr[|И dtГ d S |d u rСd	dД tИ |ГD Г}tИ |ГD ]\}}	tj	|	||d
Н qntИГD ]Й|ЗfddД|D ГИtГ qd S И d j|j
ksЫJ В|jИ |||аddб|аddбdН d S )Nr   c                 s   r▓   r5   r│   r7   r   r   r   r;   G  r╡   z/fused_allgather_and_anything.<locals>.<genexpr>c                 3   є    Б | ]}|j И d  j kV  qdS йr   Nйr   r7   йr/   r   r   r;   H  є   А c                 3   r┘   r┌   r>   r7   r▄   r   r   r;   I  r▌   c                    є   g | ]}И f|j  СqS r   rм   r7   йr   r   r   rD   K  r╜   z0fused_allgather_and_anything.<locals>.<listcomp>r
   c                 S   є   g | ]	\}}|а |бСqS r   йr╕   )r8   r9   ┌gisr   r   r   rD   T  є      )┌output_tensor┌input_tensorr   c                    r]   r   r   )r8   ┌gir_   r   r   rD   \  rE   r2   Tr3   йr1   r2   r3   )r   r└   re   rЦ   ra   rХ   r   rP   rЖ   ┌all_gather_into_tensorr   rt   rУ   )
r/   r0   r   r1   rЮ   ┌gathered_input_shapesrФ   ┌gathered_inputsr9   rц   r   )r/   r`   r   r   r╓   6  sB   
■¤ 	


√r╓   )r   r1   ┌scale_gathered_inputrШ   rЩ   ┌gathered_inputrы   c          	      K   rЬ   r5   r   й	rь   rЫ   r   r   r1   rы   rШ   rЩ   rЮ   r   r   r   ┌fused_linear_and_reducescatterm  rа   rю   c          	      K   rЬ   r5   r   rэ   r   r   r   rю   }  rа   c                   sЬ  |а б }	t|tГr|n|g}
|du |du ksJ В|durWt|tГt|tГks(J Вt|tГr/|n|g}t|
Гt|Гks<J ВtИ jГsCJ ВtddД |
D ГГsNJ ВИdusVJ dГВndgt|
Г }tddД |
D ГГsiJ ВИ jdkspJ ВtЗ fddД|
D ГГs}J ВИ аб sГJ ВИ j	d |	 dksОJ ВИ а
|	И j	d |	 fИ j	d	dЕ  бЙ З fd
dД|
D Г}ddД |D Г}|duРrt|tГt|tГks┬J Вt|tГr╔|n|g}t|Г|ks╘J ВtЗ fddД|D ГГsсJ ВtЗ fddД|D ГГsюJ ВtddД t||ГD ГГs№J ВИduРrt|tГРr|D ]}|jИkРsJ ВРq	n|jИkРsJ Вn
З ЗfddД|D Г}tjjjИ |
|j|||аddб|аddб||dН	 t|tГРrJ|S |d S )a	  Performs a fused linear op followed by a reduce-scatter

    It is equivalent to the following plain PyTorch code:

    gathered_output = torch.nn.functional.linear(gathered_input, weight)
    # like gathered_output but with first dim divided by group's world size
    scattered_output = gathered_output.new_empty(...)
    dist.reduce_scatter_tensor(scattered_output, gathered_output, group=group)

    Supports FP8 gemm with tensor-wise quantized weights. To enable FP8 gemm:
    1. pass weight and gathered_input as FP8 tensors
    2. Set `scale_gathered_input` and `scale_weight` to the scales used to quantize
    inputs and weight, respectively.
    3. Set out_dtype to the desired output dtype. If not specified, it will be inferred from
    gathered_input datatype.
    Nc                 s   rб   r5   rв   rг   r   r   r   r;   │  r@   z1fused_linear_and_reducescatter.<locals>.<genexpr>rе   c                 s   rж   rз   rи   rг   r   r   r   r;   ╖  r@   r*   c                 3   rк   rл   rм   rг   йrь   r   r   r;   ╣  rн   r   r
   c                    s(   g | ]}И j d dЕ |j d dЕ  СqS rо   rм   rг   rя   r   r   rD   ┐  s   ( z2fused_linear_and_reducescatter.<locals>.<listcomp>c                 S   s   g | ]}|d dЕ СqS )r
   Nr   r╣   r   r   r   rD   └  r╜   c                 3   є   Б | ]	}|j И j kV  qd S r5   r█   ry   rя   r   r   r;   ┼  r<   c                 3   rЁ   r5   r>   ry   rя   r   r   r;   ╞  r<   c                 s   r░   r5   rм   )r8   rz   ┌sosr   r   r   r;   ╟  s
   А  
 c                    s(   g | ]}И j |Иd urИnИ jdНСqS r╢   r╖   )r8   rё   йrь   rЩ   r   r   rD   ╥  r║   r2   Tr3   )r1   r2   r3   rы   r╗   )r   r╛   r┐   r└   r   r?   ra   rй   r┤   rJ   rI   rP   r   r┴   r┬   ┌$_fused_linear_and_reducescatter_implrc   rУ   )rь   rЫ   r   r   r1   rы   rШ   rЩ   rЮ   r   r─   r╗   r┼   ┌scattered_output_shapesrv   r~   r   rЄ   r   rю   Н  sn     
■
 А√

ўz5xformers_python::_fused_linear_and_reducescatter_implrv   c	                    sZ   t jа|б}	dttj dtdtg tjj	f dd fЗ ЗЗЗfddД}
t
|
||	|||dН d S )N┌outputsrr   rq   r)   c              
      sФ   t ИИ| ГD ]A\}}}tjа|Г бП- Иd ur,|d ur,tjИ | |аб |jИ||dН ntjИ | |аб |dН W d   Г n1 sBw   Y  qd S )Nr╩   r═   r╬   )rї   rr   rq   rд   rШ   r~   йrь   rы   r╗   r─   r   r   r0   ■  s    ·	АїА z<_fused_linear_and_reducescatter_custom_op.<locals>.my_matmulr╙   )rЖ   r╘   r╒   r   r   rИ   rЙ   r   r   r    ┌ fused_anything_and_reducescatter)rь   r─   r╚   rv   r1   r2   r3   rы   r╗   r╫   r0   r   rЎ   r   ┌)_fused_linear_and_reducescatter_custom_opь  s$    ■¤№
·r°   c          
   
      sr  |а б ЙtИГdkrtИГD ]Й | g И tГ qd S tddД ИD ГГs$J ВtЗfddДИD ГГs1J ВtЗfddДИD ГГs>J ВЗfddДИD Г}tИd j|Г}Иdkr[| ИdtГ d S |d u rСd	dД tИ|ГD Г}tИГD ]Й | З fd
dД|D ГИ tГ qmt|ИГD ]\}}	tj	|	||dН qВd S Иd j|j
ksЫJ ВЗfddД|D Г}|j| |И||аddб|аddбdН d S )Nr   c                 s   r▓   r5   r│   ry   r   r   r   r;   ,  r╡   z3fused_anything_and_reducescatter.<locals>.<genexpr>c                 3   r┘   r┌   r█   ry   йrv   r   r   r;   -  r▌   c                 3   r┘   r┌   r>   ry   r∙   r   r   r;   .  r▌   c                    r▐   r   rм   ry   r▀   r   r   rD   0  r╜   z4fused_anything_and_reducescatter.<locals>.<listcomp>r
   c                 S   rр   r   rс   )r8   rz   r▒   r   r   r   rD   9  rу   c                    r]   r   r   rw   r}   r   r   rD   ?  rE   )┌output┌inputr   c                    s   g | ]	}И d  а |бСqS )r   rс   r╣   r∙   r   r   rD   I  s     r2   Tr3   rч   )r   r└   re   rЦ   ra   rХ   r   rP   rЖ   ┌reduce_scatter_tensorr   rБ   rУ   )
r0   rv   r   r1   rЮ   r┼   rФ   ru   rx   rz   r   )rr   rv   r   r   rў     sJ   
■¤ 
 


·rў   )+rС   ┌typingr   r   r   r   r   r   r   r   ┌torch.distributed┌distributedrЖ   ┌ torch.multiprocessing.reductions┌#torch.distributed._symmetric_memoryr	   rg   rn   rj   r?   r   r   rЛ   rЙ   ┌__annotations__rЗ   rК   rМ   r   rХ   r   r    rЦ   rИ   rЯ   ┌library┌	custom_op┌strr╪   r╓   rю   r°   rў   r   r   r   r   ┌<module>   sЎ  
$  ;  
■ў ■№√·∙°	ўїў ■№√·∙°	ўїў ■№√·∙°	ў
їm¤ ■¤№√·∙°	ў
Ў1∙  ■·∙	
ў7ў ■№√·∙°	ўїў ■№√·∙°	ўїў ■№√·∙°	ў
ї_¤ ■¤№√·∙°	ў
Ў1∙  №·∙	ў