o
    闦iF                     @   s   U d dl Z d dlmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
 G dd deZG dd	 d	eZG d
d deZG dd deZdaeeeej   ed< dejfddZdS )    N)ListOptional_get_device_index)Function)commc                   @   $   e Zd Zedd Zedd ZdS )	Broadcastc                    s   t dd |D sJ ddd |D }|| _t|dkrdS t|| _|d  | _t|| j}g }t| j	dd  D ]\ }|sO|
 fd	d|D  q=| j|  td
d |D S )Nc                 s       | ]	}|j jd kV  qdS cpuNdevicetype.0i r   Z/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/torch/nn/parallel/_functions.py	<genexpr>       
z$Broadcast.forward.<locals>.<genexpr>z2Broadcast function not implemented for CPU tensorsc                 S      g | ]}t |d qS Tr   r   xr   r   r   
<listcomp>       z%Broadcast.forward.<locals>.<listcomp>r   r      c                 3   s    | ]}|  V  qd S Nr   )r   outputidxr   r   r          c                 S   s   g | ]	}|D ]}|qqS r   r   )r   tensorstr   r   r   r      s    )alltarget_gpuslen
num_inputs
get_deviceinput_devicer   broadcast_coalesced	enumerateneeds_input_gradextendmark_non_differentiabletuple)ctxr&   inputsoutputsnon_differentiablesinput_requires_gradr   r    r   forward   s&   


zBroadcast.forwardc                 G   s   dt j| j| jg|R   S )Nr   )ReduceAddCoalescedapplyr*   r(   r1   grad_outputsr   r   r   backward   s
   
zBroadcast.backwardN__name__
__module____qualname__staticmethodr6   r;   r   r   r   r   r	   
   s
    
r	   c                   @   r   )r7   c                    sL    fddt dt D | _ fddt dt D }t||S )Nc                    s   g | ]} |   qS r   r)   r   )gradsr   r   r   (   s    z.ReduceAddCoalesced.forward.<locals>.<listcomp>r   c                    s   g | ]
} ||  qS r   r   r   rB   r(   r   r   r   ,   s    )ranger'   r&   r   reduce_add_coalesced)r1   destinationr(   rB   grads_r   rC   r   r6   &   s
   
 zReduceAddCoalesced.forwardc                 G   s   dt j| jg|R   S )NNN)r	   r8   r&   r9   r   r   r   r;   /   s   zReduceAddCoalesced.backwardNr<   r   r   r   r   r7   %   s
    
r7   c                   @   r   )Gatherc                    s   t dd |D sJ d|dkrd _nt|d}| _| _tdd |D  _t dd |D rI|dkrItd	d |D }td
 d _nd _t fdd|D  _	t
| j jS )Nc                 s   r
   r   r   r   r   r   r   r   :   r   z!Gather.forward.<locals>.<genexpr>z/Gather function not implemented for CPU tensorsr   Tc                 s   s    | ]}|  V  qd S r   rA   r   r   r   r   r   C   r"   c                 s   s    | ]	}|  d kV  qdS r   N)dimr   r$   r   r   r   r   D       r   c                 s   s    | ]}| d V  qdS )r   N)viewrL   r   r   r   r   E   s    zvWas asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.Fc                 3   s    | ]	}|  jV  qd S r   )sizerK   r   r1   r   r   r   N   rM   )r%   target_devicer   rK   r0   
input_gpuswarningswarnunsqueezed_scalarinput_sizesr   gather)r1   rQ   rK   r2   r   rP   r   r6   8   s(   

zGather.forwardc                 C   s6   t | j| j| j|}| jrtdd |D }d| S )Nc                 s   s    | ]}|d  V  qdS rJ   r   )r   gr   r   r   r   W   r"   z"Gather.backward.<locals>.<genexpr>rH   )Scatterr8   rR   rV   rK   rU   r0   )r1   grad_outputscattered_gradsr   r   r   r;   Q   s   zGather.backwardNr<   r   r   r   r   rI   7   s
    
rI   c                   @   r   )rY   c           
   	   C   s   dd |D }|| _ |jjdkr| nd| _d }tj r*| jdkr*dd |D }t	|||| j |}|d urjt
|D ]-\}}tj||  tj }	|	||  ||	 W d    n1 sdw   Y  q<|S )Nc                 S   r   r   r   r   r   r   r   r   ^   r   z#Scatter.forward.<locals>.<listcomp>r   c                 S   s   g | ]
}t td |qS )cuda)_get_streamtorchr   )r   r   r   r   r   r   d   s    )rK   r   r   r)   r*   r_   r]   is_availabler   scatterr,   current_streamwait_streamrecord_stream)
r1   r&   chunk_sizesrK   inputstreamsr3   r   r   main_streamr   r   r   r6   \   s$   
zScatter.forwardc                 G   s    d d d t j| j| jg|R  fS r   )rI   r8   r*   rK   )r1   rZ   r   r   r   r;   q   s    zScatter.backwardNr<   r   r   r   r   rY   [   s
    
rY   _streamsr   c                 C   sh   | j dkrdS tt| j d}|du rdS tdu rdg|  at| j du r/|| jt| j< t| j S )zBGet a background stream for copying between CPU and target device.r   N)r   getattrr_   ri   device_countindexStream)r   
device_modr   r   r   r^   z   s   

r^   )rS   typingr   r   r_   torch._utilsr   torch.autogradr   torch.nn.parallelr   r	   r7   rI   rY   ri   rm   __annotations__r   r^   r   r   r   r   <module>   s   
 $