o
    mi6                     @   s   d Z ddlZddlZdd Zdd Zdd Zejjj	fd	ej
fd
dZdd Zdejej
 fddZddejej
 defddZdddZdd Zd dejeef fddZdS )!zTorch distributed utilities.    Nc                   C      t j r
t j S dS )Nr   )torchdistributedis_initializedget_rank r   r   D/home/ubuntu/SpeechTokenizer/speechtokenizer/quantization/distrib.pyrank      

r	   c                   C   r   N   )r   r   r   get_world_sizer   r   r   r   
world_size   r
   r   c                   C   s
   t  dkS r   )r   r   r   r   r   is_distributed   s   
r   tensorc                 C   s   t  r
tj| |S d S N)r   r   r   
all_reduce)r   opr   r   r   r       s   r   c                 C   s   t | p	t | S r   )r   is_floating_point
is_complex)r   r   r   r   _is_complex_or_float%   s   r   paramsc                 C   sb   t  r| sd S tjt| g| d jtjd}t| | t| t  kr/t	dt|  dd S )Nr   devicedtypez&Mismatch in number of params: ours is z*, at least one worker has a different one.)
r   r   r   lenr   longr   itemr   RuntimeError)r   r   r   r   r   _check_number_of_params)   s   
r   tensorssrcc                 C   sb   t  sdS dd | D } t|  g }| D ]}tjj|j|dd}|| q|D ]}|  q(dS )zBroadcast the tensors from the given parameters to all workers.
    This can be used to ensure that all workers have the same model to start with.
    Nc                 S   s   g | ]}t |r|qS r   )r   ).0r   r   r   r   
<listcomp>>   s    z%broadcast_tensors.<locals>.<listcomp>Tr!   async_op)r   r   r   r   	broadcastdataappendwait)r    r!   handlesr   handler   r   r   broadcast_tensors8   s   
r,   Tc                 C   s   t  sdS g }| D ])}t|jr2|r!tjj|jtjjjdd}n
tjj|jddd}|	||f q	|D ]\}}|
  |rF| jt  _q5dS )zU
    Sync grad for buffers. If average is False, broadcast instead of averaging.
    NTr   r%   r   r$   )r   r   r   r'   r   r   ReduceOpSUMr&   r(   r)   r   )buffersaverager*   bufferr+   r   r   r   sync_bufferI   s(   r3   c                 C   sx   t  sdS g }| D ]}|jdur%tjj|jjtjjjdd}|||f q	|D ]\}}|	  |j jt
   _q(dS )z
    Simpler alternative to DistributedDataParallel, that doesn't rely
    on any black magic. For simple models it can also be as fast.
    Just call this on your model parameters after the call to backward!
    NTr-   )r   gradr   r   r   r'   r.   r/   r(   r)   r   )r   r*   pr+   r   r   r   	sync_grad_   s   
r6         ?metricsc                 C   s   t  s| S t|   \}}tj rdnd}tjt|dg |tjd}||9 }t	| |dd |d  
  }tt||S )znAverage a dictionary of metrics across all workers, using the optional
    `count` as unormalized weight.
    cudacpur   r   N)r   zipitemsr   r9   is_availabler   listfloat32r   r:   tolistdict)r8   countkeysvaluesr   r   averagedr   r   r   average_metricsr   s   rG   )r   )T)r7   )__doc__typingtpr   r	   r   r   r   r.   r/   Tensorr   r   Listr   Iterableintr,   r3   r6   DictstrfloatrG   r   r   r   r   <module>   s   
