import torch
import torch.distributed as dist
from megatron_util import mpu
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from torch.autograd import Variable
from torch.nn.modules import Module
from torch.nn.parallel.distributed import DistributedDataParallel as DDP


class PyTorchDistributedDataParallel(DDP):
    """Thin wrapper around torch DDP that forwards named_parameters,
    state_dict and load_state_dict to the wrapped module."""

    def named_parameters(self, prefix: str = '', recurse: bool = True):
        return self.module.named_parameters(prefix=prefix, recurse=recurse)

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        sd = self.module.state_dict(destination, prefix, keep_vars)
        return sd

    def load_state_dict(self, state_dict, strict=True):
        return self.module.load_state_dict(state_dict, strict=strict)


class DistributedDataParallel(Module):
    """Data-parallel wrapper that broadcasts parameters at construction time
    and all-reduces gradients across the data-parallel group on demand."""

    def __init__(self, module):
        super(DistributedDataParallel, self).__init__()
        # Half-precision all-reduce over the gloo backend is very slow; warn once.
        self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

        self.module = module
        self.data_parallel_group = mpu.get_data_parallel_group()
        src_rank = mpu.get_model_parallel_rank()
        # Make sure every data-parallel replica starts from identical weights.
        for p in self.module.parameters():
            if torch.is_tensor(p):
                dist.broadcast(p, src_rank, group=self.data_parallel_group)

        def allreduce_params(reduce_after=True,
                             no_scale=False,
                             fp32_allreduce=False):
            if self.needs_reduction:
                self.needs_reduction = False
                # Bucket gradients by tensor type so each bucket can be
                # flattened and reduced with a single collective call.
                buckets = {}
                for name, param in self.module.named_parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = (param.data.type())
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print('WARNING: gloo dist backend for half parameters may '
                              'be extremely slow. It is recommended to use the '
                              'NCCL backend in this case.')
                        self.warn_on_half = False
                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    if fp32_allreduce:
                        coalesced = coalesced.float()
                    # Scale by world size either before or after the
                    # all-reduce, unless scaling is disabled entirely.
                    if not no_scale and not reduce_after:
                        coalesced /= dist.get_world_size(
                            group=self.data_parallel_group)
                    dist.all_reduce(coalesced, group=self.data_parallel_group)
                    torch.cuda.synchronize()
                    if not no_scale and reduce_after:
                        coalesced /= dist.get_world_size(
                            group=self.data_parallel_group)
                    # Copy the reduced values back into the original grad tensors.
                    for buf, synced in zip(
                            grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)

        # The backward hook is defined but never registered here; gradient
        # reduction is triggered explicitly via self.allreduce_params.
        self.hook_handles = []
        self.hooks = []
        for param in list(self.module.parameters()):

            def allreduce_hook(*unused):
                Variable._execution_engine.queue_callback(allreduce_params)

        self.allreduce_params = allreduce_params

    def forward(self, *inputs, **kwargs):
        self.needs_reduction = True
        return self.module(*inputs, **kwargs)

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        sd = self.module.state_dict(destination, prefix, keep_vars)
        return sd

    def load_state_dict(self, state_dict, strict=True):
        return self.module.load_state_dict(state_dict, strict=strict)

    def named_parameters(self, prefix: str = '', recurse: bool = True):
        return self.module.named_parameters(prefix=prefix, recurse=recurse)