o
    i                     @  sn   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z d dlZd dlmZ G dd dZdS )    )annotationsN)DistributedDataParallelc                   @  s   e Zd ZdZd9d:dd	Zd;d
dZdd Zdd Zdd Ze	j
jfd<ddZd=ddZejdd Zedd Zdd Zd>d!d"Zd?d%d&Zd'd( Zd)d*d+dd,d@d5d6ZedAd7d8Zd+S )BAcceleratora  
    Simplified accelerator that mirrors the behaviour of the minicpm-audio
    training utilities. It initializes a distributed process group when
    ``torchrun`` is used and exposes helpers for AMP, gradient scaling and
    preparing models/dataloaders for DDP.
    F*   ampboolseedintc                 C  s   t tdd| _| jdkrt stjddd t r!t nd| _t tj	
dd	| _|| _| | G d
d d}|rJtj rJtjdn| | _tj rZtj| jnd | _d | _d S )N
WORLD_SIZE1   ncclzenv://)init_methodr   
LOCAL_RANK0c                   @  s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
z)Accelerator.__init__.<locals>.DummyScalerc                 S  s   |   d S N)stepself	optimizer r   O/home/ubuntu/.local/lib/python3.10/site-packages/voxcpm/training/accelerator.pyr   %   s   z.Accelerator.__init__.<locals>.DummyScaler.stepc                 S     |S r   r   r   lossr   r   r   scale(      z/Accelerator.__init__.<locals>.DummyScaler.scalec                 S  r   r   r   r   r   r   r   unscale_+   r   z2Accelerator.__init__.<locals>.DummyScaler.unscale_c                 S  s   d S r   r   r   r   r   r   update.   r   z0Accelerator.__init__.<locals>.DummyScaler.updateN)__name__
__module____qualname__r   r   r   r   r   r   r   r   DummyScaler$   s
    r#   cuda)r	   osgetenv
world_sizedistis_initializedinit_process_groupget_rankrankenvironget
local_rankr   	_set_seedtorchr$   is_available
GradScalerscalerdevice
device_ctx
_ddp_model)r   r   r   r#   r   r   r   __init__   s   
"
zAccelerator.__init__c                 C  s>   t | tj| t| t j rt j| dS dS )zOSet random seed to ensure model initialization consistency across multiple GPUsN)r1   manual_seednprandomr   r$   r2   manual_seed_all)r   r   r   r   r   r0   7   s   


zAccelerator._set_seedc                 C  s   | j d ur
| j   | S r   )r6   	__enter__r   r   r   r   r=   ?   s   

zAccelerator.__enter__c                 C  s"   | j d ur| j ||| d S d S r   )r6   __exit__)r   exc_type	exc_value	tracebackr   r   r   r>   D   s   
zAccelerator.__exit__c                 C  s   t  r
t   dS dS )zSynchronize all processesN)r(   r)   barrierr   r   r   r   rB   H   s   zAccelerator.barriertensortorch.Tensorc                 C  s   t  rt j||d |S )z"All-reduce tensor across processes)op)r(   r)   
all_reduce)r   rC   rE   r   r   r   rF   M   s   zAccelerator.all_reducemodeltorch.nn.Modulec                 K  sX   t |dr	| j|_|| j}| jdkr*tjj|}t|fd| j	gi|}|| _
|S )Nr5   r   
device_ids)hasattrr5   tor'   r1   nnSyncBatchNormconvert_sync_batchnormr   r/   r7   )r   rG   kwargsr   r   r   prepare_modelV   s   

zAccelerator.prepare_modelc                 c  sL    | j dur!| j   dV  W d   dS 1 sw   Y  dS dV  dS )z
        Context manager to skip gradient synchronization during gradient accumulation.
        Only used outside the last micro-batch.
        N)r7   no_syncr   r   r   r   rQ   `   s   
"
zAccelerator.no_syncc                 C  s8   t j rt d| jS t jj rt dS t dS )Nr$   mpscpu)r1   r$   r2   r5   r/   backendsrR   r   r   r   r   r5   l   s
   


zAccelerator.devicec                 O  s    t jjdg|R d| ji|S )Nr$   enabled)r1   r   autocast)r   argsrO   r   r   r   rV   w   s    zAccelerator.autocastr   c                 C  s   | j |  d S r   )r4   r   backwardr   r   r   r   rX   z   s   zAccelerator.backwardr   torch.optim.Optimizerc                 C  s   | j | d S r   )r4   r   r   r   r   r   r   }   s   zAccelerator.stepc                 C  s   | j   d S r   )r4   r   r   r   r   r   r      s   zAccelerator.updater   TN)num_workersshuffle
collate_fn	drop_lastdatasettyping.Iterable
batch_sizerZ   r[   r]   returntorch.utils.data.DataLoaderc             
   C  sZ   | j dkrtjjjj|| j | j|d}d}nd }tjjj|||d u r$|nd||||ddS )Nr   )num_replicasr,   r[   FT)r`   r[   samplerrZ   r\   r]   
pin_memory)r'   r1   utilsdatadistributedDistributedSamplerr,   
DataLoader)r   r^   r`   rZ   r[   r\   r]   rd   r   r   r   prepare_dataloader   s    


zAccelerator.prepare_dataloaderc                 C  s   t | dr| jS | S )Nmodule)rJ   rl   )rG   r   r   r   unwrap   s   zAccelerator.unwrap)Fr   )r   r   r   r	   )r   r	   )rC   rD   )rG   rH   )r   rD   )r   rY   )r^   r_   r`   r	   rZ   r	   r[   r   r]   r   ra   rb   )rG   rH   ra   rH   )r    r!   r"   __doc__r8   r0   r=   r>   rB   r(   ReduceOpAVGrF   rP   
contextlibcontextmanagerrQ   propertyr5   rV   rX   r   r   rk   staticmethodrm   r   r   r   r   r      s0    
 
	





r   )
__future__r   rq   r%   r;   typingnumpyr:   r1   torch.distributedrh   r(   torch.utils.datatorch.nn.parallelr   r   r   r   r   r   <module>   s    