o
    3wiI                     @   s  d dl mZ d dlZd dlm  mZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZmZ d dlmZ d dlmZmZ d d	lmZmZ d
d Zd#ddZd$ddZdd Zdd Zdd Zd%ddZ	d%ddZ dd Z!dd Z"d d! Z#e$d"kre"  dS dS )&    )deepcopyN)AdamW)LambdaLR)
DataLoader)AcceleratorDataLoaderConfigurationGradientAccumulationPlugin)GradientState)RegressionDatasetRegressionModel)DistributedTypeset_seedc              	   K   s   t |  | D ]G\}}|jsq	|s2tj|j|jfi |du s1J d| d|j d|j dq	tj|j|jfi |du sPJ d| d|j d|j dq	d S )	NF7Gradients in sync when they should not be at iteration z:
model_a grad (z) == model_b grad ()T7Gradients not in sync when they should be at iteration z) != model_b grad ()zip
parametersrequires_gradtorchallclosegrad)model_amodel_bdid_step	iterationkwargsparam
grad_param r   d/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/accelerate/test_utils/scripts/test_sync.pycheck_model_parameters   s   r    Tc                 C   sL   |    | |}t|||j}|s||j }|  d S || d S N)trainFmse_losstodevicegradient_accumulation_stepsbackward)modelinputtargetacceleratordo_backwardoutputlossr   r   r   
step_model-   s   
r0   Fc           	      C   s   t d t }t|}tdd}t|dd}|| j |r>t| dd}t| dd}t	|dd	 d
}t	|dd	 d
}|rM| 
||||\}}}}n| 
||\}}|r`|||||||fS |||fS )z3Returns everything needed to perform basic training*   P   length   
batch_sizeMbP?)paramslrc                 S      | d S Ng?r   epochr   r   r   <lambda>C       z$get_training_setup.<locals>.<lambda>)	lr_lambdac                 S   r;   r<   r   r=   r   r   r   r?   D   r@   )r   r   r   r
   r   r%   r&   r   r   r   prepare)	r,   schedr)   	ddp_modeldset
dataloaderoptddp_opt	ddp_schedr   r   r   get_training_setup8   s"   

rJ   c              	   C   s>  t | \}}}tt| \}}tdD ]}| ||f\}}|| j|| j}}t||||  |d dkrW| 	| t||||  W d    n1 sQw   Y  nt||||  t
||d| t| | D ]\}	}
|	jsvqnt|	j|
jsJ d|	j d|
j dqntd|  |tt| }qd S )	N      r   T7Gradients not in sync when they should be:
Model grad () != DDP grad (r   9  )rJ   nextitervaluesrangegatherr%   r&   r0   no_syncr    r   r   r   r   r   r   manual_seedrandpermlenr,   r)   rD   rF   	ddp_input
ddp_targetr   r*   r+   r   	ddp_paramr   r   r   test_noop_syncO   s,   r]   c              	   C   sp  t | \}}}tt| \}}tdD ]}| ||f\}}|| j|| j}}t||||  |d dkrW| 	| t||||  W d    n1 sQw   Y  nt||||  t
| | D ]=\}	}
|	jsoqg|d dkrt|	j|
jdu sJ d|	j d|
j dqgt|	j|
jdu sJ d	|	j d
|
j dqgtd|  |tt| }qd S )NrK   rL   r   F7Gradients in sync when they should not be:
Model grad () == DDP grad (r   TrM   rN   rO   )rJ   rP   rQ   rR   rS   rT   r%   r&   r0   rU   r   r   r   r   r   r   rV   rW   rX   rY   r   r   r   test_distributed_syncq   s2   r`   c              	   C   s  t | \}}}g }d}t|D ]Q}tt| \}}| ||f\}	}
|	| j|
| j}	}
t||	|
|  | 	| ||}t
|||j}|| W d    n1 s[w   Y  qt|D ]~}|| }||d k r| | t| | D ]\}}|jsqt|j|jdu sJ d|j d|j dqqe| | | | W d    n1 sw   Y  t| | D ]\}}|jsqt|j|jdu sJ d|j d	|j dqqed S )
NrK      Fr^   r_   r   TrM   rN   )rJ   rS   rP   rQ   rR   rT   r%   r&   r0   rU   r#   r$   appendr(   r   r   r   r   r   r   trigger_sync_in_backward)r,   r)   rD   rF   lossesnum_iterationsr   rZ   r[   r*   r+   
ddp_outputr/   r   r\   r   r   r   "test_distributed_sync_multiple_fwd   sH   

rg   c              
   C   s  t d|d}t| |d}t||d}t|\}}}t|D ]\}	}
|
 \}}|||f\}}||j||j}}t	||||d |
| t	|||| W d    n1 s\w   Y  t| | D ]O\}}|jsrqj|	d d dks|	t|d ks|rt|j|jdu sJ d	|	 d
|j d|j dqjt|j|jdu sJ d|	 d
|j d|j dqjtd|	  |tt| }qt  d S )NrL   	num_stepssync_each_batchsplit_batchesdispatch_batchesdataloader_configgradient_accumulation_pluginFra   r   Tr   z:
Model grad (rN   r   r   r_   rO   )r   r   r   rJ   	enumeraterR   rT   r%   r&   r0   
accumulater   r   r   rX   r   r   r   rV   rW   r	   _reset_state)rl   rm   rj   rp   ro   r,   r)   rD   rF   r   batchrZ   r[   r*   r+   r   r\   r   r   r   test_gradient_accumulation   s8   $ru   c              	   C   s  t d|d}t| |d}t||d}t|d\}}}}	}
}}t|	D ]\}}| \}}|||f\}}||j||j}}|	  |
	  t
||||d |  |d d dksf|d t|	kry| rm|  nt|jD ]}|  qr||
 t
|
||| |  |  W d    n1 sw   Y  |jd d	 |jd d	 ksJ d
|jd d	  d|jd d	  d|d d dkp|d t|	k}|jdkrt||
|p||dd |r|  |  td|  q"t  d S )NrL   rh   rk   rn   TFra   r   r:   z:Learning rates found in each optimizer did not align
opt: z

DDP opt: 
r8   )rtolrO   )r   r   r   rJ   rq   rR   rT   r%   r&   r"   r0   steprX   rS   num_processesrr   param_groupsr    	zero_gradr   rV   r	   rs   )rl   rm   rj   rp   ro   r,   r)   rG   rC   rF   rD   rH   rI   r   rt   rZ   r[   r*   r+   _r   r   r   r   1test_gradient_accumulation_with_opt_and_scheduler   sT    


$ 
r}   c                  C   s  t  } tdd}t|dd}tdd}t|dd}| ||\}}| jjd u s)J t|D ]S\}}t| jjt|ks=J |t|d k rz| jj	rKJ |dkryt|D ]%\}}t| jjt|kscJ |t|d k rr| jj	rqJ qS| jj	sxJ qSq-| jj	sJ q-| jjd u sJ d S )Nr2   r3   r5   r6   `   ra   )
r   r
   r   rB   gradient_stateactive_dataloaderrq   idrX   end_of_dataloader)r,   
first_dsetfirst_dataloadersecond_dsetsecond_dataloaderr   r|   	batch_numr   r   r   test_dataloader_break2  s(   

r   c                  C   s  t  } | j}|jdkrtd |jtjkrt  |jtjkr+|jdkr'td t	|  |jtj
tjtjtjtjtjtjfv rX|jdkrGtd t|  |jdkrTtd t|  |jtj
tjtjtjtjtjfv rdD ]&}dD ]!}dD ]}|jdkrtdd| d	| d
| d t||| qsqoqk|jdkrtdd t  |jtj
tjtjtjtjtjfv rdD ]-}dD ](}dD ]#}|s|s|sq|jdkrtdd| d	| d
| d t||| qqq|  d S )Nr   zA**Test `accumulate` gradient accumulation with dataloader break**z'**Test NOOP `no_sync` context manager**z.**Test Distributed `no_sync` context manager**zE**Test Distributed `no_sync` context manager with multiple forwards**)TFz+**Test `accumulate` gradient accumulation, z`split_batches=z` and `dispatch_batches=z` and `sync_each_batch=z`**zH**Test `accumulate` gradient accumulation with optimizer and scheduler, zJ`split_batches=False`, `dispatch_batches=False`, `sync_each_batch=False`**)r   statelocal_process_indexprintdistributed_typer   XLAr   NOr]   	MULTI_GPU	MULTI_NPU	MULTI_MLU
MULTI_SDAA
MULTI_MUSA	MULTI_CPU	MULTI_HPUr`   rg   ru   r}   destroy_process_group)r,   r   split_batchrm   rj   r   r   r   mainK  s   


	





r   c                 C   s
   t   d S r!   )r   )indexr   r   r   _mp_fn  s   
r   __main__)T)F)FFF)%copyr   r   torch.nn.functionalnn
functionalr#   torch.optimr   torch.optim.lr_schedulerr   torch.utils.datar   accelerate.acceleratorr   r   r   accelerate.stater	   accelerate.test_utilsr
   r   accelerate.utilsr   r   r    r0   rJ   r]   r`   rg   ru   r}   r   r   r   __name__r   r   r   r   <module>   s2   

"(
6*
:I
