o
    ãÊiE]  ã                   @   sÎ  d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlmZ d dlm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ ejdu rPd	e_d d
lmZ d dlmZ d dlmZmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* zd dl+Z,W n e-y—   dZ,Y nw zd dl.Z.W n e-y©   dZ.Y nw ej/j0dur·ej1ddd e)ƒ Z2G dd„ deƒZ3G dd„ deƒZ4dZ5G dd„ deƒZ6ee3ƒ ee4ƒ e7dkråeƒ  dS dS )é    N)ÚPath)Únn)ÚCPUOffloadPolicyÚOffloadPolicyÚfully_shard)Úcommon_utils)Úskip_if_lt_x_gpu)ÚFSDPTest)ÚTestCaseÚinstantiate_parametrized_testsÚparametrizeÚ	run_testsiÒ  )ÚVersion)Úoptim)Ú_fp32_to_bf16_srÚquantize_4bit_with_qmapÚquantize_8bit_with_qmap)ÚOptimState4bit)ÚOptimState8bit)ÚOptimStateFp8)Úskip_if_rocm)Úget_available_devicesÚtorch_version_at_leastzSkipping the test in ROCmT)Úallow_module_levelc                   @   s”   e Zd Zedeƒdd„ ƒZedeƒdd„ ƒZedeƒdd„ ƒZedeƒdd	„ ƒZedeƒed
ddgƒdd„ ƒƒZ	edeƒed
ddgƒdd„ ƒƒZ
dS )ÚTestQuantizeÚdevicec                 C   ó`   t jdd|d}t jd|d ¡ j}| d¡|  ¡  d¡ t j¡}t	||ƒ}t j
 ||¡ d S )Né    é   ©r   é   éÿÿÿÿ)ÚtorchÚrandÚsortÚvaluesÚ	unsqueezeÚabsÚargminÚtoÚuint8r   ÚtestingÚassert_close©Úselfr   ÚxÚqmapÚactualÚexpected© r3   úK/home/ubuntu/.local/lib/python3.10/site-packages/test/test_low_bit_optim.pyÚ(test_quantize_8bit_with_qmap_correctnessA   ó
    
z5TestQuantize.test_quantize_8bit_with_qmap_correctnessc                 C   óX   t jdd|d}t jd|d ¡ j}t jtdd}|||ƒ}t||ƒ}t j ||¡ d S )Nr   r   r   r    T©Ú	fullgraph)r"   r#   r$   r%   Úcompiler   r+   r,   ©r.   r   r/   r0   Ú
compiled_fr1   r2   r3   r3   r4   Ú$test_quantize_8bit_with_qmap_compileK   ó   

z1TestQuantize.test_quantize_8bit_with_qmap_compilec                 C   r   )Nr   r   r   é   r!   )r"   r#   r$   r%   r&   r'   r(   r)   r*   r   r+   r,   r-   r3   r3   r4   Ú(test_quantize_4bit_with_qmap_correctnessV   r6   z5TestQuantize.test_quantize_4bit_with_qmap_correctnessc                 C   r7   )Nr   r   r   r?   Tr8   )r"   r#   r$   r%   r:   r   r+   r,   r;   r3   r3   r4   Ú$test_quantize_4bit_with_qmap_compile`   r>   z1TestQuantize.test_quantize_4bit_with_qmap_compiler:   FTc                 C   st   t jd|dd }| dd¡ dd¡}t jtdd| d	}||ƒ}|jt ju s)J ‚t jj	| 
¡  d¡|d
d
d d S )Nr   r   éd   r!   é   é † TF©r9   ÚdynamicÚdisablegiUMuÿ>)ÚatolÚrtol)r"   r#   ÚviewÚrepeatr:   r   ÚdtypeÚbfloat16r+   r,   ÚfloatÚmean)r.   r   r:   r/   Úx_repÚfuncÚ
x_rep_bf16r3   r3   r4   Útest_bf16_stochastic_roundk   s   
ÿ"z'TestQuantize.test_bf16_stochastic_roundc                 C   s*  t  d¡ dd lm} ddlm} ddlm}m} d}| 	¡ r6| 
¡ s6| dddd	¡}|jd
|ddd d	}zVt tj¡ tjd|dd }	|	 dd¡ dd¡}
tjtd	d| d}||
ƒ}||dƒ}|j|
||ƒ gdd}||ƒ}t||ƒsyJ ‚tj | ¡ |¡ W |r‹| ¡  d S d S |r”| ¡  w w )Nztorch.distributedr   )Úinit_device_mesh)ÚDTensorÚ	ReplicateFz	127.0.0.1i<s  rC   TÚgloo)ÚbackendÚstoreÚrankÚ
world_sizer   r   rB   r!   rD   rE   )rC   )Ú	run_check)ÚpytestÚimportorskipÚtorch.distributedÚdistributedÚtorch.distributed.device_meshrT   Útorch.distributed.tensorrU   rV   Úis_availableÚis_initializedÚTCPStoreÚinit_process_groupr"   Úmanual_seedr   ÚSEEDr#   rJ   rK   r:   r   Ú
from_localÚ
isinstancer+   r,   Úto_localÚdestroy_process_group)r.   r   r:   ÚdistrT   rU   rV   Ú
created_pgrY   r/   rP   rQ   Ú	out_plainÚmeshÚx_dtÚout_dtr3   r3   r4   Ú"test_bf16_stochastic_round_dtensorz   s@   
ü
ÿ
ÿ
ÿz/TestQuantize.test_bf16_stochastic_round_dtensorN)Ú__name__Ú
__module__Ú__qualname__r   Ú_DEVICESr5   r=   r@   rA   rS   rs   r3   r3   r3   r4   r   @   s    
	


	

r   c                   @   sÎ  e Zd Zedg d¢ƒedejejgƒedeƒedƒdd„ ƒƒƒƒZ	edg d¢ƒedeƒd	d
„ ƒƒZ
edg d¢ƒedeƒdd„ ƒƒZedeeegƒedddgƒedeƒdd„ ƒƒƒZejjedu ddejjej ¡  ddedƒejjedƒddedddgƒdd„ ƒƒƒƒƒZejjedu ddejjej ¡  ddeddd gƒd!d"„ ƒƒƒZejjej ¡  o·ej ¡  d#ded$g d%¢ƒd&d'„ ƒƒZejjej ¡  oÔej ¡  d#dd(d)„ ƒZedeƒd*d+„ ƒZdS ),Ú	TestOptimÚ
optim_name)ÚAdam8bitÚ	AdamW8bitÚAdam4bitÚ	AdamW4bitÚAdamFp8ÚAdamWFp8rL   r   úROCm enablement in progressc                 C   s   |  d¡r|dkrtj ¡ dk rt d¡ t t dd¡t 	¡ t dd¡¡}|j
||d tt|ƒ| ¡ ƒ}tjdd||d}||ƒ ¡ }| ¡  | ¡  | ¡  t ¡ }t | ¡ |j¡ tj|jd	d
}	W d   ƒ n1 sqw   Y  t |¡}
tt|ƒ|
 ¡ ƒ}| |	¡ tdƒD ]+}tjdd||d}||ƒ ¡  ¡  | ¡  | ¡  |
|ƒ ¡  ¡  | ¡  | ¡  qt| ¡ |
 ¡ ƒD ]\}}tj ||¡ qÂd S )NÚFp8Úcuda©é   é	   ú+FP8 CUDA requires compute capability >= 8.9r   r    )r   rL   é   Úcpu©Úmap_locationé   ) Úendswithr"   r‚   Úget_device_capabilityr]   Úskipr   Ú
SequentialÚLinearÚReLUr)   Úgetattrr   Ú
parametersÚrandnÚsumÚbackwardÚstepÚ	zero_gradÚtempfileÚNamedTemporaryFileÚsaveÚ
state_dictÚnameÚloadÚcopyÚdeepcopyÚload_state_dictÚrangeÚzipr+   r,   )r.   ry   rL   r   ÚmodelÚ	optimizerr/   ÚlossÚfrœ   Úmodel2Úoptim2Ú_Úp1Úp2r3   r3   r4   Útest_optim_smoke£   s:   
"
þ


ÿzTestOptim.test_optim_smoke)rz   r|   r~   c              	   C   sÐ   |  d¡r|dkrtj ¡ dk rt d¡ t ¡ }t tj¡ zBt	 
t	 dd¡t	 ¡ t	 dd¡¡}|j|d tt|ƒ| ¡ ƒ}tjdd|d}||ƒ ¡ }| ¡  | ¡  | ¡  W t |¡ d S t |¡ w )	Nr   r‚   rƒ   r†   r   r    r   r‡   )rŒ   r"   r‚   r   r]   rŽ   Úget_default_dtypeÚset_default_dtyperM   r   r   r   r‘   r)   r’   r   r“   r”   r•   r–   r—   r˜   )r.   ry   r   Ú	old_dtyper¤   r¥   r/   r¦   r3   r3   r4   Útest_optim_default_dtype_bf16Ð   s   
"
z'TestOptim.test_optim_default_dtype_bf16c                 C   sÎ   |  d¡r|dkrtj ¡ dk rt d¡ t t dd¡t 	¡ t dd¡¡}|j
|d tt|d  ¡ ƒd	d
tt|d  ¡ ƒdd
g}tt|ƒ|ƒ}tjdd|d}||ƒ ¡ }| ¡  | ¡  | ¡  d S )Nr   r‚   rƒ   r†   r   r    r   r   g-Cëâ6?)ÚparamsÚlrr‹   çñhãˆµøä>r‡   )rŒ   r"   r‚   r   r]   rŽ   r   r   r   r‘   r)   ÚdictÚlistr“   r’   r   r”   r•   r–   r—   r˜   )r.   ry   r   r¤   Úparam_groupsr¥   r/   r¦   r3   r3   r4   Útest_param_groupsè   s   
"þzTestOptim.test_param_groupsÚsubclassÚshape)i   )r    r    c                 C   sš   |t kr|dkrtj ¡ dk rt d¡ |j||d}|d d }tj | 	¡ d |… |d |…  	¡ ¡ tj | 	¡ ||d … |||d …  	¡ ¡ d S )Nr‚   rƒ   r†   r   r   r‹   )
r   r"   r‚   r   r]   rŽ   Úzerosr+   r,   Ú
dequantize)r.   r¹   rº   r   ÚtensorÚoffsetr3   r3   r4   Útest_subclass_slice  s   
ÿþzTestOptim.test_subclass_sliceNzbitsandbytes is not available)Úreasonz+bitsandbytes 8-bit Adam only works for CUDAz2.7.0zFailing in CIrz   r{   c                 C   s  d}t  t  dd¡t  ¡ t  dd¡¡}| |¡ t |¡}ttj	ƒtdƒkr(dnd}t
tj|ƒ| ¡ ƒ}t
t|ƒ| ¡ |d}td	ƒD ].}tjd
d|d}	||	ƒ ¡ }
|
 ¡  | ¡  | ¡  ||	ƒ ¡ }| ¡  | ¡  | ¡  qCt| ¡ | ¡ ƒD ]\}}tjj||ddd q{d S )Nr‚   r   r   é€   z0.44.0r    i   )Ú
block_sizer‹   r‡   r   r´   ©rI   rH   )r   r   r   r‘   r)   rŸ   r    r   ÚbnbÚ__version__r’   r   r“   r¢   r"   r”   r•   r–   r—   r˜   r£   r+   r,   )r.   ry   r   Úmodel1r¨   rÂ   Úoptim1r©   rª   r/   Úloss1Úloss2r«   r¬   r3   r3   r4   Útest_optim_8bit_correctness  s(   "


ÿz%TestOptim.test_optim_8bit_correctnesszlpmm is not availablez#lpmm 4-bit Adam only works for CUDAr|   r}   c                 C   s.  d}t  t  dd¡t  ¡ t  dd¡¡}| |¡ t |¡}|dkr,tjj	| 
¡ dd}n|dkr9tj 	| 
¡ ¡}ntd	|› d
ƒ‚tt|ƒ| 
¡ ƒ}tdƒD ].}tjdd|d}||ƒ ¡ }	|	 ¡  | ¡  | ¡  ||ƒ ¡ }
|
 ¡  | ¡  | ¡  qNt| 
¡ | 
¡ ƒD ]\}}tjj||ddd q†d S )Nr‚   r   r   rÁ   r|   r   )Úweight_decayr}   zUnsupported z optimizer for lpmmr‹   r‡   r   r´   rÃ   )r   r   r   r‘   r)   rŸ   r    Úlpmmr   ÚAdamWr“   Ú
ValueErrorr’   r¢   r"   r”   r•   r–   r—   r˜   r£   r+   r,   )r.   ry   r   rÆ   r¨   rÇ   r©   rª   r/   rÈ   rÉ   r«   r¬   r3   r3   r4   Útest_optim_4bit_correctness<  s.   "


ÿz%TestOptim.test_optim_4bit_correctnessz&optim CPU offload requires CUDA or XPUzoffload_grad,grad_accum))FrC   )Fr‹   )TrC   c                 C   sº  t d }t t dd¡t ¡ tjddddt ¡ tjddddt ¡ tjdddd¡}| |¡ |d  d	¡ t |¡}t	j
 | ¡ ¡}t
j| ¡ t	j
j|d
}t	j
j |d¡}t	j
j |d¡}	t	j|d}
|
 d¡ tdƒD ]&}t|ƒD ]}t	jdd||
d}||ƒ ¡  ¡  qu| ¡  | ¡  | ¡  qo|
 d¡ tdƒD ]&}t|ƒD ]}t	jdd||
d}||ƒ ¡  ¡  q¥| ¡  | ¡  |	 ¡  qŸt| ¡ | ¡ ƒD ]\}}t	j ||¡ qÏd S )Nr!   r   i   é@   T©ÚbiasrÁ   r‹   F)Úoffload_gradientsrB   r   é*   r‡   )r   Ú	generator)rw   r   r   r   r‘   r)   Úrequires_grad_rŸ   r    r"   r   rÍ   r“   ÚCPUOffloadOptimizerÚlr_schedulerÚCosineAnnealingLRÚ	Generatorrg   r¢   r”   r•   r–   r—   r˜   r£   r+   r,   )r.   Úoffload_gradÚ
grad_accumr   rÆ   r¨   rÇ   r©   Ú
scheduler1Ú
scheduler2Úrngrª   r/   r«   r¬   r3   r3   r4   Ú"test_optim_cpu_offload_correctness`  sR   
ù
	
ý



ÿz,TestOptim.test_optim_cpu_offload_correctnessc              	   C   sŒ  t d }t tjddddt ¡ tjdddd¡}| |¡ t | ¡ t	jj
¡}tdƒD ]}t	jdd|d	}||ƒ ¡  ¡  | ¡  | ¡  q,t ¡ }t	 | ¡ |j¡ t	j|jd
d}W d   ƒ n1 sgw   Y  t |¡}t | ¡ t	jj
¡}	|	 |¡ tdƒD ]*}t	jdd|d	}||ƒ ¡  ¡  | ¡  | ¡  ||ƒ ¡  ¡  |	 ¡  |	 ¡  q„t| ¡ | ¡ ƒD ]\}
}t	j ||
¡ q¸d S )Nr!   r   r   TrÑ   rÁ   r‹   r‡   r   rˆ   r‰   )rw   r   r   r   r‘   r)   r   r×   r“   r"   rÍ   r¢   r”   r•   r–   r—   r˜   r™   rš   r›   rœ   r   rž   rŸ   r    r¡   r£   r+   r,   )r.   r   rÆ   rÇ   rª   r/   Úfilerœ   r¨   r©   r«   r¬   r3   r3   r4   Ú test_optim_cpu_offload_save_load¡  s:   "ÿ


þ


ÿz*TestOptim.test_optim_cpu_offload_save_loadc           	   	      s(  t  d¡ t t dd¡t ¡ t dd¡¡}| |¡ t |¡ 	¡ }t j
j| ¡ dd}t
j| ¡ ddd}t jd	d|d
}tdƒD ]O‰ t j|t j	d ||ƒ}W d   ƒ n1 s[w   Y  | ¡ }| ¡  | ¡  | ¡  || 	¡ ƒ ¡ }| ¡  | ¡  | ¡  t jj||‡ fdd„d qBd S )Niè  r   r   rÁ   r´   ©r³   T)r³   Úbf16_stochastic_roundr‡   r   é   )rL   c                    s   dˆ › d| › S )Nz
Iteration z. r3   ©Úmsg©Úidxr3   r4   Ú<lambda>ð  s    zHTestOptim.test_optim_bf16_stochastic_round_correctness.<locals>.<lambda>ræ   )r"   rg   r   r   r   r‘   r)   rŸ   r    rM   r   rÍ   r“   Ú_AdamWr”   r¢   Úautocastr•   r–   r—   r˜   r+   r,   )	r.   r   rÆ   r¨   rÇ   r©   r/   rÈ   rÉ   r3   rè   r4   Ú,test_optim_bf16_stochastic_round_correctnessÍ  s6   
"
ý
ÿÿñz6TestOptim.test_optim_bf16_stochastic_round_correctness)rt   ru   rv   r   r"   Úfloat32rM   rw   r   r­   r±   r¸   r   r   r   r¿   r]   ÚmarkÚskipifrÄ   r‚   rc   r   rÊ   rÌ   rÏ   Úxpurà   râ   rí   r3   r3   r3   r4   rx   ¢   sb    þ&
þÿÿþ<þ
(rx   r‹   c                   @   sV   e Zd Zedefdd„ƒZeeƒedƒdd„ ƒƒZ	dd„ Z
eeƒedƒd	d
„ ƒƒZdS )Ú	TestFSDP2Úreturnc                 C   s   t S )N)Ú_FSDP_WORLD_SIZE)r.   r3   r3   r4   r[   ø  s   zTestFSDP2.world_sizer€   c                 C   sP   t jtft jtft jtfg}tj ¡ dkr| t j	tf¡ |  
d|i| j¡ d S )Nrƒ   Úargs)r   r{   r   r}   r   r"   r‚   r   Úappendr   Úrun_subtestsÚ_test_fsdp2)r.   Ú	args_listr3   r3   r4   Ú
test_fsdp2ü  s   ýþzTestFSDP2.test_fsdp2c           $      C   s†  dd l m} dd lm  m} dd lm  m} ddlm} ddl	m
}m}m} |\}	}
d}d}d}|ddd||dd	}t d
¡ t d¡ ||ƒ}W d   ƒ n1 sUw   Y  |	| ¡ dd}t |¡}| ¡ D ]}t||ƒrxt||
d qkt||
d |	| ¡ dd}t d
| j d ¡ tdƒD ]Y}tjd|||fdd}|j|d dkd ||ƒ ¡ }| ¡  | ¡  |j|d dkd ||ƒ ¡ }| ¡  | ¡ D ]}|jd urã|j|j|jj d qÒ| ¡  |  !||¡ q•|j"d d d }|j#| d }|j"d d d }|j#| d }| $¡ }|  !| %¡ | %¡ ¡ d|	j&› }t'|ƒ (¡ r/t) *|¡ |j+| ,¡ |d |	| ¡ dd}| ¡ D ]	} t -| ¡| _qD| ¡  |j.| ,¡ |d | /¡ dkrgt) *|¡ t0t1t2f}!t3| 4| ,¡ ¡| 4| ,¡ ¡ƒD ]E\}"}#|"j5|#j5ksŽJ |"j5|#j5fƒ‚t|"|ƒr«|" 6¡ }"|# 6¡ }#|"j5|#j5ks«J |"j5|#j5fƒ‚t|"|!ƒr¹|" %¡ }"|# %¡ }#|  !|"|#¡ q{d S )Nr   )rU   )Ú	ModelArgsÚTransformerÚTransformerBlocké   r   rÐ   r‡   é   )Ún_layersÚn_headsÚdimÚ
vocab_sizeÚmax_seq_lenÚ	dropout_prÔ   r‚   g{®Gáz„?rã   )Úoffload_policyrC   rå   r   r‹   )Úset_to_none)Úopr²   Úexp_avgÚ_fsdp_low_bit_optim_)Úcheckpoint_id)7r_   r`   Útorch.distributed.checkpointÚ
checkpointÚtorch.utils._pytreeÚutilsÚ_pytreerb   rU   Ú:torch.testing._internal.distributed._tensor.common_dtensorrû   rü   rý   r"   rg   r   r“   rŸ   r    Úmodulesrj   r   rZ   r¢   Úrandintr˜   rO   r–   r—   ÚgradÚ
all_reduceÚReduceOpÚAVGÚassertEqualr·   ÚstateÚfull_tensorr¼   rt   r   ÚexistsÚshutilÚrmtreer›   rœ   Ú
zeros_likerž   Úget_rankr   r   r   r£   Ú	tree_iterÚ	__class__rk   )$r.   rõ   rm   ÚdcpÚpytreerU   rû   rü   rý   Ú	optim_clsr  Ú
batch_sizer  Úseq_lenÚ
model_argsÚ
base_modelÚ
base_optimÚ
fsdp_modelÚmÚ
fsdp_optimÚiter_idxÚinpÚ	fsdp_lossÚ	base_lossÚparamÚ
base_paramÚbase_exp_avgÚ
fsdp_paramÚfsdp_exp_avgÚfull_fsdp_exp_avgr  Úresumed_fsdp_optimÚpÚ
subclassesÚv1Úv2r3   r3   r4   rø     sš   ú

ÿ

€
€


þôzTestFSDP2._test_fsdp2c                 C   s   d}t d d }tj||dd}|jjd t  dksJ ‚t|ƒ t | ¡ ¡}t	dƒD ]}t
jd|dd}||ƒ ¡  ¡  | ¡  | ¡  q+d S )Nrÿ   r?   rC   r‚   r   r   r‹   )rô   r   r   Úweightrº   r   r   r{   r“   r¢   r"   r”   r•   r–   r—   r˜   )r.   Úin_dimÚout_dimr¤   r¥   rª   Úinputsr3   r3   r4   Útest_uneven_shardo  s   
üzTestFSDP2.test_uneven_shardN)rt   ru   rv   ÚpropertyÚintr[   r   rô   r   rú   rø   r@  r3   r3   r3   r4   rò   ÷  s    brò   Ú__main__)8rŸ   r  r™   Úpathlibr   r]   r"   r   Ú"torch.distributed._composable.fsdpr   r   r   Útorch.testing._internalr   Ú*torch.testing._internal.common_distributedr   Ú#torch.testing._internal.common_fsdpr	   Ú$torch.testing._internal.common_utilsr
   r   r   r   rh   Úpackaging.versionr   Útorchaor   Útorchao.optim.quant_utilsr   r   r   Útorchao.optim.subclass_4bitr   Útorchao.optim.subclass_8bitr   Útorchao.optim.subclass_fp8r   Útorchao.testing.utilsr   Útorchao.utilsr   r   ÚbitsandbytesrÄ   ÚImportErrorrÌ   ÚversionÚhiprŽ   rw   r   rx   rô   rò   rt   r3   r3   r3   r4   Ú<module>   s^   
ÿÿb  T 
ÿ