o
    i/i                     @   s|  d dl Z d dlZd dlZd dlZd dlmZmZmZmZ d dl	m
Z
 d dlmZmZ d dlZd dlmZ d dlmZmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZm Z  	 dd Z!d#ddZ"dd Z#d#ddZ$dd Z%dd Z&d$ddZ'G dd de
j(Z)G dd de
j(Z*G dd deZ+G d d! d!e
j(Z,e
-e) e
-e* e
-e+ e.d"kre/  dS dS )%    N)
DeviceMeshDTensor	ReplicateShard)common_utils)DTensorTestBase
with_comms)AOBaseConfig)AffineQuantizedTensorto_affine_quantized_intx)Int8WeightOnlyConfig	quantize_)MappingType)_QUANTIZE_CONFIG_HANDLER)LlamaModelsLlama4Experts)DummyModuleget_compute_capabilityget_current_accelerator_devicec                    s   dd l  fdd}|S )Nr   c                    s    fdd}|S )Nc                     s(   t   k rd  | i |S )Nz Compute capability is less than )r   SkipTestargskwargs)min_capability	test_funcunittest I/home/ubuntu/.local/lib/python3.10/site-packages/torchao/testing/utils.pywrapper>   s
   
zHskip_if_compute_capability_less_than.<locals>.decorator.<locals>.wrapperr   r   r   r   r   r   r   	decorator=   s   z7skip_if_compute_capability_less_than.<locals>.decoratorr   )r   r!   r   r   r   $skip_if_compute_capability_less_than:   s   
r#   c                    s2   ddl  fdd}t r }d ||S |S )zDecorator to skip tests on ROCm platform with custom message.

    Args:
        message (str, optional): Additional information about why the test is skipped.
    r   Nc                    s   t   fdd}|S )Nc                     s:   t jjd urd}r|d 7 }|  | i |S )NzSkipping the test in ROCm: )torchversionhipskipr   r   skip_message)funcmessagepytestr   r   r   S   s   
z0skip_if_rocm.<locals>.decorator.<locals>.wrapper	functoolswrapsr+   r   r,   r-   r+   r   r!   R   s   zskip_if_rocm.<locals>.decorator)r-   callabler,   r!   r+   r   r2   r   skip_if_rocmJ   s   r6   c                     sD   zdd l d W n ty   d dd lY nw  fdd} | S )Nr   TFc                    s    t   fdd}|S )Nc                     s6   t j sd}r| n|  | i |S )NzNo XPU availabler%   xpuis_availabler(   r)   )r+   
has_pytestr-   r   r   r   r   p   s   

z2skip_if_no_xpu.<locals>.decorator.<locals>.wrapperr.   r1   r:   r-   r   r3   r   r!   o   s   	z!skip_if_no_xpu.<locals>.decorator)r-   ImportErrorr   r!   r   r;   r   skip_if_no_xpuf   s   r>   c                    s^   zddl d W n ty   d ddlY nw  fdd}tr-}d||S |S )z
    Decorator to skip tests on XPU platform with custom message.

    Args:
        message (str, optional): Additional information about why the test is skipped.
    r   NTFc                    s"   t   fdd}|S )Nc                     sH   t j rd}r|d 7 }r| n|  | i |S )NzSkipping the test in XPUr$   r7   r)   )r+   r:   r,   r-   r   r   r   r      s   

z/skip_if_xpu.<locals>.decorator.<locals>.wrapperr.   r1   r:   r,   r-   r   r3   r   r!      s   zskip_if_xpu.<locals>.decorator)r-   r<   r   r4   r5   r   r?   r   skip_if_xpu   s   r@   c                        dd l   fdd} | S )Nr   c                       t   fdd}|S )Nc                     s"   t j s
d | i |S )NzNo cuda available)r%   cudar9   r   r   r   r   r   r   r      s   

z3skip_if_no_cuda.<locals>.decorator.<locals>.wrapperr.   r   r"   r    r   r!      s   z"skip_if_no_cuda.<locals>.decoratorr"   r=   r   r"   r   skip_if_no_cuda   s   	rE   c                     rA   )Nr   c                    rB   )Nc                     s,   zdd l }W n   d | i |S )Nr   zNo gemlite available)gemliter   )r   r   rF   rD   r   r   r      s
   
z6skip_if_no_gemlite.<locals>.decorator.<locals>.wrapperr.   r   r"   r    r   r!      s   z%skip_if_no_gemlite.<locals>.decoratorr"   r=   r   r"   r   skip_if_no_gemlite   s   rG   c           
      C   s   | j  D ]S\}}|drXt||fdd}t|j |_ |d ur.t||r.t	|}|o4|
|}|d urM||jv rM|jrFtdntj	}	|	|}t|| d| | qd S )Ntest_c                 S   s   || S Nr   )selfvaluer   r   r   new_test   s   zcopy_tests.<locals>.new_testzSkipped!_)__dict__items
startswithr/   r0   copydeepcopyhasattrr   expectedFailuregetsuffixesis_skipr(   setattr)
my_cls	other_clssuffixtest_failures
xfail_propnamerK   rL   tf	skip_funcr   r   r   
copy_tests   s"   

ra   c                   @   s   e Zd Zdgej rdgng  Zejejej	gZ
eZeZejdejdZdZdd Zedeed	e
d
d Zedeededd Zedeed	e
dd Zedeed	e
dd ZdS )TorchAOBasicTestCasecpurC          mapping_type
block_sizetarget_dtype(   c                    sx   t dd}| j|fi | j   \}} fdd|D }  }  }| j||||}| 	 
 |
  d S )N      c                    s   i | ]}|t  |qS r   )getattr).0r^   	lp_tensorr   r   
<dictcomp>   s    z?TorchAOBasicTestCase.test_flatten_unflatten.<locals>.<dictcomp>)r%   randn
FACTORY_FNr   __tensor_flatten__sizestrideTENSOR_SUBCLASS__tensor_unflatten__assertEqual
dequantize)rJ   	hp_tensortensor_data_name_dicttensor_attributestensor_data_dict
outer_sizeouter_stridereconstructedr   rp   r   test_flatten_unflatten   s   
z+TorchAOBasicTestCase.test_flatten_unflattendevicedtypec                 C   s*   t jdd||d}| j|fi | j d S )Nrl   rm   r   r   )r%   rs   rt   r   )rJ   r   r   r|   r   r   r   test_hp_tensor_device_dtype  s   z0TorchAOBasicTestCase.test_hp_tensor_device_dtypedevice1device2c                 C   s   t jdd|t jd}| j|fi | j}|j|d t jdd|t jd}| j|fi | j}|| t jdd|t jd}| j|fi | j}|  t jdd|t jd}| j|fi | j}|  dS )zNote: this should be parametrized with device1 and device2
        e.g. device1 = ["cpu", "cuda"], device2 = ["cpu", "cuda"]
        rl   rm   r   )r   N)r%   rs   bfloat16rt   r   torC   rc   )rJ   r   r   r|   rq   r   r   r   test_device1_to_device2  s   
z,TorchAOBasicTestCase.test_device1_to_device2c                 C   s@   t jdd||d}| j|fi | j}| }| |jd d S )Nrl   rm   r   )rm   rl   )r%   rs   rt   r   trz   shape)rJ   r   r   r|   rq   r   r   r   test_transpose  s   z#TorchAOBasicTestCase.test_transposec                 C   sv   t jdd||d}| j|fi | j}t jdd||d}t jj||}t jj||}| tj	j
||| j d S )Nrl   rm   r   rf   )r%   rs   rt   r   nn
functionallinearassertGreatertorchaoquantizationutilscompute_errorLINEAR_MIN_SQNR)rJ   r   r   r|   rq   hp_act_tensorhp_reslp_resr   r   r   test_linear&  s   z TorchAOBasicTestCase.test_linearN)__name__
__module____qualname__r%   rC   r9   COMMON_DEVICESfloat32float16r   COMMON_DTYPESr
   rx   r   rt   r   
ASYMMETRICuint8r   r   r   r   parametrizer   r   r   r   r   r   r   r   rb      s.    







rb   c                   @   s   e Zd Zdgej rdgng  Zejejej	gZ
eZeZejdejdZdZdZedeede
d	d
 Zedeede
dd Zedeede
dd Zedeede
dd ZdS )TorchAOCompileTestCaserc   rC   rd   rg   rk   2   r   r   c                 C   sv   t jdd||d}| j|fi | j}dd }||}t |}||}| t||| j | |	 |	  d S )Nrl   rm   r   c                 S   s   | S rI   r   tensorr   r   r   fK  s   zCTorchAOCompileTestCase.test_input_output_tensor_subclass.<locals>.f)
r%   rs   rt   r   compile
assertTrue
isinstancerx   rz   r{   rJ   r   r   r|   rq   r   refcompiledr   r   r   !test_input_output_tensor_subclassE  s   
z8TorchAOCompileTestCase.test_input_output_tensor_subclassc                 C   sn   t jdd||d}| j|fi | j}dd }||}t |}||}| t||| j | || d S )Nrl   rm   r   c                 S   s   |   S rI   )r{   r   r   r   r   r   Z  s   z<TorchAOCompileTestCase.test_input_tensor_subclass.<locals>.f)	r%   rs   rt   r   r   assertFalser   rx   rz   r   r   r   r   test_input_tensor_subclassT  s   
z1TorchAOCompileTestCase.test_input_tensor_subclassc                    s   t jdd||d} fdd}||}t |}||} t|| j |t jkr? tj	j
| |  j d S d S )Nrl   rm   r   c                    s    j | fi  jS rI   )rt   r   )r|   rJ   r   r   r   h  s   z=TorchAOCompileTestCase.test_output_tensor_subclass.<locals>.f)r%   rs   r   r   r   rx   r   r   r   r   r   r   r{   COMPILE_MIN_SQNR)rJ   r   r   r|   r   r   r   r   r   r   test_output_tensor_subclassc  s   

z2TorchAOCompileTestCase.test_output_tensor_subclassc           	      C   s   t jdd||d}| j|fi | j}t jdd||d}t jj||}t jjddd||d}t j||_	t 
||}| tjj||| j d S )Nrl   rm   r   rf   F)biasr   r   )r%   rs   rt   r   r   r   r   Linear	Parameterweightr   r   r   r   r   r   r   )	rJ   r   r   r|   rq   r   r   r   r   r   r   r   test_linear_compilex  s   z*TorchAOCompileTestCase.test_linear_compileN)r   r   r   r%   rC   r9   r   r   r   r   r   r
   rx   r   rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   5  s.    







r   c                   @   s   e Zd ZdZejejejgZe	Z
eeZi ZedejjdedejjfddZedejjdedejjfddZdejjdejjfd	d
Zedeeeej  ddd ZdS )TorchAOTensorParallelTestCasez%Basic test case for tensor subclassesmmeshreturnc                 C   sn   | j j}|d|  }| }||| |d | ddf }t||tdg}tjj	|dd| j _| S )zH
        Shard linear layer of the model in column-wise fashion
        r   re   NFrequires_grad
r   r   rv   get_local_rankr   
from_localr   r%   r   r   )r   r   orig_weightn_local_rowsranklocal_sharddtensorr   r   r   colwise_shard      z+TorchAOTensorParallelTestCase.colwise_shardc                 C   sn   | j j}|d|  }| }|dd|| |d | f }t||tdg}tjj	|dd| j _| S )zE
        Shard linear layer of the model in row-wise fashion
        re   NFr   r   )r   r   r   n_local_colsr   r   r   r   r   r   rowwise_shard  r   z+TorchAOTensorParallelTestCase.rowwise_shardc                 C   s   t || jdi | j |S )z$
        Quantize the model
        Nr   )r   QUANT_METHOD_FNQUANT_METHOD_KWARGS)rJ   r   r   r   r   quantize  s   z&TorchAOTensorParallelTestCase.quantizer   zNeed CUDA availablec                 C   s  d}t d G dd dt jj}t d| jt j   }|dd||}|dd||}dt j	d	d||d
 }||| | 
|}| 
|}||| |  }	d|	_| ||	}
| ||	}t||	t g}||
| t |
}||}t |}|| d S )NrC      c                       s4   e Zd Zd fddZdejdejfddZ  ZS )	z0TorchAOTensorParallelTestCase.test_tp.<locals>.Mr   Nc                    s,   t  jdi | tjj||ddd| _d S )NFrC   )r   r   r   )super__init__r%   r   r   r   )rJ   in_featuresout_featuresr   	__class__r   r   r     s   z9TorchAOTensorParallelTestCase.test_tp.<locals>.M.__init__xc                 S   s
   |  |S rI   )r   )rJ   r   r   r   r   forward  s   
z8TorchAOTensorParallelTestCase.test_tp.<locals>.M.forward)r   N)r   r   r   r   r%   Tensorr   __classcell__r   r   r   r   M  s    r   zcuda:      d   rm   r   )r%   manual_seedr   Moduler   r   rC   device_countr   rs   r   build_device_meshdevice_typer   r   r   r   r   r   )rJ   r   r   r   proj_upproj_dnexample_inputup_quantdn_quantr   up_distdn_distinput_dtensorup_compiledy_updn_compiledr   r   r   test_tp  s*   




z%TorchAOTensorParallelTestCase.test_tpN)r   r   r   __doc__r%   r   r   r   r   r
   rx   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   skipIfrC   r9   r   r   r   r   r   r     s      
r   c                   @   sF   e Zd ZdefddZdd ZdefddZdefdd	Zd
d ZdS )TorchAOIntegrationTestCaseconfigc              	   C   s|  t j}t j sJ dt }t jjdd||d}t|| t jdd|t j}t jj	|j
d dt jdd||d  dd|_
t|| d}d	}d
D ]m}|| }	|j
}
|
j}|||	|}|jd }|j
}|||	|}t ||jd r{J || t |jd |jd sJ t|drt |j|jsJ t|drt |j|jsJ t|drt |j|jsJ qNd S )Nzno accelerator device foundr   r   g      ?   Fr   r   i   )r   re   scale
zero_pointscale_and_zero)r%   r   acceleratorr9   r   r   r   r   r   r   r   rs   datanarrowqdataequalcopy_rS   r   r   r   )rJ   r   r   r   ldummy_l
output_dim
shard_sizetp_rank	start_idxparam
param_dataorig_valuesloaded_weightr   r   r   $_test_slice_and_copy_similar_to_vllm  sL   






z?TorchAOIntegrationTestCase._test_slice_and_copy_similar_to_vllmc                    s  t j}d}dd }d}d}d}d}d}	t|||	||}
t|||	||}t||d|	 ||}t j|||||d	}|
|g}|D ]}|| t jj|jd
d dd|_t jj|j	d
d dd|_	t jj|j
d
d dd|_
||| ||}|jjd }t jj|jd
dd| dd|_|j	jd }t jj|j	d
dd| dd|_	|j
jd }t jj|j
d
dd| dd|_
t jj|jd|dfjdddd|_t jj|j	d|dfjdddd|_	t jj|j
d|dfjdddd|_
t jj|jd
d dd|_t jj|j	d
d dd|_	t jj|j
d
d dd|_
||}| || q=|
 | gdD ]/  fddD }td
t|D ]}|d j|| _t|| drx|d j|| _q^qLdtdt jffdd}i }dD ]	 | | < qt jj|jd
ddd|_t jj|j	d
ddd|_	t jj|j
d
ddd|_
|j|dd || dS )zThis is testing the op call sequence in saving and loading quantization
        checkpoints in llama-models for llama4
        (https://github.com/meta-llama/llama-models/tree/main/models/llama4)
        rC   c           
      S   sf   |   D ],\}}t|tsq|}dD ]}t||}tt| }t|}|||}	t|||	j qqd S )Nw1w2w3)	named_modulesr   r   rn   r   typer   rX   r   )
modelr   rM   moduleexpert_moduleweight_namer   config_handler	dummy_mod	quant_modr   r   r   _quantize_experts)  s   


zRTorchAOIntegrationTestCase._test_moe_weight_reshape_ops.<locals>._quantize_expertsrl   r   @   rm      )r   r   re   Fr   r   dimr  c                       g | ]}|  qS r   r   )ro   stkeyr   r   
<listcomp>      zKTorchAOIntegrationTestCase._test_moe_weight_reshape_ops.<locals>.<listcomp>r   r#  r   c                    s6    fddD } dkrt j|ddS t j|ddS )Nc                    r   r   r   )ro   sr"  r   r   r$    r%  z`TorchAOIntegrationTestCase._test_moe_weight_reshape_ops.<locals>.process_key.<locals>.<listcomp>r  r  r  r  )r%   cat)r#  tensors)state_dictsr"  r   process_key  s   zLTorchAOIntegrationTestCase._test_moe_weight_reshape_ops.<locals>.process_keyT)assignN)r%   r   r   rs   r   r   r  	transpose
contiguousr  r  r   reshape	unflattensqueezerz   
state_dictrangelenr   rS   r   strr   load_state_dict)rJ   r   r   r   r  
batch_sizenum_experts	input_dimr  
hidden_dimmoe1moe2moe_combinedinputmoesmoebeforenew_last_dimafterweightsir*  new_state_dictr   )r#  r)  r   _test_moe_weight_reshape_ops   s   

	z7TorchAOIntegrationTestCase._test_moe_weight_reshape_opsc           	      C   s   t j}t jjddd|d}t|| |j}|ddd}|jD ]"}t||}t||}t	|j
t	|j
ksAJ d|j
 d|j
 qd S )Nr   rC   r   re   r   zshape mismatch: z vs )r%   r   r   r   r   r   r   tensor_data_namesrn   r3  r   )	rJ   r   r   r  orignewdata_attr_name	orig_attrnew_attrr   r   r   _test_narrow_similar_to_vllm  s   



z7TorchAOIntegrationTestCase._test_narrow_similar_to_vllmc              	   C   sz   t j}t d t jjddd|d}W d    n1 sw   Y  t jt jdddd|d|_t|| |jd }d S )	Nmetar   rC   r   <   i   r   r   )	r%   r   r   r   r   r   rs   r   r   )rJ   r   r   r  _w_slicer   r   r   '_test_quantize_3d_param_similar_to_vllm  s   
zBTorchAOIntegrationTestCase._test_quantize_3d_param_similar_to_vllmc                 C   s<   |j d|d}tj||d}tjj| | ddd d S )Nr   r  r   )atolrtol)chunkr%   r'  testingassert_closer{   )rJ   	ao_tensorr  ao_tensor_chunkedao_tensor_unchunkedr   r   r   "_test_chunk_similar_to_vllm_llama4  s
   
z=TorchAOIntegrationTestCase._test_chunk_similar_to_vllm_llama4N)	r   r   r   r	   r  rF  rM  rQ  rZ  r   r   r   r   r     s    2 r   __main__rI   )NN)0rQ   r/   r   r%   torch.distributed._tensorr   r   r   r   torch.testing._internalr   :torch.testing._internal.distributed._tensor.common_dtensorr   r   r   torchao.core.configr	   torchao.dtypesr
   r   torchao.quantizationr   r   %torchao.quantization.quant_primitivesr   %torchao.quantization.transform_moduler   #torchao.testing.model_architecturesr   torchao.utilsr   r   r   r#   r6   r>   r@   rE   rG   ra   TestCaserb   r   r   r   instantiate_parametrized_testsr   mainr   r   r   r   <module>   sD   

&
PTd 
c

