o
    i                  	   @   s  d dl Z d dlmZ d dlZd dlmZ d dlm  mZ d dl	m
Z
mZmZ d dlmZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ G dd dej Z!G dd dej Z"			ddedeeef de#de#fddZ$dS )    N)Union)	ReplicateSharddistribute_tensor)
DeviceMesh)ColwiseParallelPrepareModuleInputRowwiseParallelparallelize_module)Float8LinearConfig)convert_to_float8_training)Float8ColwiseParallelFloat8RowwiseParallelPrepareFloat8ModuleInput)MXLinearConfig)	quantize_c                       s(   e Zd ZdZ fddZdd Z  ZS )FeedForwardzMLP based modelc                    sT   t t|   tj||d dd| _tj||d dd| _tj|d |dd| _d S )N   F)bias)superr   __init__nnLinearw1w2out_projselfsize	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/torchao/testing/training/dtensor_utils.pyr   #   s   zFeedForward.__init__c                 C   s(   t | || | }| |}|S N)Fsilur   r   r   r   xr!   r!   r"   forward)   s   
zFeedForward.forward)__name__
__module____qualname____doc__r   r(   __classcell__r!   r!   r   r"   r       s    r   c                       s$   e Zd Z fddZdd Z  ZS )ToyModelc                    s   t t|   t|| _d S r#   )r   r.   r   r   ffnr   r   r!   r"   r   0   s   zToyModel.__init__c                 C   s
   |  |S r#   )r/   r&   r!   r!   r"   r(   4   s   
zToyModel.forward)r)   r*   r+   r   r(   r-   r!   r!   r   r"   r.   /   s    r.       Fmeshconfigcompileallgather_in_lowpc              
   C   s  | j }t}t|trt}t||}t|}|||d t|}	||	|d t|}
||
|d |s=t	}t
}t}nt}t}t}t|	| | | | d}	t|
| |tdt d| | |tdddd}
t|}|||d |s|tdt d}n
|tdt dd	}t|| || | |tdddd}|rt|	}	t|
}
t|}tjd
|d
 ||dd}tjd
|d
 ||dd}| }| }t| | tdg}t| | tdg}|	|}|| |
|}|| ||}|| tj|| tj| | tj|	jjjj|
jjjj tj|	jjjj|
jjjj ||}|| tj| | tj|	jjjj|jjjj tj|	jjjj|jjjj d S )N)r2   )ffn.w1ffn.w2ffn.out_proj   )input_layoutsdesired_input_layoutsF)output_layoutsuse_local_output)r/   r5   r6   r7   r   )r9   r:   fwd_config_submodule_fqnr   )devicerequires_gradr   ) device_typer   
isinstancer   r   r.   tocopydeepcopyr   r	   r   r   r   r   r
   r   r   torchr3   randcloner   backwardtestingassert_closefull_tensorr/   r   weightgradr   )r1   r2   r   r3   r4   r>   convert_model_func	toy_modeltoy_model_fp8tp_modelsp_modelcolwise_parallel_clsrowwise_parallel_clsprepare_input_cls	sp_model2prepare_inputx_fp32go_fp32x_fp32_tp_input
go_fp32_tpx_fp32_sp_input
go_fp32_sptp_outsp_out
global_outsp_out2r!   r!   r"   &_test_lowp_mlp_tensor_parallelism_base8   s   












rb   )r0   FF)%rC   typingr   rE   torch.nnr   torch.nn.functional
functionalr$   torch.distributed._tensorr   r   r   torch.distributed.device_meshr   !torch.distributed.tensor.parallelr   r   r	   r
   torchao.float8r   "torchao.float8.float8_linear_utilsr   %torchao.float8.float8_tensor_parallelr   r   r   #torchao.prototype.mx_formats.configr   torchao.quantizationr   Moduler   r.   boolrb   r!   r!   r!   r"   <module>   s6   
