o
    iP                     @   s  U d dl mZmZmZmZ d dlZd dlmZ d dlm	Z	m
Z
 d dlmZmZ d dlmZ ejjZejjZejjZi Zeeef ed< 			d=d	ejd
ejdejdejdejdeej deej dedejfddZdd Zdd Zeejjejjejjej jej!j"ej#jgd>ddZ$eej%jgd>ddZ&eej'jej(j)gd>ddZ*eej+jgd>ddZ,eej-jgd>d d!Z.eej/jgd>d"d#Z0eej1j2gd>d$d%Z3d&e	d'e	fd(d)Z4eej5jej6jgd>d*d+Z7eej8jgd>d,d-Z9eej:jgd>d.d/Z;eej<jgd>d0d1Z=eej>jej>jgd>d2d3Z?eej@jej@jgd>d4d5ZAed6rCeejBjgd>d7d8ZCeejDjgd>d9d:ZEeejFjgd>d;d<ZGdS )?    )AnyDictOptionalTupleN)tree_map)Float8TrainingTensorchoose_scaled_mm_config)is_row_majorpad_tensor_for_matmul)torch_version_at_leastFLOAT8_OPS_TABLEFa_dataa_scaleb_datab_scaleoutput_dtypeoutput_scalebiasuse_fast_accumreturnc              
   C   s   |  }|  }	d}
|j| jd dfko|jd|jd fk}|r0|s0||	 }
|d}|d}	|}|tjtjfv r?|r?tj}d}|tjkrJ|}d}tj| |||	||||d}|
dur_||
9 }|durg||7 }|tjtjfv rv|rv||}|S )z
    This is the unwrapped version of addmm_float8, which does not take in Float8TrainingTensors
    as inputs. This is used to standardize the logic between subclassed and non subclassed
    versions of the linear module.
    Nr       )scale_ascale_br   scale_result	out_dtyper   )	
reciprocalshapenew_onestorchfloat16float32bfloat16
_scaled_mmto)r   r   r   r   r   r   r   r   a_inverse_scaleb_inverse_scalepost_inverse_scaleis_rowwise_scaling
orig_dtype	post_biasoutputr   r   M/home/ubuntu/.local/lib/python3.10/site-packages/torchao/float8/float8_ops.pyaddmm_float8_unwrapped   sF   



r-   c                 C   s    t |jdv sJ |  dd S )N)r   r   z+ with axiswise scaling is not supported yet)lenr   )aten_opscaler   r   r,   _assert_tensorwise_scale_   s   r1   c                    s    fdd}|S )z(Register aten ops to the float8 op tablec                    s8    D ]}|t v rtd| dt | j | t |< q| S )Nz
Float8 op z is already registered to )r   RuntimeError__name__)funcopaten_opsr   r,   	decoratorj   s   
zimplements.<locals>.decoratorr   )r7   r8   r   r6   r,   
implementsg   s   	r9   c                 C   s\   t | |d j | |d jg|dd  R i |}t||d j|d j|d j|d jS Nr   r   )r1   _scale_datar   _orig_dtype_linear_mm_config_gemm_input_role)r/   argskwargsnew_datar   r   r,   float8_desugar_opv   s   $rC   c                 C   sj   | |d j g|dd  R i |}| |d jg|dd  R i |}t|||d j|d j|d jS r:   )r<   r;   r   r=   r>   r?   )r/   r@   rA   rB   	new_scaler   r   r,    float8_desugar_data_and_scale_op   s   $$rE   c                 C   s   | |d j g|dd  R i |}|d jjdkr-| |d jg|dd  R i |}n|d j}| tjjkr@t| |d j |d j}|}|d urX|dkrT|dk n|dk t|||d j	|d j
|d j|S )Nr   r   )r<   r;   ndimaten	transposeintr1   _axiswise_dimr   r=   r>   r?   )r/   r@   rA   rB   rD   old_axiswise_dimnew_axiswise_dimr   r   r,   float8_transpose   s(   $&


rN   c           
      C   s  |d |d }}|t |jjkr;| |d jg|dd  R i |}t||d j|d j|d j|d j|d jS t	|d jjdk rKt
| ||S |j}t	|dkr|dkr| |j|fi |}d|d g}| |j|fi |}t|||j|j|j|jS |dks|t	|jd kr| |j|fi |}|d dg}| |j|fi |}d}	t|||j|j|j|	S t|  d|j d|jj d|j d| d	
)
Nr   r      rF   z# with axiswise scaling and t.shape z t._scale.shape z t._axiswise_dim z new_shape z is not supported yet.)listr<   r   r   r;   r=   r>   r?   rK   r.   rC   AssertionError)
r/   r@   rA   t	new_shaperB   axiswise_dimnew_scale_shaperD   rM   r   r   r,   float8_view   sV   $		(rV   c                    sR   |  d j g dd  R i |}t|  d j  fdd}t||}t|S )Nr   r   c                    s(   t |  d j d j d j d jS )Nr   )r   r;   r=   r>   r?   )datar@   r   r,   make_float8   s   z!float8_split.<locals>.make_float8)r<   r1   r;   maprP   )r/   r@   rA   new_data_tensorsrY   outr   rX   r,   float8_split   s
   $
	r]   c                 C   s  |d }|d j }|d j}|d j}|d jj}|d j}g }	|D ]I}
t|
ts-J d|
j |ks6J d|
j|u s?J d|
j|u sHJ d|
jj|ksRJ d|
j|u s[J dt| |
j |		|
j
tj q"| |	g|dd  R i |}|
|}t|||||S )	Nr   z7Expecting all chunks to be of type Float8TrainingTensorz,Expecting all chunks to be of the same dtypezCExpecting all chunks to have thee same scale as a result of a splitzGExpecting all chunks to have thee same mm config as a result of a splitzCExpecting all chunks to be of the same dtype as a result of a splitzLExpecting all chunks to have the same gemm_input_role as a result of a splitr   )r=   r;   r>   r<   dtyper?   
isinstancer   r1   appendviewr   uint8)r/   r@   rA   chunked_tensorsr)   r0   	mm_config	fp8_dtypegemm_input_role
chunk_datachunkrB   r   r   r,   
float8_cat	  s>   




ri   c                 C   s:   t | |d j dd }t||}t||}| |i |S )a)  Be careful with this function, this is a "fallback" op that
    casts the output of the op to the original precision. And performs the op.

    We currently need this to support the backward for admmm bias.
    "addmm" -> out
    "hp_gradBias" <-"sum" <- "identity" <- gradOut <- "hp_gradOut"
    r   c                 S   s   t | tr	|  S | S N)r_   r   to_original_precision)xr   r   r,   unwrap9  s   
z!float8_cast_up_op.<locals>.unwrap)r1   r;   r   )r/   r@   rA   rm   new_args
new_kwargsr   r   r,   float8_cast_up_op.  s
   	

rp   abc                 C   s  | j }| j}|j }t| j| j|j|j}|jrA| j d|j dks5J d| j d d|j d t|dd}t|dd}t|	 sK|
 }t|	 rY| 
  }|j}| jd u rs|jd urs||jd dd}n| jd ur|jd u r||jd dd}||||fS )Nr   r   z"Inner dims must match for mm, got z and )dimsrF   )r<   r;   r   r?   r>   pad_inner_dimsizer
   r	   stride
contiguousrR   rK   repeatr   reshape)rq   rr   r   r   r   scaled_mm_configr   r   r   r,   preprocess_addmmC  s2   
r{   c              
   C   s   |d }|d }t |trt |tsJ dt|t|t||\}}}}|j}	t|j|j|j|j}
|
j	rMt
|j |j |j |j |	S t|||||	d d |
jd}|S )Nr   r   zFExpecting  both Float8TrainingTensor for mm inputs but found {} and {}r   r   r   )r_   r   formattyper{   r=   r   r?   r>   emulater   mmr<   floatr;   r$   r-   r   )r/   r@   rA   rq   rr   r   r   r   r   r   rz   
tensor_outr   r   r,   	float8_mml  s@   $
r   c              
   C   s   t |d tjrt |d trt |d tsJ |d }|d }|d }t||\}}}}	|j}
|j|
ks9J dt|j|j	|j|j	}|j
r`t|j |j |j |j |
}|| S t||||	|
d ||jd}|S )Nr   r   rO   z"bias dtype must match output dtyper|   )r_   r   Tensorr   r{   r=   r^   r   r?   r>   r   r   r<   r   r;   r$   r-   r   )r/   r@   rA   r   rq   rr   r   r   r   r   r   rz   r\   r   r   r   r,   float8_addmm  sD   $
r   c                 C   s$   t | |d j |d j|d jkS r:   )r1   r;   r   r/   r@   rA   r   r   r,   float8_is_same_size  s   r   c                 C   s~   t |d ts	J t|dkrd|v sJ d|d tjtjhv s%J dt|d j|d j|d |d j|d j	|d j
S )zThis gets called when running matmul under autocast
    when the input is a Float8TrainingTensor, presenting as a fp32
    tensor.
    r   r   r^   z%Only support dtype kwarg for autocastzKOnly support floating point conversion for autocast w/ Float8TrainingTensor)r_   r   r.   r   r    r"   r<   r;   r>   r?   rK   r   r   r   r,   autocast_to_copy  s$   r   c                 C   sx   t | |d j |d }t|tsJ dt| |j}| }| |g|dd R i |}t||j|j|j|j	S )z+
    override funcol with FP8 handling
    r   z9expecting a Float8TrainingTensor for allgather but found r   N)
r1   r;   r_   r   r~   r<   rw   r=   r>   r?   r/   r@   rA   	fp8_inputfp8_datafp8_outr   r   r,   allgather_fp8  s   
r   c                 C   sb   t | |d j |d }t|tsJ |j}| |g|dd  R i |}t||j|j|j|jS r:   r1   r;   r_   r   r<   r=   r>   r?   r   r   r   r,   wait_tensor_fp8  s   r   z
2.11.0.devc                 C   sb   t | |d j |d }t|tsJ |j}| |g|dd R i |}t||j|j|j|jS )z
        Handle _wrap_tensor_autograd for Float8TrainingTensor.
        This wraps the underlying fp8 data in AsyncCollectiveTensor while
        preserving the Float8TrainingTensor wrapper with its scale and metadata.
        r   r   Nr   r   r   r   r,   wrap_tensor_autograd_fp8  s   r   c                 C   s   |d }|d }t |tsJ t |tsJ t||d j |j|jks&J |j|jks.J |j|jks6J |j}|j}| ||d |g|dd  R i |}t||j|j|j|jS )Nr   rO   r      )	r_   r   r1   r;   r^   r=   r<   r>   r?   )r/   r@   rA   fp8_self
fp8_valuesr   fp8_values_datar   r   r   r,   index_put_fp8  s$   &r   c                 C   s$  |d }|d }t |ts,t |tr,| }t| |j | ||g|dd  R i |S t |trt |trt| |j |j|jksFJ d|j|jksPJ d|j|jksZJ d|jj|jjksfJ d|j	|j	kspJ d| |j|jg|dd  R i |}t||j|j|j|j	S t
d	)
Nr   r   rO   z<Expecting both Float8TrainingTensors to be of the same dtypez<Expecting both Float8TrainingTensors to have thee same scalez@Expecting both Float8TrainingTensors to have thee same mm configz=Expecting both Float8TrainingTensors to be of the same dtypetzEExpecting both Float8TrainingTensors to have the same gemm_input_rolez7Unsupported semantics for copy_ in Float8TrainingTensor)r_   r   rk   r1   r;   r=   r>   r<   r^   r?   r2   )r/   r@   rA   selfsrcsrc_hpr   r   r   r,   copy_fp8/  sH   	 $r   )NNFrj   )Htypingr   r   r   r   r   torch.utils._pytreer   %torchao.float8.float8_training_tensorr   r   torchao.float8.float8_utilsr	   r
   torchao.utilsr   opsrH   c10d_functional_c10d_functionalr   __annotations__r   r^   boolr-   r1   r9   _unsafe_viewdefault
as_stridedcloneslicefill_Scalarry   rC   detachrE   rR   rI   rJ   rN   ra   rV   splitr]   catri   sumdim_IntListrp   r{   r   matmulr   addmmr   is_same_sizer   _to_copyr   all_gather_into_tensorr   wait_tensorr   _wrap_tensor_autogradr   
index_put_r   copy_r   r   r   r   r,   <module>   s   	
B
6$)"$
