o
    i8                  &   @   s  d dl Z d dlmZ d dlZd dlmZ ejddZed ed ed ed	 ejd
ej	j
jgd ed ed ed ed ed ed dd Zdd Ze jdd Z						 		 		 		 		 dzdedededee ded ed!ee d"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ef$d-d.Zed/						 		 		 		 		 dzdedededee ded ed!ee d"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ef$d0d1Z		d{d2ed3ed4ed5ed6ed7ee d8eej d,efd9d:Zed;		d{d2ed3ed4ed5ed6ed7ee d8eej d,efd<d1Zd4ed,eeffd=d>Zed?d4ed,eeffd@d1ZdAedBedCedDed,ef
dEdFZedGdAedBedCedDed,ef
dHd1ZdAedBedCedDedIedJed7ee dKee d8eej d,efdLdMZedNdAedBedCedDedIedJed7ee dKee d8eej d,efdOd1Ze  dPdQ ZdRdS ZedTdUedVedWedXefdYdZZd4ed[ed\ed,efd]d^Zed_d4ed[ed\ed,efd`d1Zd2edaedbed4edceddedeed7ee d8ejfdfdgZedhd2edaedbed4edceddedeed7ee d8ejd,efdid1Zedjdkedledmedned*edoedped8ejd,efdqd1Zd4ed[ed,efdrdsZedtd4ed[ed,efdud1Zd2edaed4edced7ee d8ejfdvdwZ edxd2edaed4edced7ee d8ejd,efdyd1ZdS )|    N)Optional)TensortorchaoFRAGMENTzrowwise_scaled_linear_sparse_cutlass_f8f8(Tensor input, Tensor input_scale, Tensor weight, Tensor weight_meta, Tensor weight_scale, Tensor? bias=None, ScalarType? out_dtype=None) -> TensorzLto_sparse_semi_structured_cutlass_sm9x_f8(Tensor weight) -> (Tensor, Tensor)z\swizzle_mm(Tensor mat1, Tensor mat2, bool mat1_is_swizzled, bool mat2_is_swizzled) -> Tensorzswizzle_scaled_mm(Tensor mat1, Tensor mat2, bool mat1_is_swizzled, bool mat2_is_swizzled, Tensor scale_a, Tensor scale_b, Tensor? bias=None, Tensor? scale_result=None, ScalarType? out_dtype=None) -> TensorzImx_fp8_bf16(Tensor a, Tensor b, Tensor a_scale, Tensor b_scale) -> Tensor)tagsa6  qscaled_dot_product(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, float? scale=None, float q_scale=1.0, int q_zp=0, float k_scale=1.0, int k_zp=0, float v_scale=1.0, int v_zp=0, float a_scale=1.0, int a_zp=0, float o_scale=1.0, int o_zp=0) -> Tensorzida8w4_linear_prepack_cpu(Tensor weight, Tensor scales, Tensor qzeros) -> (Tensor, Tensor, Tensor, Tensor)zda8w4_linear_cpu(Tensor input, Tensor input_scales, Tensor input_qzeros, Tensor weight, Tensor weight_scales, Tensor weight_qzeros, Tensor compensation, Tensor? bias, ScalarType output_dtype) -> Tensorz_scaled_embedding_bag(Tensor qweight, Tensor indices, Tensor offsets, Tensor weight_scale, float o_scale, int mode, bool include_last_offset, ScalarType output_dtype) -> TensorzKfloat8_linear_prepack_cpu(Tensor weight, Tensor scales) -> (Tensor, Tensor)zfloat8_linear_cpu(Tensor input, Tensor input_scales, Tensor weight, Tensor weight_scales, Tensor? bias, ScalarType output_dtype) -> Tensorc                        fdd}|S )Nc                    s   t j  | S N)torchlibraryregister_fakefuncname ?/home/ubuntu/.local/lib/python3.10/site-packages/torchao/ops.py	decorator4   s   z%register_custom_op.<locals>.decoratorr   r   r   r   r   r   register_custom_op3      r   c                    r   )Nc                    s   t jj  dd| S )Nr   )mutates_args)r	   r
   	custom_opr   r   r   r   r   ;   s   z*register_custom_op_impl.<locals>.decoratorr   r   r   r   r   register_custom_op_impl:   r   r   c                  C   s&   t jt j } | jd | j }|S )N
   )r	   cudaget_device_propertiescurrent_devicemajorminor)device_propscompute_capabilityr   r   r   cached_compute_capabilityA   s   r!           F      ?querykeyvalue	attn_mask	dropout_p	is_causalscaleq_scaleq_zpk_scalek_zpv_scalev_zpa_scalea_zpo_scaleo_zpreturnc                 C   s0   t jjj| |||||||||	|
||||||S )a  
    Quantized SDPA with quantized inputs and outputs.
    Arguments
        query: input query tensor,
        key: input key tensor,
        value: input value tensor,
        attn_mask: attention mask tensor,
        dropout_p: dropout probability,
        is_causal: causal flag,
        scale: scaling factor applied prior to softmax,
        q_scale: scale for query from linear quantization,
        q_zp: zero point for query from linear quantization,
        k_scale: scale for key from linear quantization,
        k_zp: zero point of key from linear quantization,
        v_scale: zero point for value from linear quantization,
        v_zp: zero point of value from linear quantization,
        a_scale: scale for attention from softmax quantization,
        a_zp: zero point for attention from softmax quantization,
        o_scale: scale for output from linear quantization,
        o_zp: zero point for output from linear quantization,
    Returns
        output of quantized SDPA
    )r	   opsr   qscaled_dot_productdefaultr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r   r   r   r7   H   s&   
*r7   ztorchao::qscaled_dot_productc                 C   s   | S r   r   r9   r   r   r   _   s   r:   inputinput_scaleweightweight_metaweight_scalebias	out_dtypec              	   C   s   t jjj| ||||||S )a.  
    CUTLASS-based row-wise scaled F8F8 linear operator, for sparsified weight case.
    Args:
        input: quantized input tensor, in row-major layout.
        input_scale: scale factors for input tensor, has to be tensor of the same shape as the input tensor, minus the last dimension.
        weight: sparsified quantized weight matrix, in row-major layout.
        weight_meta: sparsify metadata for weight tensor.
        weight_scale: scale factors for weight tensor, one value per row of weight matrix (thus also tensor of the same shape as the weight tensor, minus the last dimension).
        bias: an optional vector of size equal to number of rows of weight tensor, or None.
        out_dtype: optional data type for output tensor.
    Returns:
        output: result tensor, in row-major layout.
    )r	   r6   r   )rowwise_scaled_linear_sparse_cutlass_f8f8r8   )r;   r<   r=   r>   r?   r@   rA   r   r   r   rB      s   
rB   z2torchao::rowwise_scaled_linear_sparse_cutlass_f8f8c           	      C   sB   |d ur|n|j }| j}tjg | jd d |jd R ||dS )Nr   dtypedevice)rE   rF   r	   emptyshape)	r;   r<   r=   r>   r?   r@   rA   rE   rF   r   r   r   r:      s   *c                 C   s   t jjj| S )a  
    CUTLASS-based conversion from sparsified input tensor to corresponding compressed tensor, along with corresponding metadata tensor.
    Args:
        weight: input tensor, in row-major layout.
    Returns:
        weight_compressed: compressed weight tensor, with sparsity eliminated, in row-major layout.
        weight_meta: metadata tensor, describing the sparsity structure of the input tensor, also in row-major layout.
    )r	   r6   r   )to_sparse_semi_structured_cutlass_sm9x_f8r8   r=   r   r   r   rI      s   rI   z2torchao::to_sparse_semi_structured_cutlass_sm9x_f8c                 C   s<   |  | d | d d | j | d t| d d dtjdfS )Nr               rE   )	new_emptymaxr	   charrJ   r   r   r   r:      s   "mat1mat2mat1_is_swizzledmat2_is_swizzledc                 C   s   t jjj| |||S )zO
    Similar to torch.mm but Tensor inputs can be SwizzleTensor instances.
    )r	   r6   r   
swizzle_mmr8   rS   rT   rU   rV   r   r   r   rW      s   
rW   ztorchao::swizzle_mmc                 C      |  | jd |jd S Nr   rK   rP   rH   rX   r   r   r   r:      s   scale_ascale_bscale_resultc	           	      C       t jjj| ||||||||	S )zP
    Similar to torch.mm but Tensor inputs can be SwizzleTensor instances.

    )r	   r6   r   swizzle_scaled_mmr8   	rS   rT   rU   rV   r\   r]   r@   r^   rA   r   r   r   r`      s   
r`   ztorchao::swizzle_scaled_mmc	           	      C   rY   rZ   r[   ra   r   r   r   r:     s   c                   C   s   t tdrtjtjfS tjfS )zGTODO: when e8m0 is hardened and major release lets remove uint8 supportfloat8_e8m0fnu)hasattrr	   uint8rb   r   r   r   r   _get_dtypes%  s   
re   c                    s>   t  }t j|v  fdd tj|v fdd d S )Nc                         d j  S )Nz4A_scale tensor must be uint8 or float8_e8m0fnu, got rO   r   )A_scaler   r   <lambda>2      z%_check_scale_dtypes.<locals>.<lambda>c                      rf   )Nz4B_scale tensor must be uint8 or float8_e8m0fnu, got rO   r   )B_scaler   r   rh   6  ri   )re   r	   _checkrE   )rg   rj   allowed_dtypesr   )rg   rj   r   _check_scale_dtypes-  s   

rm   ztorchao::mx_fp8_bf16ABrg   rj   c                 C   s$   t j| d|dft j| jdS )zMeta impl for mx_fp8_bf16r   rK   rD   )r	   rG   sizebfloat16rF   )rn   ro   rg   rj   r   r   r   meta_mx_fp8_bf16:  s   $rr   scalesqzerosc                 C   s   t jjj| ||S )z
    Prepack weights for DA8W4 linear operator on CPU.
    Args:
        weight: weight tensor.
        scales: scales for weight tensor.
        qzeros: zero points for weight tensor.
    Returns:
        packed weight, scales, and zero points.
    )r	   r6   r   da8w4_linear_prepack_cpur8   r=   rs   rt   r   r   r   ru   @  s   ru   z!torchao::da8w4_linear_prepack_cpuc                 C   s   | ||t  fS r   )r	   r   rv   r   r   r   r:   Q  s   input_scalesinput_qzerosweight_scalesweight_qzeroscompensationc	           	      C   r_   )a  
    DA8W4 linear operator on CPU.
    Args:
        input: input tensor.
        input_scales: scales for input tensor.
        input_qzeros: zero points for input tensor.
        weight: weight tensor.
        weight_scales: scales for weight tensor.
        weight_qzeros: zero points for weight tensor.
        compensation: compensation tensor for weight.
        bias: optional bias tensor.
        out_dtype: output data type.
    Returns:
        output tensor in out_dtype.
    )r	   r6   r   da8w4_linear_cpur8   )	r;   rw   rx   r=   ry   rz   r{   r@   rA   r   r   r   r|   V  s   
r|   ztorchao::da8w4_linear_cpuc	           
      C   sL   |  dksJ |d|d d }	| jg | jd d |	R d|iS )N   r      rL   rC   rE   dimrp   rP   rH   )
r;   rw   rx   r=   ry   rz   r{   r@   rA   Nr   r   r   r:   }  s   $ztorchao::_scaled_embedding_bagqweightindicesoffsetsw_scalesmodeinclude_last_offsetc           	      C   s0   |dksJ |j d d }| j|| j d |dS )NTr   rK   rO   )rH   rP   )	r   r   r   r   r3   r   r   rA   
batch_sizer   r   r   r:     s   c                 C   s   t jjj| |S )z
    Prepack weights for float8 linear operator on CPU.
    Args:
        weight: weight tensor.
        scales: scales for weight tensor.
    Returns:
        packed weight, packed scales
    )r	   r6   r   float8_linear_prepack_cpur8   r=   rs   r   r   r   r     s   r   z"torchao::float8_linear_prepack_cpuc                 C   s   | |fS r   r   r   r   r   r   r:     s   c                 C   s   t jjj| |||||S )aH  
    float8 linear operator on CPU.
    Args:
        input: input tensor.
        input_scales: scales for input tensor.
        weight: weight tensor.
        weight_scales: scales for weight tensor.
        bias: optional bias tensor.
        out_dtype: output data type.
    Returns:
        output tensor in out_dtype.
    )r	   r6   r   float8_linear_cpur8   )r;   rw   r=   ry   r@   rA   r   r   r   r     s   
r   ztorchao::float8_linear_cpuc                 C   s^   |  dv sJ |  dkr|d|d n|d}| jg | jd d |R d|iS )N)rL   r}   r}   r   r~   rC   rE   r   )r;   rw   r=   ry   r@   rA   r   r   r   r   r:     s   	*$)Nr"   FNr#   r   r#   r   r#   r   r#   r   r#   r   )NN)!	functoolstypingr   r	   r   r
   Librarylibdefine_CTagneeds_fixed_stride_orderr   r   	lru_cacher!   floatboolintr7   r:   rE   rB   rI   rW   r`   re   rm   rr   ru   r|   r   r   r   r   r   r   <module>   s  


	

?	



	

	


	
'	
	

