o
    i?(                     @   s  d Z ddlZddlmZmZmZmZmZ ddlZddl	m
Z
mZ ddlmZ ddlmZmZ ejZG dd deZd	ed
ededeeef fddZdejdeedf fddZ			d7d	eded
ededejdee dee dedefddZdejdee ded ed!ed"edejfd#d$Zd%ejdefd&d'Zd%ejdefd(d)Zd%ejdefd*d+Zd%ejdefd,d-Zd.eeeeef ee f defd/d0Zd1eeeeeef ee f  deeef fd2d3Z d4eeef ddfd5d6Z!dS )8z;
Defines an nn module designed to be used during inference
    N)List
NamedTupleOptionalTupleUnion)is_row_majorpad_tensor_for_matmul)FP8Granularity)is_MI300is_sm_at_least_89c                   @   s6   e Zd ZU dZdZeed< dZeed< dZeed< dS )Float8MMConfiga  
    Configuration for the scaled_mm in the forward and backward pass.

    Attributes:
        emulate (bool): Whether to emulate the matmuls in fp32.
        use_fast_accum (bool): Whether to use the fast-accumulation option for scaled_mm.
        pad_inner_dim (bool): Whether to pad the inner dimension of a and b with 0s.
                              This is needed for matmuls not aligned to 16.
    Femulateuse_fast_accumpad_inner_dimN)	__name__
__module____qualname____doc__r   bool__annotations__r   r    r   r   L/home/ubuntu/.local/lib/python3.10/site-packages/torchao/float8/inference.pyr      s
   
 
r   a_datab_datascaled_mm_configreturnc                 C   s   |j r)| d|dksJ d| d d|d t| dd} t|dd}t|  s3|  } t| rA|   }| |fS )a  Preprocess the inner fp8 data tensors for admmm
    Args:
        a_data: Input tensor A.
        b_data: Input tensor B.
        scaled_mm_config: Configuration for _scaled_mm.
    Returns:
        Preprocessed tensors A and B in the format for _scaled_mm.
       r   z"Inner dims must match for mm, got z and )dims)r   sizer   r   stride
contiguoust)r   r   r   r   r   r   preprocess_data)   s   r"   input_scaleinput_shape.c                 C   sD   |   dkr| ddS | d} |  dkr | d| jd } | S )z:Ensures input tensor is correctly formatted for _scaled_mmr      )numelreshape	unsqueezedimshape)r#   r$   r   r   r   preprocess_scaleC   s   
r,   Fa_scaleb_scaleoutput_dtypeoutput_scalebiasr   c           	   
   C   sL   |t jkr|durt j| ||||||d}|| S t j| |||||||dS )z
    This is the unwrapped version of addmm_float8, which does not take in Float8TrainingTensors
    as inputs. This is used to standardize the logic between subclassed and non subclassed
    versions of the linear module.
    N)scale_ascale_bscale_result	out_dtyper   )r2   r3   r1   r4   r5   r   )torchfloat32
_scaled_mm)	r   r-   r   r.   r/   r0   r1   r   outputr   r   r    addmm_float8_unwrapped_inferenceU   s*   	r:   scale
data_shaper*   startendstepc                    s   t jj}j kr|j||||S t fddtt D }|t|kr+S || }|dkr=|j||||S |durE|| nd}	|durS|| d | nd}
|dkr]t	d|j||	|
dS )z
    Slice the scale tensor appropriately based on the data tensor slicing.
    This function calculates how the scale should be sliced when the data tensor
    is sliced along a given dimension, taking into account the block structure.
    c                 3   s"    | ]} | j |  V  qd S )N)r+   .0ir<   r;   r   r   	<genexpr>   s     z-_slice_scale_for_dimension.<locals>.<genexpr>r   Nz;Slicing with step > 1 is not implemented for scale tensors.)
r6   opsatenr+   sliceTensortuplerangelenNotImplementedError)r;   r<   r*   r=   r>   r?   rF   block_sizesblock_size_for_dimscale_start	scale_endr   rC   r   _slice_scale_for_dimension}   s&   
 rQ   xc                 C   s8   t | ds	J dt| jd|  d  | jd f kS )~Checks if a quantized tensor is rowwise scaled
    Args:
        x: quantized tensor (should have `block_size` attribute)
    
block_size.Expecting input to have `block_size` attribute)r   r   r%   )hasattrrI   rT   r*   r+   rR   r   r   r   _is_rowwise_scaled   s   &rX   c                    s.   t  ds	J dt fddt jD S )rS   rT   rU   c                 3   s2    | ]} j | d kp j |  j| kV  qdS )r%   N)rT   r+   r@   rW   r   r   rD      s    "
z(_is_tensorwise_scaled.<locals>.<genexpr>)rV   allrJ   ndimrW   r   rW   r   _is_tensorwise_scaled   s   r[   c                 C   sF   t | ds	J d| j}t|dko"t|dd dko"|d dkS )zChecks if a quantized tensor is scaled with a block size of 1x128
    Args:
        x: quantized tensor (should have `block_size` attribute)
    rT   rU   r&   Nr%   r      )rV   rT   rK   mathprodrR   br   r   r   _is_1_128_scaled   s   .ra   c                 C   s<   t | ds	J d| j}t|dko|d dko|d dkS )zChecks if a quantized tensor is scaled with a block size of 128x128
    Args:
        x: quantized tensor (should have `block_size` attribute)
    rT   rU   r&   r   r\   r   )rV   rT   rK   r_   r   r   r   _is_128_128_scaled   s   $rb   gc                 C   s@   ddl m} t| dko| d |ddgko| d |ddgkS )Nr   )PerBlockr&   r   r\   ) torchao.quantization.granularityrd   rK   )rc   rd   r   r   r   !_granularity_is_a_1_128_w_128_128   s   4rf   granularityc                 C   s   ddl m}m} d }| d u r| | f}|S t| ||fr#| | f}|S t| ttfrwt| dkrwt| d |o=t| d |}t| d |oKt| d |}t| }|s^|s^|s^td|  dt| d t	| d sqtd|  dt| }|S td|  d)	Nr   PerRow	PerTensorr&   r   zUnsupported granularity types: .zEDifferent granularities for activation and weight are not supported: z#Invalid granularity specification: )
re   ri   rj   
isinstancerI   listrK   rf   
ValueErrortype)rg   ri   rj   processed_granularityis_per_tensor
is_per_rowis_a_1_128_w_128_128r   r   r   _normalize_granularity   s2   	
rt   granularitiesc                 C   s   ddl m}m} t| d |ot| d |}t| d |o#t| d |}t| }|s,|rFtj s@tj r9t	 sDt
 sBJ ddS dS dS |rQt	 sOJ ddS td|  d)	a9  
    Validate that the hardware supports the requested granularities.

    Args:
        granularities: Tuple of (activation_granularity, weight_granularity)

    Raises:
        AssertionError: If hardware doesn't support the requested granularity
        ValueError: If invalid granularity type is provided
    r   rh   r   uU   Float8 dynamic quantization requires CUDA compute capability ≥8.9 or MI300+ or XPU.u[   Float8 1x128 activation and 128x128 weight scaling requires CUDA compute capability ≥8.9.zInvalid granularities rk   N)re   ri   rj   rl   rf   r6   xpuis_availablecudar   r
   rn   )ru   ri   rj   rq   rr   rs   r   r   r   _check_hardware_support  s.   
ry   )NNF)"r   r]   typingr   r   r   r   r   r6   torchao.float8.float8_utilsr   r   torchao.float8.typesr	   torchao.utilsr
   r   rH   r   r"   intr,   dtyper   r:   rQ   rX   r[   ra   rb   rm   rf   rt   ry   r   r   r   r   <module>   s   

	
(
3	






(
