o
    i'                  
   @   s.  d dl Z d dlmZmZmZmZmZ d dlZd dlm	Z	 d dl
m  mZ d dlmZ d dlmZ d dlmZmZmZmZ d dlmZ e de	jddfd	d
Zejjjjejjj jejjj!j"ejjj#jejjj$jejjj%jejjj&jejjj'jejjj(j"ejjj)jh
Z*G dd dej"Z+ej,-e+g dS )    N)AnyListOptionalSetTuple)suggest_memory_format)hp_tensor_to_float8_dynamic)Float8TrainingTensorGemmInputRoleLinearMMConfighp_tensor_and_scale_to_float8)EPSmodulereturnc                    s  ddl m  ddlm tjd  fdd|  D }dd |D }dd	 |D }|s0d
S |\}tj|t	j
d}t|}t|t}|j}|tj}t|j| }|tju rhtj|ttjjd}| tj}	t|D ]\}
}|	|
 |jj_qtd
S )a  
    Calculate scale dynamically for all float8 parameters.

    This should be run after the optimizer step. It performs a single all-reduce
    to compute the scales for all float8 weights.

    Args:
        module: The module containing float8 parameters.

    Example::

        model(input).sum().backward()
        optim.step()
        precompute_float8_dynamic_scale_for_fsdp(model)
    r   DTensor)Float8Linearz7torchao.float8.precompute_float8_dynamic_scale_for_fsdpc                    s4   g | ]}t |rt |j rt |jjtr|qS  )
isinstanceweight_local_tensor!WeightWithDynamicFloat8CastTensor).0mr   r   r   M/home/ubuntu/.local/lib/python3.10/site-packages/torchao/float8/fsdp_utils.py
<listcomp>4   s    
z<precompute_float8_dynamic_scale_for_fsdp.<locals>.<listcomp>c                 S   s   g | ]}|j qS r   )r   r   float8_linearr   r   r   r   ;   s    c                 S   s   h | ]}|j jjqS r   )configcast_config_weighttarget_dtyper   r   r   r   	<setcomp><   s    z;precompute_float8_dynamic_scale_for_fsdp.<locals>.<setcomp>N)ord)max)torch.distributed._tensorr   torchao.float8.float8_linearr   torch_C_log_api_usage_oncemodules_foreach_normmathinfstackclampr   dtypetofloat64finfor$   float16to_localfloat32	enumerater   r   _precomputed_scale)r   float8_linearsweightstarget_dtypesr!   max_weightsamax_tensororigin_dtypescale_tensorlocal_scale_tensorir   r   r   r   (precompute_float8_dynamic_scale_for_fsdp   s6   

rB   c                   @   s   e Zd Ze	ddejdedejdeej fddZ		ddejdedejdeej fdd	Z
edd
dZdd Zedd Zdd Zdd Zdddeejdf dedejdeej fddZdS )r   Ntensorlinear_mm_configr0   precomputed_scalec                 C   s<   t jj| | | | t||j|j|j	|
 |jd
S )N)stridesstorage_offsetmemory_formatr0   layoutdevice
pin_memoryrequires_grad)r'   Tensor_make_wrapper_subclasssizestriderG   r   r0   rI   rJ   	is_pinnedrL   )clsrC   rD   r0   rE   r   r   r   __new__   s   z)WeightWithDynamicFloat8CastTensor.__new__c                 C   s   || _ || _|| _|| _d S N)_tensor_linear_mm_config_dtyper8   )selfrC   rD   r0   rE   r   r   r   __init__   s   
z*WeightWithDynamicFloat8CastTensor.__init__c                    s   |t jjjjkrt|d j|d j|d jS d d   fdd}t	
t|||p*i f\}}||i |}|tvr<|S t	
t j fdd|S )Nr   c                    sF   d u r| j n| j ksJ  d u r| j | jS | j ks J | jS rT   )rV   rW   rU   )tr0   	mm_configr   r   unwrap   s   zDWeightWithDynamicFloat8CastTensor.__torch_dispatch__.<locals>.unwrapc                    s   t |  S rT   )r   )xr[   r   r   <lambda>   s    zFWeightWithDynamicFloat8CastTensor.__torch_dispatch__.<locals>.<lambda>)r'   opsatendetachdefaultr   rU   rV   rW   pytreetree_map_only_ops_to_preserve_subclassrM   )rR   functypesargskwargsr]   outr   r[   r   __torch_dispatch__   s$   z4WeightWithDynamicFloat8CastTensor.__torch_dispatch__c                 C   s(   dg}| j r|d || j| jdfS )NrU   r8   )r\   r0   )r8   appendrV   rW   )rX   tensorsr   r   r   __tensor_flatten__   s   
z4WeightWithDynamicFloat8CastTensor.__tensor_flatten__c                 C   s"   t | d |d |d t| dd S )NrU   r\   r0   r8   )r   getattr)inner_tensorsflatten_spec
outer_sizeouter_strider   r   r   __tensor_unflatten__   s   
z6WeightWithDynamicFloat8CastTensor.__tensor_unflatten__c                 C   s   d| j  d| j d| j dS )Nz)WeightWithDynamicFloat8CastTensor(tensor=z, linear_mm_config=z, dtype=))rU   rV   rW   )rX   r   r   r   __repr__   s   z*WeightWithDynamicFloat8CastTensor.__repr__c                 C   sR   | j d urt| j| j | j| jtj}nt| j| j| jdtj|d}|jf|j	ffS )NT)reduce_amaxgemm_input_roledevice_mesh)
r8   r   rU   rW   rV   r
   WEIGHTr   _data_scale)rX   meshfloat8_training_tensorr   r   r   fsdp_pre_all_gather   s"   
z5WeightWithDynamicFloat8CastTensor.fsdp_pre_all_gather)rk   all_gather_outputs.metadataparam_dtyperk   c                C   s   |\}|\}|d ur2ddl m} t|tr||_d S t||r+t|jtr+||j_d S td| t|||| jtj	d|ffS )Nr   r   z[out must be a Float8TrainingTensor or DTensor(_local_tensor=Float8TrainingTensor), but got )ry   )
r%   r   r   r	   r}   r   RuntimeErrorrV   r
   r{   )rX   r   r   r   rk   datascaler   r   r   r   fsdp_post_all_gather   s0   
	z6WeightWithDynamicFloat8CastTensor.fsdp_post_all_gatherrT   )__name__
__module____qualname__staticmethodr'   rM   r   r0   r   rS   rY   classmethodrl   ro   ru   rw   r   r   r   r   r   r   r   r   r      sN    
!
r   ).r,   typingr   r   r   r   r   r'   torch.nnnntorch.utils._pytreeutils_pytreerd   torch._prims_commonr   #torchao.float8.float8_scaling_utilsr   %torchao.float8.float8_training_tensorr	   r
   r   r   torchao.float8.float8_utilsr   no_gradModulerB   r`   ra   
empty_likerc   	new_zerosslicerM   copy_view
as_strided_to_copy_pin_memorysplitclonerf   r   serializationadd_safe_globalsr   r   r   r   <module>   s2   
@








0 