o
    oij                     @   s  U d dl mZ d dlmZmZmZmZmZmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZmZ er=d dlmZ d dlmZ eeeeef f ZG dd	 d	ZG d
d deZ	ddejjdeg ejf de	eejgejf  defddZi dej dej!dddej"dej#dej$didej dej!dddej"dej#dej$didej dej!d dd!ej"d"ej#d"ej$d#id$ej!d%dd%ej"d%ej#d%ej$d&d'd(id)ej!d*dd*ej"d*ej#d*ej$d+d'd,id-ej!d.dd/ej"d0ej#d0ej$d1d'd2id3ej!d4dd4ej"d5ej#d5ej$d6d'd7id8ej d9ej!d:dd;ej"d<ej#d<ej$d=id>ej!d?dd@ej"d?ej#d?ej$dAd'dBidCej!dDddEej"dDej#dDej$dFd'dGidHej!dIddJej"dKej#dKej$dLd'dMidNej!dOddOej"dOej#dOej$dPd'dQidRej!dSddSej"dSej#dSej$dTd'dUidVej!dWddWej"dWej#dWej$dXd'dYidZej!d[dd[ej"d[ej#d[ej$d\d'd]id^ej!d_dd_ej"d_ej#d_ej$d`d'daidbej!dcej#ddej$ded'dfiej!dgej#dhiej!dgej#diej$djd'dkiej!dlej#dmej$dnd'doiej!dpej#dqej$drd'dsiej!dtej#duej$dvd'dwiej!dxej#dyej$dzd'd{iej d|ej!d}ej#dKiej d~ej!dej#diej dej!dej#deid	Z%eeeeeej&f ef f e'd< dddddZ(dej)deej&ef de	e fddZ*dddej&fddZ+e
dedZ,G dd dee, Z-dS )    )deque)	TYPE_CHECKINGAnyCallableDequeDictListOptionalTypeVarUnionN)override)rank_zero_onlyrank_zero_warn)Fabric)	Precisionc                   @   s   e Zd ZdZ	ddee deded	ed
df
ddZddddedededee dee d
dfddZ	d
e
fddZdddZdS )
Throughputa  Computes throughput.

    +------------------------+-------------------------------------------------------------------------------------+
    | Key                    | Value                                                                               |
    +========================+=====================================================================================+
    | batches_per_sec        | Rolling average (over ``window_size`` most recent updates) of the number of batches |
    |                        | processed per second                                                                |
    +--------------------------+-----------------------------------------------------------------------------------+
    | samples_per_sec        | Rolling average (over ``window_size`` most recent updates) of the number of samples |
    |                        | processed per second                                                                |
    +--------------------------+-----------------------------------------------------------------------------------+
    | items_per_sec          | Rolling average (over ``window_size`` most recent updates) of the number of items   |
    |                        | processed per second                                                                |
    +--------------------------+-----------------------------------------------------------------------------------+
    | flpps_per_sec          | Rolling average (over ``window_size`` most recent updates) of the number of flops   |
    |                        | processed per second                                                                |
    +--------------------------+-----------------------------------------------------------------------------------+
    | device/batches_per_sec | batches_per_sec divided by world size                                               |
    +--------------------------+-----------------------------------------------------------------------------------+
    | device/samples_per_sec | samples_per_sec divided by world size                                               |
    +--------------------------+-----------------------------------------------------------------------------------+
    | device/items_per_sec   | items_per_sec divided by world size. This may include padding depending on the data |
    +--------------------------+-----------------------------------------------------------------------------------+
    | device/flops_per_sec   | flops_per_sec divided by world size.                                                |
    +--------------------------+-----------------------------------------------------------------------------------+
    | device/mfu             | device/flops_per_sec divided by world size.                                         |
    +--------------------------+-----------------------------------------------------------------------------------+
    | time                   | Total elapsed time                                                                  |
    +--------------------------+-----------------------------------------------------------------------------------+
    | batches                | Total batches seen                                                                  |
    +--------------------------+-----------------------------------------------------------------------------------+
    | samples                | Total samples seen                                                                  |
    +--------------------------+-----------------------------------------------------------------------------------+
    | lengths                | Total items seen                                                                    |
    +--------------------------+-----------------------------------------------------------------------------------+

    Example::

        throughput = Throughput()
        t0 = time()
        for i in range(1000):
            do_work()
            if torch.cuda.is_available(): torch.cuda.synchronize()  # required or else time() won't be correct
            throughput.update(time=time() - t0, samples=i)
            if i % 10 == 0:
                print(throughput.compute())

    Notes:
        - The implementation assumes that devices FLOPs are all the same as it normalizes by the world size and only
          takes a single ``available_flops`` value.
        - items_per_sec, flops_per_sec and MFU do not account for padding if present. We suggest using
          samples_per_sec or batches_per_sec to measure throughput under this circumstance.

    Args:
        available_flops: Number of theoretical flops available for a single device.
        world_size: Number of devices available across hosts. Global metrics are not included if the world size is 1.
        window_size: Number of batches to use for a rolling average.
        separator: Key separator to use when creating per-device and global metrics.

    N   d   /available_flops
world_sizewindow_size	separatorreturnc                 C   sj   || _ || _|dksJ || _|dksJ t|d| _t|d| _t|d| _t|d| _t|d| _	d S )Nr   r   )maxlen)
r   r   r   _MonotonicWindow_time_batches_samples_lengthsr   _flops)selfr   r   r   r    r"   Y/home/ubuntu/.local/lib/python3.10/site-packages/lightning/fabric/utilities/throughput.py__init__^   s   zThroughput.__init__)lengthsflopstimebatchessamplesr%   r&   c                C   s   | j | ||k rtd| d| d| j| | j| |durU||k r4td| d| d| j| t| jt| jkrUtdt| j dt| j d|durd| j|| j	  dS dS )	ad  Update throughput metrics.

        Args:
            time: Total elapsed time in seconds. It should monotonically increase by the iteration time with each
                call.
            batches: Total batches seen per device. It should monotonically increase with each call.
            samples: Total samples seen per device. It should monotonically increase by the batch size with each call.
            lengths: Total length of the samples seen. It should monotonically increase by the lengths of a batch with
                each call.
            flops: Flops elapased per device since last ``update()`` call. You can easily compute this by using
                :func:`measure_flops` and multiplying it by the number of batches that have been processed.
                The value might be different in each device if the batch size is not the same.

        zExpected samples (z') to be greater or equal than batches ()NzExpected lengths (z') to be greater or equal than samples (zIf lengths are passed (z1), there needs to be the same number of samples ()
r   append
ValueErrorr   r   r   lenRuntimeErrorr    r   )r!   r'   r(   r)   r%   r&   r"   r"   r#   updateq   s$   zThroughput.updatec                 C   s  | j d | jd | jd d}| jr| jd |d< | jdk}t| j | j jkr| j d | j d  }| jd | jd  }| jd | jd  }|| }|| }|d| j d|| d| j d|i |rt|| j }|||| j d	 t| j| jjkr| jd | jd  }	|	| }
|
|d| j d
< |r|
| j }||d
< t| j	| j	jkrt
| j	| j	d  }| j d | j d  }|| }|| j }|r||d< ||d| j d< | jr|| j |d| j d< |S )zCompute throughput metrics.)r'   r(   r)   r%   r   r   devicebatches_per_secsamples_per_sec)r2   r3   items_per_secflops_per_secmfu)r   r   r   r   r   r-   r   r/   r   r    sumr   )r!   metricsadd_global_metricselapsed_timeelapsed_batcheselapsed_samplesdev_samples_per_secdev_batches_per_secr3   elapsed_lengthsdev_items_per_secr4   elapsed_flopsr5   dev_flops_per_secr"   r"   r#   compute   sR   



zThroughput.computec                 C   s6   | j   | j  | j  | j  | j  d S N)r   clearr   r   r   r    r!   r"   r"   r#   reset   s
   



zThroughput.reset)Nr   r   r   )r   N)__name__
__module____qualname____doc__r	   floatintstrr$   r/   _THROUGHPUT_METRICSrC   rG   r"   r"   r"   r#   r       s>    >

)4r   c                       sJ   e Zd ZdZdddeddf fddZdd	ee dedefd
dZ	  Z
S )ThroughputMonitora  Computes throughput.

    This class will automatically keep a count of the number of log calls (``step``). But that can be modified as
    desired. For manual logging, using :class:`Throughput` directly might be desired.

    Example::

        logger = ...
        fabric = Fabric(logger=logger)
        throughput = ThroughputMonitor(fabric)
        t0 = time()
        for i in range(1, 100):
            do_work()
            if torch.cuda.is_available(): torch.cuda.synchronize()  # required or else time() won't be correct
            throughput.update(time=time() - t0, batches=i, samples=i)
            if i % 10 == 0:
                throughput.compute_and_log(step=i)

    Args:
        fabric: The Fabric object.
        \**kwargs: See available parameters in :class:`Throughput`

    fabricr   kwargsr   Nc                    s   |   t|jj}t|j|}t jd||jd| || _	d| _
t| j| _t| ji d| _t| ji d| _t| j| _d S )N)r   r   r0   )defaultr"   )_validate_launched_plugin_to_compute_dtypestrategy	precisionget_available_flopsr1   superr$   r   _fabricstepr   r/   rC   compute_and_logrG   )r!   rQ   rR   dtyper   	__class__r"   r#   r$      s   zThroughputMonitor.__init__r[   c                 K   s>   |du r	| j d n|| _ | jdi |}| jj|| j d |S )zSee :meth:`Throughput.compute`

        Args:
            step: Can be used to override the logging step.
            \**kwargs: See available parameters in :meth:`Throughput.compute`

        Nr   )r8   r[   r"   )r[   rC   rZ   log_dict)r!   r[   rR   r8   r"   r"   r#   r\      s   z!ThroughputMonitor.compute_and_logrD   )rH   rI   rJ   rK   r   r$   r	   rM   rO   r\   __classcell__r"   r"   r^   r#   rP      s    $rP   model
forward_fnloss_fnr   c                 C   sz   ddl m} |dd}|$ |du r|  n||   W d   | S W d   | S 1 s4w   Y  | S )a-  Utility to compute the total number of FLOPs used by a module during training or during inference.

    It's recommended to create a meta-device model for this:

    Example::

        with torch.device("meta"):
            model = MyModel()
            x = torch.randn(2, 32)

        model_fwd = lambda: model(x)
        fwd_flops = measure_flops(model, model_fwd)

        model_loss = lambda y: y.sum()
        fwd_and_bwd_flops = measure_flops(model, model_fwd, model_loss)

    Args:
        model: The model whose FLOPs should be measured.
        forward_fn: A function that runs ``forward`` on the model and returns the result.
        loss_fn: A function that computes the loss given the ``forward_fn`` output. If provided, the loss and `backward`
            FLOPs will be included in the result.

    r   )FlopCounterModeF)displayN)torch.utils.flop_counterre   backwardget_total_flops)rb   rc   rd   re   flop_counterr"   r"   r#   measure_flops
  s   


rk   h100 nvlg  wBg  $^/lBtfloat32g SCg SCg  />2,Ch100 sxmg  wBg  $^/lBg SBg  />2C	h100 pcieg   vHBg   vHBg  |Bg  |Cg @.CCzrtx 4090g  $Bg bCint4g bCzrtx 4080g  m%Bg )Bg )Cl4g  ĎBg  WHBg  $ Bg  $ Bg  $ Bl40g  ˓Bg  ˓Bg  ˓Bg  ˓Ca100g  ꤡBg  2Bg  2Bg  2Bg  2Ca6000g  \EBg  \EBg gBg @$ Ca40g  xBg  xBg bcBg Ca10gg  P`Bg  4&kBg  4&kBg  4&kBg  4&kBzrtx 3090 tig  @0Bg  @0Bg  @0Czrtx 3090g  Pb0Bg  q$Bg  q$ Czrtx 3080 tig  cBg  cBg  YBzrtx 3080g  jZBg  .Bg  .Bzrtx 3070g  IvvBg  gH|Bg  gH|Bt4g  }wBg  Bg  Bg  Bg   h_Bg  HBg  HBg  HBg  HBg  `cԩBg  H`Bg  3Bg  3Bg   xHBg   xHBg   xHBg FBg  Bg  <vBg  /|Bg  /|Bg  pkGBg  pkGBg  *Bg  *Bg  P`Bg  ᎬBg  BwBg  BwBg  BwBg  @YԝBg  @YԭB)	quadro rtx 5000zrtx 2080 superzrtx 2080 tizrtx 2080zrtx 2070 super	titan rtxv100 sxm	v100 pcie
v100s pcie_CUDA_FLOPSg  聰vBg  ӊBg  CBg  `teB)v2v3v4	v5litepodr1   r]   c                 C   sx  | j dkrtj| }| }d|v r,d|v rd}nd|v r!d}nd|v s)d|v r+d	}n~d
|v r9d|v r6dnd
}nqd|v r[|dd }d}d|v rMd}nd|v rSd}d| | }nOd|v rbd}nHd|v rid}nAd|v rpd}n:d|v rwd}n3d|v r~d}n,d|v rd}n%d|v rd}nd|v rd}nd|v rd }nd!|v rd"}n	td#| d$S |tvrtd#|d%| d$S t| }|tju rd&d'l	m
} | rt d(krd)}||vrt|d*|  d$S t|| S | j d+kr:d&d,lm} |rd&d-lm}	 nd&d-lm}	 |	 }
|
d.p|
d/ d0d& }| }t|ts#J |tvr4td1|d2|  d$S tt| S d$S )3zReturns the available theoretical FLOPs.

    This is an optimistic upper limit that could only be achievable if only thick matmuls were run in a benchmark
    environment.

    cudah100hbm3rn   nvlrl   pciehbm2ero   rq   teslarr   zgeforce rtx     rY   z supertiz tizrtx rt   rs   ru   rv   rw   rx   ry   zv100-sxmrz   z	v100-pcier{   z
v100s-pcier|   zFLOPs not found for Nz
, chip is r   )_is_ampere_or_laterhighestrm   z does not support xla)_XLA_GREATER_EQUAL_2_1)tpuTYPEACCELERATOR_TYPE-zFLOPs not found for TPU z with )typetorchr   get_device_namelowersplitr   r}   float32"lightning.fabric.accelerators.cudar   get_float32_matmul_precisionrM   !lightning.fabric.accelerators.xlar   torch_xla._internalr   torch_xla.experimentalget_tpu_envget
isinstancerN   
_TPU_FLOPS)r1   r]   device_namechipnumberextradtype_to_flopsr   r   r   tpu_envr"   r"   r#   rX   	  s   


rX   pluginr   c           
      C   s   ddl m}m}m}m}m}m}m}m}m	}	 t
| |s"td|  t
| |r*| jS t
| ||fr4| jS t
| |r<tjS t
| |	|frF| jS t
| |rNtjS t
| |rZ| jjpYtjS t
| |rbtjS t| )Nr   )	BitsandbytesPrecisionDeepSpeedPrecisionDoublePrecisionFSDPPrecisionHalfPrecisionMixedPrecisionr   TransformerEnginePrecisionXLAPrecisionz!Expected a precision plugin, got )lightning.fabric.pluginsr   r   r   r   r   r   r   r   r   r   r.   r]   _desired_input_dtyper   double_desired_dtypeint8mixed_precision_configreduce_dtyper   NotImplementedError)
r   r   r   r   r   r   r   r   r   r   r"   r"   r#   rU   _  s$   ,





rU   T)boundc                       sp   e Zd ZdZdeddf fddZedee fddZ	e
d	eddfd
dZe
dededdfddZ  ZS )r   zjCustom fixed size list that only supports right-append and ensures that all values increase monotonically.r   r   Nc                    s   t    || _d S rD   )rY   r$   r   )r!   r   r^   r"   r#   r$     s   

z_MonotonicWindow.__init__c                 C   s   t | dkr
| d S d S )Nr   r0   )r-   rF   r"   r"   r#   last  s   z_MonotonicWindow.lastxc                 C   sR   | j }|d ur||krtd| d| t| | t| | jkr'| d= d S d S )Nz&Expected the value to increase, last: z, current: r   )r   r,   listr+   r-   r   )r!   r   r   r"   r"   r#   r+     s   
z_MonotonicWindow.appendkeyvaluec                 C   s   t d)Nz__setitem__ is not supported)r   )r!   r   r   r"   r"   r#   __setitem__  s   z_MonotonicWindow.__setitem__)rH   rI   rJ   rK   rM   r$   propertyr	   r   r   r   r+   r   r   ra   r"   r"   r^   r#   r     s    	 r   rD   ).collectionsr   typingr   r   r   r   r   r   r	   r
   r   r   typing_extensionsr   $lightning.fabric.utilities.rank_zeror   r   lightning.fabricr   r   r   rN   rM   rL   rO   r   rP   nnModuleTensorrk   float64r   bfloat16float16r   r}   r]   __annotations__r   r1   rX   rU   r   r   r"   r"   r"   r#   <module>   s  , 77
'%-5@HPYaiqy     	 * O&V!