o
    8wik                     @   s  U d dl mZ d dlmZmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZmZ er7d dlmZ d dlmZ eeeeef f ZG dd	 d	ZG d
d deZ	dde	jjdeg e	jf deee	jge	jf  defddZi de	jde	jddde	j de	j!de	j"dide	jde	jddde	j de	j!de	j"dide	jde	jd dd!e	j d"e	j!d"e	j"d#id$e	jd%dd%e	j d%e	j!d%e	j"d&d'd(id)e	jd*dd*e	j d*e	j!d*e	j"d+d'd,id-e	jd.dd.e	j d.e	j!d.e	j"d/d'd0id1e	jd2dd3e	j d4e	j!d4e	j"d5d'd6id7e	jd8dd8e	j d9e	j!d9e	j"d:d'd;id<e	jd=e	jd>dd?e	j d@e	j!d@e	j"dAidBe	jdCddDe	j dCe	j!dCe	j"dEd'dFidGe	jdHddIe	j dHe	j!dHe	j"dJd'dKidLe	jdMddNe	j dOe	j!dOe	j"dPd'dQidRe	jdSddSe	j dSe	j!dSe	j"dTd'dUidVe	jdWddWe	j dWe	j!dWe	j"dXd'dYidZe	jd[dd[e	j d[e	j!d[e	j"d\d'd]id^e	jd_dd_e	j d_e	j!d_e	j"d`d'daidbe	jdcddce	j dce	j!dce	j"ddd'deie	jdfe	j!dge	j"dhd'diie	jdje	j!dkie	jdje	j!dle	j"dmd'dnie	jdoe	j!dpe	j"dqd'drie	jdse	j!dte	j"dud'dvie	jdwe	j!dxe	j"dyd'dzie	jd{e	j!d|e	j"d}d'd~ie	jde	jde	j!dOie	jde	jde	j!die	jde	jde	j!dhid
Z#eeeeee	j$f ef f e%d< dddddZ&de	j'dee	j$ef dee fddZ(ddde	j$fddZ)ededZ*G dd de+e* Z,dS )    )deque)TYPE_CHECKINGAnyCallableOptionalTypeVarUnionN)override)rank_zero_onlyrank_zero_warn)Fabric)	Precisionc                   @   s   e Zd ZdZ	ddee deded	ed
df
ddZddddedededee dee d
dfddZ	d
e
fddZdddZdS )
Throughputa  Computes throughput.

    +------------------------+-------------------------------------------------------------------------------------+
    | Key                    | Value                                                                               |
    +========================+=====================================================================================+
    | batches_per_sec        | Rolling average (over ``window_size`` most recent updates) of the number of batches |
    |                        | processed per second                                                                |
    +--------------------------+-----------------------------------------------------------------------------------+
    | samples_per_sec        | Rolling average (over ``window_size`` most recent updates) of the number of samples |
    |                        | processed per second                                                                |
    +--------------------------+-----------------------------------------------------------------------------------+
    | items_per_sec          | Rolling average (over ``window_size`` most recent updates) of the number of items   |
    |                        | processed per second                                                                |
    +--------------------------+-----------------------------------------------------------------------------------+
    | flpps_per_sec          | Rolling average (over ``window_size`` most recent updates) of the number of flops   |
    |                        | processed per second                                                                |
    +--------------------------+-----------------------------------------------------------------------------------+
    | device/batches_per_sec | batches_per_sec divided by world size                                               |
    +--------------------------+-----------------------------------------------------------------------------------+
    | device/samples_per_sec | samples_per_sec divided by world size                                               |
    +--------------------------+-----------------------------------------------------------------------------------+
    | device/items_per_sec   | items_per_sec divided by world size. This may include padding depending on the data |
    +--------------------------+-----------------------------------------------------------------------------------+
    | device/flops_per_sec   | flops_per_sec divided by world size.                                                |
    +--------------------------+-----------------------------------------------------------------------------------+
    | device/mfu             | device/flops_per_sec divided by world size.                                         |
    +--------------------------+-----------------------------------------------------------------------------------+
    | time                   | Total elapsed time                                                                  |
    +--------------------------+-----------------------------------------------------------------------------------+
    | batches                | Total batches seen                                                                  |
    +--------------------------+-----------------------------------------------------------------------------------+
    | samples                | Total samples seen                                                                  |
    +--------------------------+-----------------------------------------------------------------------------------+
    | lengths                | Total items seen                                                                    |
    +--------------------------+-----------------------------------------------------------------------------------+

    Example::

        throughput = Throughput()
        t0 = time()
        for i in range(1000):
            do_work()
            if torch.cuda.is_available(): torch.cuda.synchronize()  # required or else time() won't be correct
            throughput.update(time=time() - t0, samples=i)
            if i % 10 == 0:
                print(throughput.compute())

    Notes:
        - The implementation assumes that devices FLOPs are all the same as it normalizes by the world size and only
          takes a single ``available_flops`` value.
        - items_per_sec, flops_per_sec and MFU do not account for padding if present. We suggest using
          samples_per_sec or batches_per_sec to measure throughput under this circumstance.

    Args:
        available_flops: Number of theoretical flops available for a single device.
        world_size: Number of devices available across hosts. Global metrics are not included if the world size is 1.
        window_size: Number of batches to use for a rolling average.
        separator: Key separator to use when creating per-device and global metrics.

    N   d   /available_flops
world_sizewindow_size	separatorreturnc                 C   sj   || _ || _|dksJ || _|dksJ t|d| _t|d| _t|d| _t|d| _t|d| _	d S )Nr   r   )maxlen)
r   r   r   _MonotonicWindow_time_batches_samples_lengthsr   _flops)selfr   r   r   r    r   b/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lightning_fabric/utilities/throughput.py__init__^   s   zThroughput.__init__)lengthsflopstimebatchessamplesr"   r#   c                C   s   | j | ||k rtd| d| d| j| | j| |durU||k r4td| d| d| j| t| jt| jkrUtdt| j dt| j d|durd| j|| j	  dS dS )	ad  Update throughput metrics.

        Args:
            time: Total elapsed time in seconds. It should monotonically increase by the iteration time with each
                call.
            batches: Total batches seen per device. It should monotonically increase with each call.
            samples: Total samples seen per device. It should monotonically increase by the batch size with each call.
            lengths: Total length of the samples seen. It should monotonically increase by the lengths of a batch with
                each call.
            flops: Flops elapased per device since last ``update()`` call. You can easily compute this by using
                :func:`measure_flops` and multiplying it by the number of batches that have been processed.
                The value might be different in each device if the batch size is not the same.

        zExpected samples (z') to be greater or equal than batches ()NzExpected lengths (z') to be greater or equal than samples (zIf lengths are passed (z1), there needs to be the same number of samples ()
r   append
ValueErrorr   r   r   lenRuntimeErrorr   r   )r   r$   r%   r&   r"   r#   r   r   r    updateq   s$   zThroughput.updatec                 C   s  | j d | jd | jd d}| jr| jd |d< | jdk}t| j | j jkr| j d | j d  }| jd | jd  }| jd | jd  }|| }|| }|d| j d|| d| j d|i |rt|| j }|||| j d	 t| j| jjkr| jd | jd  }	|	| }
|
|d| j d
< |r|
| j }||d
< t| j	| j	jkrt
| j	| j	d  }| j d | j d  }|| }|| j }|r||d< ||d| j d< | jr|| j |d| j d< |S )zCompute throughput metrics.)r$   r%   r&   r"   r   r   devicebatches_per_secsamples_per_sec)r/   r0   items_per_secflops_per_secmfu)r   r   r   r   r   r*   r   r,   r   r   sumr   )r   metricsadd_global_metricselapsed_timeelapsed_batcheselapsed_samplesdev_samples_per_secdev_batches_per_secr0   elapsed_lengthsdev_items_per_secr1   elapsed_flopsr2   dev_flops_per_secr   r   r    compute   sR   



zThroughput.computec                 C   s6   | j   | j  | j  | j  | j  d S N)r   clearr   r   r   r   r   r   r   r    reset   s
   



zThroughput.reset)Nr   r   r   )r   N)__name__
__module____qualname____doc__r   floatintstrr!   r,   _THROUGHPUT_METRICSr@   rD   r   r   r   r    r       s>    >

)4r   c                       sJ   e Zd ZdZdddeddf fddZdd	ee dedefd
dZ	  Z
S )ThroughputMonitora  Computes throughput.

    This class will automatically keep a count of the number of log calls (``step``). But that can be modified as
    desired. For manual logging, using :class:`Throughput` directly might be desired.

    Example::

        logger = ...
        fabric = Fabric(logger=logger)
        throughput = ThroughputMonitor(fabric)
        t0 = time()
        for i in range(1, 100):
            do_work()
            if torch.cuda.is_available(): torch.cuda.synchronize()  # required or else time() won't be correct
            throughput.update(time=time() - t0, batches=i, samples=i)
            if i % 10 == 0:
                throughput.compute_and_log(step=i)

    Args:
        fabric: The Fabric object.
        \**kwargs: See available parameters in :class:`Throughput`

    fabricr   kwargsr   Nc                    s   |   t|jj}t|j|}t jd||jd| || _	d| _
t| j| _t| ji d| _t| ji d| _t| j| _d S )N)r   r   r-   )defaultr   )_validate_launched_plugin_to_compute_dtypestrategy	precisionget_available_flopsr.   superr!   r   _fabricstepr
   r,   r@   compute_and_logrD   )r   rN   rO   dtyper   	__class__r   r    r!      s   zThroughputMonitor.__init__rX   c                 K   s>   |du r	| j d n|| _ | jdi |}| jj|| j d |S )zSee :meth:`Throughput.compute`

        Args:
            step: Can be used to override the logging step.
            \**kwargs: See available parameters in :meth:`Throughput.compute`

        Nr   )r5   rX   r   )rX   r@   rW   log_dict)r   rX   rO   r5   r   r   r    rY      s   z!ThroughputMonitor.compute_and_logrA   )rE   rF   rG   rH   r   r!   r   rJ   rL   rY   __classcell__r   r   r[   r    rM      s    $rM   model
forward_fnloss_fnr   c                 C   sz   ddl m} |dd}|$ |du r|  n||   W d   | S W d   | S 1 s4w   Y  | S )a-  Utility to compute the total number of FLOPs used by a module during training or during inference.

    It's recommended to create a meta-device model for this:

    Example::

        with torch.device("meta"):
            model = MyModel()
            x = torch.randn(2, 32)

        model_fwd = lambda: model(x)
        fwd_flops = measure_flops(model, model_fwd)

        model_loss = lambda y: y.sum()
        fwd_and_bwd_flops = measure_flops(model, model_fwd, model_loss)

    Args:
        model: The model whose FLOPs should be measured.
        forward_fn: A function that runs ``forward`` on the model and returns the result.
        loss_fn: A function that computes the loss given the ``forward_fn`` output. If provided, the loss and `backward`
            FLOPs will be included in the result.

    r   )FlopCounterModeF)displayN)torch.utils.flop_counterrb   backwardget_total_flops)r_   r`   ra   rb   flop_counterr   r   r    measure_flops
  s   


rh   h100 nvlg  wBg  $^/lBtfloat32g SCg SCg  />2,Ch100 sxmg  wBg  $^/lBg SBg  />2C	h100 pcieg   vHBg   vHBg  |Bg  |Cg @.CCzrtx 4090g  $Bg bCint4g bCzrtx 4080g  m%Bg )Bg )Czrtx 4080 superg  :Bg  :Bg  :Cl4g  ĎBg  WHBg  $ Bg  $ Bg  $ Bl40g  ˓Bg  ˓Bg  ˓Bg  ˓Ca100g  ꤡBg  2Bg  2Bg  2Bg  2Ca6000g  \EBg  \EBg gBg @$ Ca40g  xBg  xBg bcBg Ca10gg  P`Bg  4&kBg  4&kBg  4&kBg  4&kBzrtx 3090 tig  @0Bg  @0Bg  @0Czrtx 3090g  Pb0Bg  q$Bg  q$ Czrtx 3080 tig  cBg  cBg  YBzrtx 3080g  jZBg  .Bg  .Bzrtx 3070g  IvvBg  gH|Bg  gH|Bg  }wBg  Bg  Bg  Bg   h_Bg  HBg  HBg  HBg  HBg  `cԩBg  H`Bg  3Bg  3Bg   xHBg   xHBg   xHBg FBg  Bg  <vBg  /|Bg  /|Bg  pkGBg  pkGBg  *Bg  *Bg  P`Bg  ᎬBg  BwBg  BwBg  BwBg  @YԝBg  @YԭB)
t4quadro rtx 5000zrtx 2080 superzrtx 2080 tizrtx 2080zrtx 2070 super	titan rtxv100 sxm	v100 pcie
v100s pcie_CUDA_FLOPSg  聰vBg  ӊBg  CBg  `teB)v2v3v4	v5litepodr.   rZ   c                 C   sx  | j dkrtj| }| }d|v r,d|v rd}nd|v r!d}nd|v s)d|v r+d	}n~d
|v r9d|v r6dnd
}nqd|v r[|dd }d}d|v rMd}nd|v rSd}d| | }nOd|v rbd}nHd|v rid}nAd|v rpd}n:d|v rwd}n3d|v r~d}n,d|v rd}n%d|v rd}nd|v rd}nd|v rd }nd!|v rd"}n	td#| d$S |tvrtd#|d%| d$S t| }|tju rd&d'l	m
} | rt d(krd)}||vrt|d*|  d$S t|| S | j d+kr:d&d,lm} |rd&d-lm}	 nd&d-lm}	 |	 }
|
d.p|
d/ d0d& }| }t|ts#J |tvr4td1|d2|  d$S tt| S d$S )3zReturns the available theoretical FLOPs.

    This is an optimistic upper limit that could only be achievable if only thick matmuls were run in a benchmark
    environment.

    cudah100hbm3rk   nvlri   pciehbm2erl   rn   teslaro   zgeforce rtx     rV   z supertiz tizrtx rq   rp   rr   rs   rt   ru   rv   zv100-sxmrw   z	v100-pcierx   z
v100s-pciery   zFLOPs not found for Nz
, chip is r   )_is_ampere_or_laterhighestrj   z does not support xla)_XLA_GREATER_EQUAL_2_1)tpuTYPEACCELERATOR_TYPE-zFLOPs not found for TPU z with )typetorchr   get_device_namelowersplitr   rz   float32"lightning_fabric.accelerators.cudar   get_float32_matmul_precisionrJ   !lightning_fabric.accelerators.xlar   torch_xla._internalr   torch_xla.experimentalget_tpu_envget
isinstancerK   
_TPU_FLOPS)r.   rZ   device_namechipnumberextradtype_to_flopsr   r   r   tpu_envr   r   r    rU     s   


rU   pluginr   c           
      C   s   ddl m}m}m}m}m}m}m}m}m	}	 t
| |s"td|  t
| |r*| jS t
| ||fr4| jS t
| |r<tjS t
| |	|frF| jS t
| |rNtjS t
| |rZ| jjpYtjS t
| |rbtjS t| )Nr   )	BitsandbytesPrecisionDeepSpeedPrecisionDoublePrecisionFSDPPrecisionHalfPrecisionMixedPrecisionr   TransformerEnginePrecisionXLAPrecisionz!Expected a precision plugin, got )lightning_fabric.pluginsr   r   r   r   r   r   r   r   r   r   r+   rZ   _desired_input_dtyper   double_desired_dtypeint8mixed_precision_configreduce_dtyper   NotImplementedError)
r   r   r   r   r   r   r   r   r   r   r   r   r    rR   g  s$   ,





rR   T)boundc                       sp   e Zd ZdZdeddf fddZedee fddZ	e
d	eddfd
dZe
dededdfddZ  ZS )r   zjCustom fixed size list that only supports right-append and ensures that all values increase monotonically.r   r   Nc                    s   t    || _d S rA   )rV   r!   r   )r   r   r[   r   r    r!     s   

z_MonotonicWindow.__init__c                 C   s   t | dkr
| d S d S )Nr   r-   )r*   rC   r   r   r    last  s   z_MonotonicWindow.lastxc                 C   sR   | j }|d ur||krtd| d| t| | t| | jkr'| d= d S d S )Nz&Expected the value to increase, last: z, current: r   )r   r)   listr(   r*   r   )r   r   r   r   r   r    r(     s   
z_MonotonicWindow.appendkeyvaluec                 C   s   t d)Nz__setitem__ is not supported)r   )r   r   r   r   r   r    __setitem__  s   z_MonotonicWindow.__setitem__)rE   rF   rG   rH   rJ   r!   propertyr   r   r   r	   r(   r   r   r^   r   r   r[   r    r     s    	 r   rA   )-collectionsr   typingr   r   r   r   r   r   r   typing_extensionsr	   $lightning_fabric.utilities.rank_zeror
   r   lightning_fabricr   r   r   dictrK   rJ   rI   rL   r   rM   nnModuleTensorrh   float64r   bfloat16float16r   rz   rZ   __annotations__r   r.   rU   rR   r   r   r   r   r   r   r    <module>   s    77
'%-5=HPXaiqy   
  	 * W&V!