"""
Support for PyTorch lightning. You should just replace the call
to `Trainer(...)` with `get_trainer(...)`.
For using `dora.log.LogProgress` as a progress bar with PL, see `PLLogProgress`.
"""
import functools
import inspect
import os
import typing as tp

from pytorch_lightning import LightningModule
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.callbacks.progress import ProgressBarBase
from pytorch_lightning.plugins.environments import ClusterEnvironment
from pytorch_lightning.trainer import Trainer
from pytorch_lightning.utilities.argparse import from_argparse_args
import torch

from . import distrib
from .xp import get_xp, is_xp
from .log import bold, LogProgress


def _filter_metrics(metrics: tp.Dict[str, tp.Any], epoch: bool = True) -> tp.Dict[str, tp.Any]:
    """Filters metrics before formatting, in particular to remove the `_step` or `_epoch`
    suffix. This will also convert torch tensors to float.

    Args:
        metrics: dict given by PL.
        epoch: if True, keep only epoch level metrics, otherwise, keep only step level metrics.
    """
    out = {}
    for key, value in metrics.items():
        if epoch and key.endswith('_step'):
            continue
        if not epoch and key.endswith('_epoch'):
            continue
        if key.endswith('_step') or key.endswith('_epoch'):
            key = key.rsplit('_', 1)[0]
        if isinstance(value, torch.Tensor) and value.numel() == 1:
            value = value.item()
        out[key] = value
    return out
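
# For illustration (hypothetical values): with the default `epoch=True`,
# {'loss_step': tensor(0.5), 'loss_epoch': tensor(0.4)} is filtered down to
# {'loss': 0.4}, while `epoch=False` would instead keep {'loss': 0.5}.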


class DoraEnvironment(ClusterEnvironment):
    """Cluster environment exposing the distributed world that Dora has already set up."""

    def __init__(self):
        super().__init__()
        self.spec = distrib.get_distrib_spec()
        distrib.set_distrib_env()

    def creates_children(self) -> bool:
        return True

    @property
    def creates_processes_externally(self) -> bool:
        return True

    def world_size(self) -> int:
        return self.spec.world_size

    def set_world_size(self, size: int) -> None:
        pass

    def global_rank(self) -> int:
        return self.spec.rank

    def set_global_rank(self, rank: int) -> None:
        pass

    def local_rank(self) -> int:
        return self.spec.local_rank

    def node_rank(self) -> int:
        return self.spec.node_rank

    @staticmethod
    def detect() -> bool:
        return False

    @property
    def main_address(self) -> str:
        return os.environ['MAIN_ADDR']

    @property
    def main_port(self) -> int:
        return int(os.environ['MAIN_PORT'])


class DoraCheckpointSync(Callback):
    """Make sure Dora history and checkpoint state are in sync."""

    def __init__(self):
        self.xp = get_xp()

    def on_load_checkpoint(self, trainer, pl_module, checkpoint):
        history = checkpoint['dora_link_history']
        self.xp.link.update_history(history)

    def on_save_checkpoint(self, trainer, pl_module, checkpoint):
        checkpoint['dora_link_history'] = self.xp.link.history
        checkpoint['dora_sig'] = self.xp.sig
        checkpoint['dora_cfg'] = self.xp.cfg
        return checkpoint


class DoraHistoryLogger(Callback):
    """Save metrics to Dora using the XP link."""

    def __init__(self):
        super().__init__()
        self.link = get_xp().link

    def on_fit_start(self, trainer, pl_module):
        # The first validation can be PL's sanity check, running before any
        # training epoch; its metrics should not be pushed to the history.
        self._first_valid = True

    def on_train_epoch_start(self, trainer, pl_module):
        self._first_valid = False

    def on_epoch_end(self, trainer, pl_module):
        if self._first_valid:
            return
        metrics = trainer.logged_metrics
        metrics = _filter_metrics(metrics, epoch=True)
        self.link.push_metrics(metrics)


class _DummySLURMConnector:
    # Disables PL's SLURM signal handling, see `no_unfinished_epochs` in `get_trainer`.
    def register_slurm_signal_handlers(self):
        pass


def get_trainer(*args, auto_resume: bool = True, add_dora_logger: bool = True,
                no_unfinished_epochs: bool = True, **kwargs):
    """Return a PL trainer, adding the necessary glue code to make everything work.
    The arguments are exactly the same as for `pytorch_lightning.trainer.Trainer`,
    with a few extras documented after.

    .. note:: You should not pass `gpus=` or `num_nodes=` arguments as those will be filled by Dora.

    Args:
        auto_resume (bool): if True, automatically resume previous checkpoints.
            You are still responsible for creating the `ModelCheckpoint` callback,
            this only handles the `resume_from_checkpoint` part.
        add_dora_logger (bool): if True, adds a Dora logger to automatically
            forward the metrics (those logged with `per_epoch=True`), otherwise
            pushing metrics will be up to you.
        no_unfinished_epochs (bool): if True, deactivates SLURM signal handling
            by PL, which can result in half-finished epochs with each interruption.
            It is recommended to instead dump a checkpoint every epoch and resume
            from that one so that training is reliable.
    """
    if not is_xp():
        raise RuntimeError("This can only be called from inside a Dora XP.")
    # Normalize all arguments into a single kwargs dict matching `Trainer.__init__`.
    init = Trainer.__init__
    if hasattr(init, '__wrapped__'):
        init = init.__wrapped__
    kwargs = inspect.getcallargs(init, *([None] + list(args)), **kwargs)
    del kwargs['self']
    plugins = kwargs.pop('plugins') or []
    env = DoraEnvironment()
    gpus = min(torch.cuda.device_count(), env.world_size())
    if env.world_size() > 1:
        plugins += [env, 'ddp']
    kwargs['plugins'] = plugins
    callbacks = kwargs.pop('callbacks', None) or []
    callbacks.append(DoraCheckpointSync())
    kwargs['callbacks'] = callbacks
    if kwargs['gpus'] is not None:
        raise RuntimeError("You cannot specify the number of GPUs, as this is provided by Dora.")
    if kwargs['num_nodes'] != 1:
        raise RuntimeError("You cannot specify the number of nodes, as this is provided by Dora.")
    kwargs['gpus'] = gpus
    kwargs['num_nodes'] = env.spec.num_nodes
    kwargs['default_root_dir'] = get_xp().folder
    if add_dora_logger:
        callbacks.append(DoraHistoryLogger())
    if auto_resume and kwargs.get('resume_from_checkpoint') is None:
        last = get_xp().folder / 'last.ckpt'
        if last.is_file():
            resume = str(last)
        else:
            resume = None
        kwargs['resume_from_checkpoint'] = resume
    trainer = Trainer(**kwargs)
    if no_unfinished_epochs:
        # Replace PL's SLURM connector so interruptions don't leave half-finished epochs behind.
        trainer.slurm_connector = _DummySLURMConnector()
    return trainer
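
# A minimal usage sketch (not part of the library): assuming an experiment
# entry point created with `dora.hydra_main` (or `dora.argparse_main`) and a
# hypothetical LightningModule `MyModule`, `get_trainer` is a drop-in
# replacement for the usual `Trainer(...)` call:
#
#     @hydra_main(config_path="conf", config_name="config")
#     def main(cfg):
#         model = MyModule(cfg)
#         trainer = get_trainer(max_epochs=cfg.epochs)
#         trainer.fit(model)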


class _Intercept:
    # Mimics the signature of `Trainer.__init__` so that PL argparse utilities
    # can introspect it, while merely recording the arguments.
    @functools.wraps(Trainer.__init__)
    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs


def trainer_from_argparse_args(args, **kwargs):
    """Equivalent of `Trainer.from_argparse_args`, but going through `get_trainer`."""
    intercept = from_argparse_args(_Intercept, args, **kwargs)
    return get_trainer(*intercept.args, **intercept.kwargs)
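
# For illustration (hypothetical script): this mirrors the usual
# `Trainer.from_argparse_args` workflow, assuming PL's `add_argparse_args`:
#
#     parser = argparse.ArgumentParser()
#     parser = Trainer.add_argparse_args(parser)
#     cli_args = parser.parse_args()
#     trainer = trainer_from_argparse_args(cli_args, auto_resume=True)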
Zede	fddZ
	d1dejeejf dedefddZedd Zdd Z fddZ fddZdd Z fddZ fddZd d! Zd"d# Z fd$d%Z fd&d'Z fd(d)Zd*d+ Zd,d- Zd.d/ Z  ZS )2PLLogProgressz<`dora.log.LogProgress` support for Pytorch-Lightning.


    r,   Nc                    s    t    || _|| _d | _d S r"   )r#   r$   loggerr   
_pl_module)r)   r   r   r*   r   r   r$      s   

zPLLogProgress.__init__stagec                    s    t  ||| || _g | _d S r"   )r#   setupr   _replay_history)r)   rT   rU   r   r*   r   r   r      s   
zPLLogProgress.setupc                    s   t  || d| _d| _d S )NFT)r#   rf   	_in_trainrd   re   r*   r   r   rf      s   
zPLLogProgress.on_fit_startc                 C   s   | j d usJ | j S r"   )r   r(   r   r   r   rU      s   zPLLogProgress.pl_moduleFr   r   c                 C   s2   i }|  D ]\}}t|trt|d||< q|S )ag  Default method to format metrics for displaying in the progress bar.
        To customize, you can define a `format_metrics()` method on your
        Lightning module.

        Args:
            metrics: dict of metrics given by PL.
            stage: "train" or "valid".
            epoch: if True, provided metrics are for the end of epoch summary.
        z.5f)r   r   floatformat)r)   r   r   r   r   r   r   r   r   r   format_metrics   s   
zPLLogProgress.format_metricsc                 C   s   t | jd| jS )Nr   )getattrrU   r   r(   r   r   r   _format_metrics  s   zPLLogProgress._format_metricsc                 C   s   | j d | j |dkrdnd | d| jjd   }|dkr)t| j}n|dkr3t| j}ntd| t	|}t
| j |f||d	| j| _t| j d S )
NzF----------------------------------------------------------------------trainzTraining...zValidating...z	 | Epoch r   validzInvalid stage )totalname)r   info
capitalizerT   current_epochrE   total_train_batchestotal_val_batchesrz   ranger   r   logprogiter)r)   r   r   r   loaderr   r   r   _on_epoch_start  s   zPLLogProgress._on_epoch_startc                    s$   |  d d| _d| _t ||S )Nr   TF)r   r   rd   r#   rh   re   r*   r   r   rh     s   
z"PLLogProgress.on_train_epoch_startc                    s   |  d t ||S Nr   )r   r#   on_validation_epoch_startre   r*   r   r   r   $  ra   z'PLLogProgress.on_validation_epoch_startc                 C   sL   |  | j| j}t|dd}| j||dd}| jjdi | t| j d S )NFri   r   )get_metricsrT   rU   r    r   r   updatenext)r)   r   r   	formattedr   r   r   _on_batch_end(  s
   zPLLogProgress._on_batch_endc                        t  j|i | | d d S )Nr   )r#   on_train_batch_endr   r   r*   r   r   r   /     z PLLogProgress.on_train_batch_endc                    r   r   )r#   on_validation_batch_endr   r   r*   r   r   r   3  r   z%PLLogProgress.on_validation_batch_endc                 C   sZ   |dkr| j jjjdd }n| j jjjjdd }t|dd}| || j j| d S )Nr   Flogri   )	rT   fit_loop
epoch_loop_resultsr   val_loopr    _show_epoch_summaryr   )r)   r   r   r   r   r   _on_stage_end7  s
   zPLLogProgress._on_stage_endc                 C   sj   | j |||f | j||dd}| }ddd | D }| jt| d|d  d|  d S )NTri   z | c                 s   s&    | ]\}}|   d | V  qdS )=N)r   ).0r   valr   r   r   	<genexpr>D  s    
z4PLLogProgress._show_epoch_summary.<locals>.<genexpr>z Summary | End of Epoch r   )	r   r   r   r   joinr   r   r   r   )r)   r   r   r   r   r   summaryr   r   r   r   @  s   
(z!PLLogProgress._show_epoch_summaryc                    s<   t  || | js| jsJ | js| d d| _d S d S Nr   F)r#   on_train_endr   rd   r   re   r*   r   r   on_validation_startI  s   

z!PLLogProgress.on_validation_startc                    s(   t  || | jr| d d| _d S r   )r#   rl   r   r   re   r*   r   r   rl   P  s   

zPLLogProgress.on_epoch_endc                    s   t  || | d d S r   )r#   on_validation_endr   re   r*   r   r   r   V  s   zPLLogProgress.on_validation_endc                 C   r7   r"   r   r(   r   r   r   disableZ  s   zPLLogProgress.disablec                 C   s4   | dg }|r| jd |D ]}| j|  qd S )Ndora_replay_historyzReplaying past metrics...)r   r   r   r   )r)   rT   rU   rV   replay_historystepr   r   r   rX   _  s   z PLLogProgress.on_load_checkpointc                 C   s   | j |d< |S )Nr   )r   r]   r   r   r   r^   f  s   
z PLLogProgress.on_save_checkpoint)r,   N)F) rG   rH   rI   r_   r$   rM   r   rf   rK   r   rU   tpDictAnyrJ   r   r   r   rh   r   r   r   r   r   r   r   rl   r   r   rX   r^   rN   r   r   r*   r   r      s:    

		r   )T)(r_   r   r|   rA   typingr   pytorch_lightningr   pytorch_lightning.callbacksr   $pytorch_lightning.callbacks.progressr   &pytorch_lightning.plugins.environmentsr   pytorch_lightning.trainerr   $pytorch_lightning.utilities.argparser   r    r	   rP   r
   r   r   r   r   r   rM   r   rJ   r    r!   rO   r`   rm   r   r   r   r   r   r   r   r   <module>   s0    ,F
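
# A minimal usage sketch (not part of the library): hooking `PLLogProgress`
# into the trainer returned by `get_trainer`. The `logging` setup and the
# `updates=10` value (forwarded to `dora.log.LogProgress` to control how many
# progress lines are printed per epoch) are illustrative assumptions:
#
#     import logging
#     logger = logging.getLogger(__name__)
#     progress = PLLogProgress(logger, updates=10)
#     trainer = get_trainer(max_epochs=10, callbacks=[progress])
#     trainer.fit(model)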