o
    oi                     @  sv   d Z ddlmZ ddlZddlmZ ddlZddlmZ ddl	m
Z
 edZddddZ	ddddZdddZdS )z*Utilities that can be used with Deepspeed.    )annotationsN)Any)_PATH)_DEEPSPEED_AVAILABLEcpucheckpoint_dirr   tag
str | Nonereturnstrc                 C  s   |d u r3t j| d}t j|r,t|}|  }W d    n1 s&w   Y  ntd| t j| |}t j|sHt	dt
 d|S )Nlatestz Unable to find 'latest' file at zDirectory 'z' doesn't exist)ospathjoinisfileopenreadstrip
ValueErrorisdirFileNotFoundErrords_checkpoint_dir)r   r   latest_pathfd	directory r   Y/home/ubuntu/.local/lib/python3.10/site-packages/lightning/pytorch/utilities/deepspeed.pyr      s   
r   output_filedict[str, Any]c                   s   t sttt ddlm}m}m} || |g d t| } || }tj	|d t
d}|d d }|| |}	tj	|	t
d}
 fdd|
 D }
fd	dD |
d
< td|  t|
| |
S )a  Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be loaded with
    ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. It gets copied into the top
    level checkpoint dir, so the user can easily do the conversion at any point in the future. Once extracted, the
    weights don't require DeepSpeed and can be used in any application. Additionally the script has been modified to
    ensure we keep the lightning state inside the state dict for being able to run
    ``LightningModule.load_from_checkpoint('...')```.

    Args:
        checkpoint_dir: path to the desired checkpoint folder.
            (one that contains the tag-folder, like ``global_step14``)
        output_file: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        tag: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt
            to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Examples::

        # Lightning deepspeed has saved a directory instead of a file
        convert_zero_checkpoint_to_fp32_state_dict(
            "lightning_logs/version_0/checkpoints/epoch=0-step=0.ckpt/",
            "lightning_model.pt"
        )

    r   )(get_fp32_state_dict_from_zero_checkpointget_model_state_fileget_optim_files)module	optimizerlr_schedulercsr_tensor_module_namesskipped_stepsglobal_stepsdp_world_sizemp_world_size)map_locationoptimizer_state_dict
zero_stagec                   s   i | ]\}}| vr||qS r   r   ).0keyvalue)deepspeed_statesr   r   
<dictcomp>d   s    z>convert_zero_checkpoint_to_fp32_state_dict.<locals>.<dictcomp>c                   s   i | ]
}t |d  | qS )z_forward_module.)_remove_prefix)r-   k)
state_dictr   r   r1   g   s    r4   zSaving fp32 state dict to )r   ModuleNotFoundErrorr   deepspeed.utils.zero_to_fp32r   r    r!   r   torchload
CPU_DEVICEitemsprintsave)r   r   r   r   r    r!   optim_filesoptim_stater,   
model_fileclient_stater   )r0   r4   r   *convert_zero_checkpoint_to_fp32_state_dict.   s"   


rA   r.   prefixc                 C  s   |  |r| t|d  S | S N)
startswithlen)r.   rB   r   r   r   r2   p   s   r2   rC   )r   r   r   r	   r
   r   )r   r   r   r   r   r	   r
   r   )r.   r   rB   r   r
   r   )__doc__
__future__r   r   typingr   r7    lightning.fabric.utilities.typesr   &lightning.pytorch.strategies.deepspeedr   devicer9   r   rA   r2   r   r   r   r   <module>   s   
B