o
    Ti!2                     @   sl   d dl Z d dlZd dlZd dlZd dlZddlmZ zd dlZW n	 e	y)   Y nw da
G dd deZdS )    N   )DeepSpeedAcceleratorc                   @   s  e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd ZdddZ	dddZ
dd Zdd Zdd Zdd ZdddZdd ZdddZdd d!Zd"d# Zd$d% Zd&d' Zd(d) Zed*d+ Zd,d- Zdd.d/Zdd0d1Zed2d3 Zd4d5 Zdd6d7Zdd8d9Zdd:d;Z dd<d=Z!dd>d?Z"dd@dAZ#ddBdCZ$ddDdEZ%ddFdGZ&ddHdIZ'ddJdKZ(dLdM Z)ddNdOZ*dPdQ Z+dRdS Z,dTdU Z-dVdW Z.dXdY Z/dZd[ Z0d\d] Z1d^d_ Z2d`da Z3dbdc Z4ddde Z5ddfdgZ6dhdi Z7edjdk Z8edldm Z9edndo Z:edpdq Z;edrds Z<edtdu Z=edvdw Z>ddydzZ?d{d| Z@d}d~ ZAdd ZBdZCdd ZDdd ZEdd ZFdd ZGdd ZHdd ZIdd ZJdd ZKdd ZLdS )CUDA_Acceleratorc                 C   s8   d| _ tjdkr
dnd| _d| _td u r|   d S d S )Ncudawin32ncclglooinductor)_namesysplatform_communication_backend_name_compile_backendpynvml_init_pynvmlself r   Z/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/accelerator/cuda_accelerator.py__init__   s   zCUDA_Accelerator.__init__c                 C   sL   zdd l a W n
 ty   Y d S w zt   W d S  t jy%   d a Y d S w )Nr   )r   ImportErrornvmlInit	NVMLErrorr   r   r   r   r   !   s   zCUDA_Accelerator._init_pynvmlc                 C   s   dS )NFr   r   r   r   r   is_synchronized_device-   s   z'CUDA_Accelerator.is_synchronized_devicec                 C      |   S Nr   r   r   r   r   use_host_timers0      z CUDA_Accelerator.use_host_timersc                 C   r   r   r   r   r   r   r   resolves_data_dependency3   r   z)CUDA_Accelerator.resolves_data_dependencyc                 C   r   r   r   r   r   r   r   handles_memory_backpressure6   r   z,CUDA_Accelerator.handles_memory_backpressureNc                 C   s   |d u rdS d |S )Nr   cuda:{})formatr   device_indexr   r   r   device_name:   s   
zCUDA_Accelerator.device_namec                 C      t j|S r   )torchr   devicer#   r   r   r   r(   ?      zCUDA_Accelerator.devicec                 C   s   t j| d S r   )r'   r   
set_devicer#   r   r   r   r*   B      zCUDA_Accelerator.set_devicec                 C   
   t j S r   )r'   r   current_devicer   r   r   r   r-   E      
zCUDA_Accelerator.current_devicec                 C   s   d tj S )Nr!   )r"   r'   r   r-   r   r   r   r   current_device_nameH   r+   z$CUDA_Accelerator.current_device_namec                 C   r,   r   )r'   r   device_countr   r   r   r   r0   K   r.   zCUDA_Accelerator.device_countc                 C   r&   r   )r'   r   synchronizer#   r   r   r   r1   N   r)   zCUDA_Accelerator.synchronizec                 C   s   t jS r   )r'   randomr   r   r   r   r2   R      zCUDA_Accelerator.randomc                 C   s"   |d u r
t j|S t j||S r   )r'   r   set_rng_state)r   	new_stater$   r   r   r   r4   U   s   zCUDA_Accelerator.set_rng_statec                 C   s   |d u r	t j S t j|S r   )r'   r   get_rng_stater#   r   r   r   r6   [   s   
zCUDA_Accelerator.get_rng_statec                 C   r&   r   )r'   r   manual_seedr   seedr   r   r   r7   a   r)   zCUDA_Accelerator.manual_seedc                 C   r&   r   )r'   r   manual_seed_allr8   r   r   r   r:   d   r)   z CUDA_Accelerator.manual_seed_allc                 C   r,   r   )r'   r   initial_seedr   r   r   r   r;   g   r.   zCUDA_Accelerator.initial_seedc                 C   s   t jj| S r   )r'   r   default_generatorsr#   r   r   r   default_generatorj   r)   z"CUDA_Accelerator.default_generatorc                 C      t jjS r   )r'   r   Streamr   r   r   r   r?   n      zCUDA_Accelerator.Streamc                 C   r&   r   )r'   r   stream)r   rA   r   r   r   rA   r   r)   zCUDA_Accelerator.streamc                 C   r&   r   )r'   r   current_streamr#   r   r   r   rB   u   r)   zCUDA_Accelerator.current_streamc                 C   r&   r   )r'   r   default_streamr#   r   r   r   rC   x   r)   zCUDA_Accelerator.default_streamc                 C   r>   r   )r'   r   Eventr   r   r   r   rD   {   r@   zCUDA_Accelerator.Eventc                 C   r,   r   )r'   r   empty_cacher   r   r   r   rE      r.   zCUDA_Accelerator.empty_cachec                 C   r&   r   )r'   r   memory_allocatedr#   r   r   r   rF      r)   z!CUDA_Accelerator.memory_allocatedc                 C   r&   r   )r'   r   max_memory_allocatedr#   r   r   r   rG      r)   z%CUDA_Accelerator.max_memory_allocatedc                 C   r&   r   )r'   r   reset_max_memory_allocatedr#   r   r   r   rH      r)   z+CUDA_Accelerator.reset_max_memory_allocatedc                 C   r&   r   )r'   r   memory_cachedr#   r   r   r   rI      r)   zCUDA_Accelerator.memory_cachedc                 C   r&   r   )r'   r   max_memory_cachedr#   r   r   r   rJ      r)   z"CUDA_Accelerator.max_memory_cachedc                 C   r&   r   )r'   r   reset_max_memory_cachedr#   r   r   r   rK      r)   z(CUDA_Accelerator.reset_max_memory_cachedc                 C      t tjdrtj|S d S )Nmemory_stats)hasattrr'   r   rM   r#   r   r   r   rM         zCUDA_Accelerator.memory_statsc                 C   rL   )Nreset_peak_memory_stats)rN   r'   r   rP   r#   r   r   r   rP      rO   z(CUDA_Accelerator.reset_peak_memory_statsc                 C   rL   )Nmemory_reserved)rN   r'   r   rQ   r#   r   r   r   rQ      rO   z CUDA_Accelerator.memory_reservedc                 C   rL   )Nmax_memory_reserved)rN   r'   r   rR   r#   r   r   r   rR      rO   z$CUDA_Accelerator.max_memory_reservedc                 C   s   t j|jS r   )r'   r   get_device_propertiestotal_memoryr#   r   r   r   rT      s   zCUDA_Accelerator.total_memoryc                 C   s4   dt jv rtttt jddd}|| S |S )z
        credit: https://discuss.pytorch.org/t/making-pynvml-match-torch-device-ids-cuda-visible-devices/103020

        Remap torch device id to nvml device id, respecting CUDA_VISIBLE_DEVICES.

        If the latter isn't set return the same id
        CUDA_VISIBLE_DEVICES ,)osenvironlistmapintgetsplit)r   torch_gpu_ididsr   r   r   _get_nvml_gpu_id   s   
	z!CUDA_Accelerator._get_nvml_gpu_idc                 C   sH   t r|d u r
|  }t | |}t |}|jS | || | S r   )r   r-   nvmlDeviceGetHandleByIndexra   nvmlDeviceGetMemoryInfofreerT   rF   )r   r$   handleinfor   r   r   available_memory   s   
z!CUDA_Accelerator.available_memoryc                 C   s   t j sdS t j S )NT)r'   r   is_availableis_bf16_supportedr   r   r   r   ri      s   

z"CUDA_Accelerator.is_bf16_supportedc                 C   sN   t j sdS tjdddk}t j \}}|dkrdS |dkr%|r%dS dS )NTDS_ALLOW_DEPRECATED_FP1601      F)r'   r   rh   rX   rY   r]   get_device_capability)r   allow_deprecated_fp16major_r   r   r   is_fp16_supported   s   
z"CUDA_Accelerator.is_fp16_supportedc                 C   s4   t jg}|  r|t j |  r|t j |S r   )r'   floatrs   appendhalfri   bfloat16)r   supported_dtypesr   r   r   rx      s   z!CUDA_Accelerator.supported_dtypesc                 C   s   t tjdr
tjjS d S )Namp)rN   r'   r   ry   r   r   r   r   ry      s   zCUDA_Accelerator.ampc                 C   r,   r   )r'   r   rh   r   r   r   r   rh      r.   zCUDA_Accelerator.is_availablec                 C   s    t tjjdrtjj|S d S )N
range_push)rN   r'   r   nvtxrz   )r   msgr   r   r   rz      s   zCUDA_Accelerator.range_pushc                 C   s   t tjjdrtjj S d S )N	range_pop)rN   r'   r   r{   r}   r   r   r   r   r}      s   zCUDA_Accelerator.range_popc                 C   r&   r   )r'   r   
_lazy_call)r   callbackr   r   r   	lazy_call   r)   zCUDA_Accelerator.lazy_callc                 C      | j S r   )r   r   r   r   r   communication_backend_name   r3   z+CUDA_Accelerator.communication_backend_namec                 C   s   t j \}}|dkrdS dS )N   TF)r'   r   ro   )r   rq   rr   r   r   r   is_triton_supported   s   z$CUDA_Accelerator.is_triton_supportedc                 C   r,   r   )r'   r   	CUDAGraphr   r   r   r   create_graph   r.   zCUDA_Accelerator.create_graphc                 C   s   t j|||S r   )r'   r   graph)r   r   poolrA   r   r   r   capture_to_graph   r+   z!CUDA_Accelerator.capture_to_graphc                 C   s   |   d S r   )replay)r   r   r   r   r   replay_graph  s   zCUDA_Accelerator.replay_graphc                 C      t jtjtjddS Nr   )dtyper(   )	functoolspartialr'   tensorrw   r   r   r   r   BFloat16Tensor     zCUDA_Accelerator.BFloat16Tensorc                 C   r   r   )r   r   r'   r   uint8r   r   r   r   
ByteTensor  r   zCUDA_Accelerator.ByteTensorc                 C   r   r   )r   r   r'   r   doubler   r   r   r   DoubleTensor  r   zCUDA_Accelerator.DoubleTensorc                 C   r   r   )r   r   r'   r   rt   r   r   r   r   FloatTensor  r   zCUDA_Accelerator.FloatTensorc                 C   r   r   )r   r   r'   r   rv   r   r   r   r   
HalfTensor  r   zCUDA_Accelerator.HalfTensorc                 C   r   r   )r   r   r'   r   r\   r   r   r   r   	IntTensor  r   zCUDA_Accelerator.IntTensorc                 C   r   r   )r   r   r'   r   longr   r   r   r   
LongTensor   r   zCUDA_Accelerator.LongTensorr   c                 C      |  S r   )
pin_memory)r   r   align_bytesr   r   r   r   $  r   zCUDA_Accelerator.pin_memoryc                 C   r   r   )	is_pinned)r   r   r   r   r   r   '  r   zCUDA_Accelerator.is_pinnedc                 C   s   t |j}|drdS dS )Nzcuda:TF)strr(   
startswith)r   r   
device_strr   r   r   on_accelerator*  s   

zCUDA_Accelerator.on_acceleratorc                 C   s(   z	ddl m} W dS  ty   Y dS w )Nr   )__deepspeed__
op_builderzdeepspeed.ops.op_builder)r   r   r   )r   r   r   r   r   op_builder_dir1  s   zCUDA_Accelerator.op_builder_dirc                 C   s   | j d urd S i | _ |  }t|}tj|j}t	|gD ]F\}}}|dkrf|dkrftj
tj||sftd||}| D ] }|dre|dkre|dkre|dkre|| j vret||| j |< qEq d S )Nall_opsbuilderz{}.{}Builder	OpBuilderCUDAOpBuilderTorchCPUOpBuilder)
class_dictr   	importlibimport_modulerX   pathdirname__file__pkgutiliter_modulesisdirjoinr"   __dir__endswithgetattr)r   r   op_builder_moduleop_builder_absolute_pathrr   module_namemodulemember_namer   r   r   _lazy_init_class_dict?  s(   


z&CUDA_Accelerator._lazy_init_class_dictc                 C   s"   |    || jv r| j|  S d S r   r   r   r   
class_namer   r   r   create_op_builderX  s   
z"CUDA_Accelerator.create_op_builderc                 C   s    |    || jv r| j| S d S r   r   r   r   r   r   get_op_builder`  s   

zCUDA_Accelerator.get_op_builderc                 C   s   ddl m} |S )Nr   )BuildExtension)torch.utils.cpp_extensionr   )r   r   r   r   r   build_extensiong  s   z CUDA_Accelerator.build_extensionc                 C      dgS )NNCCLr   r   r   r   r   export_envsk  r3   zCUDA_Accelerator.export_envsc                 C   r   )NrU   r   r   r   r   r   visible_devices_envsn  r3   z%CUDA_Accelerator.visible_devices_envsc                 C   s&   |   D ]}dtt|||< qd S )NrW   )r   r   r[   r   )r   current_envlocal_accelerator_idsenvr   r   r   set_visible_devices_envsq  s   z)CUDA_Accelerator.set_visible_devices_envsc                 C   r   r   )r   r   r   r   r   get_compile_backendu  r3   z$CUDA_Accelerator.get_compile_backendc                 C   s<   t jjdd}||v r|| _d S t| d|   d| )Nr   )exclude_tagsz not supported by z. Supported Backends are )r'   _dynamolist_backendsr   
ValueErrorr%   )r   backendsupported_backendsr   r   r   set_compile_backendx  s   
z$CUDA_Accelerator.set_compile_backendr   )NN)r   )M__name__
__module____qualname__r   r   r   r   r   r    r%   r(   r*   r-   r/   r0   r1   r2   r4   r6   r7   r:   r;   r=   propertyr?   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rM   rP   rQ   rR   rT   ra   rg   ri   rs   rx   ry   rh   rz   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r      s    




















	








r   )r   rX   r   r   r   abstract_acceleratorr   
torch.cudar'   r   r   r   r   r   r   r   <module>   s   