o
    TiB                     @   s
  d dl Z d dlmZ d dlmZ d dlmZmZm	Z	 eG dd dZ
eG dd dZd	d
 Zd1ddZd1ddZdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+ee jj d,e	fd-d.Zd/d0 Z dS )2    N)	dataclass)comm)DictListCallablec                   @   s   e Zd ZU eed< eed< dS )fragment_addressnumelstartN)__name__
__module____qualname__int__annotations__ r   r   S/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/utils/tensor_fragment.pyr      s   
 r   c                   @   s   e Zd ZU ejed< eed< ejed< eed< eed< eed< eed< e	ed< d	Z
eed
< dd Zdd Zdd Zdd Zdd Zdd ZdddZdd Zd	S )tensor_fragmentlp_fragmentlp_fragment_addresshp_fragmenthp_fragment_addressgradient_dictoffload_gradient_dictuse_offloadparam_group_indexNoptim_fragmentc                 C      | j j| jj d S N)r   datacopy_r   selfr   r   r   	update_hp      ztensor_fragment.update_hpc                 C   r   r   )r   r   r   r   r   r   r   r   	update_lp!   r"   ztensor_fragment.update_lpc                 C   s"   || j v r
| j | S t| d)Nz& not found in optimizer state fragment)r   
ValueError)r    keyr   r   r   get_optim_state_fragment$   s   

z(tensor_fragment.get_optim_state_fragmentc                    s    fdd|  D _d S )Nc                    s>   i | ]\}}t |r|j jkr||d jjjjqS )r   )torch	is_tensorshapenarrowr   r	   r   ).0r%   valueflat_hp_partitionr    r   r   
<dictcomp>+   s    z<tensor_fragment.set_optim_state_fragment.<locals>.<dictcomp>)itemsr   )r    r.   r   r   r-   r   set_optim_state_fragment*   s   z(tensor_fragment.set_optim_state_fragmentc                 C   s   | j S r   )r   r   r   r   r   get_hp_fragment_address1   s   z'tensor_fragment.get_hp_fragment_addressc                 C   s   t | j S r   )listr   keysr   r   r   r   get_optim_state_keys4   s   z$tensor_fragment.get_optim_state_keysc                 C   s   |d u r| j S | |S r   )r   r&   )r    optim_state_keyr   r   r   get_hp_fragment7   s   
ztensor_fragment.get_hp_fragmentc                 C   sB   | j r| j}n| j}| j|vs|| j d u rtd|| j | S )NzNGradients are only available immediately after backward and before engine step)r   r   r   r   r$   )r    index_in_param_groupr   r   r   r   get_lp_grad_fragment<   s   z$tensor_fragment.get_lp_grad_fragmentr   )r
   r   r   r'   Tensorr   r   r   boolr   r   r!   r#   r&   r1   r2   r5   r7   r9   r   r   r   r   r      s$   
 


r   c           
      C   sx   |D ]7}| }t |}|D ]%}|jd ur2|j }|d|j|j}	|	j|jj	|dj |	|j_
q||| |< qd S )Nr   )r6   )r'   
zeros_like_hp_mappingr2   r*   r	   r   r   r   r7   r   )
flat_hp_tensor
lp_tensorsoptim_stateopt_keysr%   hp_parambufferlpr   r   r   r   r   map_to_flat_opt_statesH   s   


rE   c                 C   sn   t j| t jd }| jd ur*| jj}t |d|j|j}| j	|}|j
|j
 tj|| jd || S )Ndtyper   group)r'   r<   float32flattenr=   r   r*   r	   r   r7   r   r   dist
all_reduce	_dp_group
reshape_as)r    r6   reduce_bufferlp_frag_addressreduce_fragmentr   r   r   r   get_full_hp_paramW   s   

rS   c                 C   sL   | j d ur$| j j}t| d|j|j}| j |}|j	|j d S d S Nr   )
r=   r   r'   r*   rK   r	   r   r7   r   r   )r    r,   r6   rQ   value_fragmentr   r   r   r   set_full_hp_paramb   s   
rV   c                 C   s   t j| t jd }| jd urD| j| j}|t j }| jj}t 	|d|j
|j}| dj|jkr=|j|j n|j|j tj|| jd || S )NrF   r   rH   )r'   r<   rJ   rK   r=   r9   _index_in_param_grouptor   r*   r	   r   viewr)   r   r   rL   rM   rN   rO   )r    rP   lp_grad_fragmenthp_grad_fragmentrQ   rR   r   r   r   get_full_hp_gradj   s   

r]   c                 C   sV   | j d ur)| j | j}| j j}t| d|j|j}|j	
|j	|j	 d S d S rT   )r=   r9   rX   r   r'   r*   rK   r	   r   r   r   rO   )r    r,   r[   rQ   rU   r   r   r   set_full_hp_grad|   s   
r^   c                 C   s,   t | dr| j| S t | dr|  S dS )zAssemble and return the fp32 parameter of a low-precision (e.g., fp16) parameter.

        Args:
            param (``torch.nn.Parameter``): A model parameter

        Returns:
            Union[torch.Tensor, None]: A tensor on accelerator device
    ds_idr=   Nhasattr_z3_optimizerrS   paramr   r   r   safe_get_full_fp32_param   s
   


re   c                 C   s4   t | dr| j||  t | dr| | dS dS )zUpdate the partitioned fp32 parameter of a low-precision (e.g., fp16) parameter.

        Args:
            param (``torch.nn.Parameter``): A model parameter
            value (``torch.Tensor``): New value
    r_   r=   Nra   rb   rV   rd   r,   r   r   r   safe_set_full_fp32_param   s
   

rh   c                 C   s0   t | dr| j| |S t | dr| |S dS )ah  Assemble and return the fp32 optimizer state of a low-precision (e.g., fp16) parameter.

        Args:
            param (``torch.nn.Parameter``): A model parameter
            optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer)

        Returns:
            Union[torch.Tensor, None]: A tensor on accelerator device
r_   r=   Nr`   rd   r6   r   r   r   safe_get_full_optimizer_state   s
   


rj   c                 C   s8   t | dr| j|| | t | dr| || dS dS )aC  Update the partitioned fp32 optimizer state of a low-precision (e.g., fp16) parameter.

        Args:
            param (``torch.nn.Parameter``): A model parameter
            value (``torch.Tensor``): New value
            optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer)
    r_   r=   Nrf   rd   r,   r6   r   r   r   safe_set_full_optimizer_state   s
   
	
rl   c                 C   s<   | j dur| j S t| dr| j| S t| dr|  S dS )a  
        Assemble and return the fp32 gradient of a low-precision (e.g., fp16) parameter.
        The return data type is that used for gradient accumulation. This is usually the param data type,
        but could also be different (e.g., bf16 param training with fp32 gradient accumulation).

        Args:
            param (``torch.nn.Parameter``): A model parameter

        Returns:
            Union[torch.Tensor, None]: A tensor on accelerator device
    Nr_   r=   )gradra   rb   get_fp32_grad_for_paramr]   rc   r   r   r   safe_get_full_grad   s   


ro   c                 C   sR   | j dur| j | dS t| dr| j||  dS t| dr'| | dS dS )ab  
        Update the partitioned gradient of a low-precision (e.g., fp16) parameter.
        To avoid precision issues, the update value should have the data type of
        gradient accumulation.

        Args:
            param (``torch.nn.Parameter``): A model parameter
            value (``torch.Tensor``): The un-partitioned new gradient value.
    Nr_   r=   )rm   r   ra   rb   set_fp32_grad_for_paramr^   rg   r   r   r   safe_set_full_grad   s   



rq   c                 C      t | ds	J d| j| S )a  
        Get the local gradient partition of a ZeRO-3 partitioned parameter.
        The return data type is that used for gradient accumulation. This is usually the param data type,
        but could also be different (e.g., bf16 param training with fp32 gradient accumulation).

        Args:
            param (``torch.nn.Parameter``): A model parameter

        Returns:
            Union[torch.Tensor, None]: A tensor on accelerator device
    r_   :This API is only defined for ZeRO-3 partitioned parameters)ra   rb   get_local_fp32_grad_for_paramrc   r   r   r   safe_get_local_grad   s   ru   c                 C   $   t | ds	J d| j||  dS )a_  
        Update the local gradient partition of a ZeRO-3 partitioned parameter.
        To avoid precision issues, the update value should have the data type of
        gradient accumulation.

        Args:
            param (``torch.nn.Parameter``): A model parameter.
            value (``torch.Tensor``): New value of local gradient partition.
    r_   rs   N)ra   rb   set_local_grad_for_paramrg   r   r   r   safe_set_local_grad  s   
rx   c                 C   rr   )zGet the local partition of a ZeRO-3 partitioned parameter in fp32 precision.

        Args:
            param (``torch.nn.Parameter``): A model parameter.

        Returns:
            Union[torch.Tensor, None]: A tensor on accelerator device
    r_   rs   ra   rb   get_local_fp32_paramrc   r   r   r   safe_get_local_fp32_param  s   	r{   c                 C   s    t | ds	J d| j| |S )ao  Get the local optimizer state partition of ZeRO-3 partitioned parameter in fp32 precision.

        Args:
            param (``torch.nn.Parameter``): A model parameter
            optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer)

        Returns:
            Union[torch.Tensor, None]: A tensor on accelerator device
    r_   rs   ry   ri   r   r   r   safe_get_local_optimizer_state'  s   
r|   c                 C   s&   t | ds	J d| j|| | dS )a`  Update the local optimizer state partition of a ZeRO-3 partitioned parameter.

        Args:
            param (``torch.nn.Parameter``): A model parameter.
            value (``torch.Tensor``): New value of local optimizer state partition.
            optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer).
    r_   rs   Nra   rb   set_local_hp_paramrk   r   r   r   safe_set_local_optimizer_state5  s   r   c                 C   rv   )zUpdate the local partition of ZeRO-3 partitioned parameter.

        Args:
            param (``torch.nn.Parameter``): A model parameter.
            value (``torch.Tensor``): New value of local parameter partition.
    r_   rs   Nr}   rg   r   r   r   safe_set_local_fp32_paramA  s   r   
param_listupdate_funcc                 C   s   g }| D ]}|j dur|j ||j | q|jr|| q|s#dS t|d dr5|d j|| dS t|d drO|D ]}t|}|||}|| q>dS dS )a  
        Vectorized update of the partitioned gradients of a list of low-precision (e.g., fp16) parameters.
        To avoid precision issues, the update value should have the data type of
        gradient accumulation.

        Args:
            param_list (``List[torch.nn.Parameter]``): List of model parameters
            update_func (``torch.Tensor``): A function that takes current full gradient value and returns new one.
    Nr   r_   r=   )	rm   r   requires_gradappendra   rb   %update_fp32_grad_for_param_vectorizedro   r^   )r   r   partitioned_grad_paramspold_gradnew_gradr   r   r    safe_update_full_grad_vectorizedP  s(   




r   c	              
   C   s   |   | }	|}
|| }t||
}t|	|}||k s$J d| d| || }t||
 |d}|d|j|j }t|| |d}|  d|j|j }t||||||||dS )Nzfragment start z should be < fragment_end )r	   r   r   )r   r   r   r   r   r   r   r   )r   maxminr   r*   r	   rK   r   )lp_paramlp_startr.   r   r   r   r   partition_startpartition_sizelp_endhp_starthp_endfragment_startfragment_endfragment_numelhp_frag_addresshp_fragment_tensorrQ   lp_fragment_tensorr   r   r   get_hp_fragment_mappings  s,   


r   r   )!r'   dataclassesr   	deepspeedr   rL   typingr   r   r   r   r   rE   rS   rV   r]   r^   re   rh   rj   rl   ro   rq   ru   rx   r{   r|   r   r   nn	Parameterr   r   r   r   r   r   <module>   s8   5

#