o
    Ti                     @   sL   d dl mZ G dd deZdd Zdd Zdd	d
ZdddZdd ZdS )   )partition_datac                   @   sF   e Zd Zdd Zdd Zdd Zddd	Zd
d Zdd Zdd Z	dS )meg_2d_parallel_mapc                 C   s   || _ || _i | _d S N)	pp_degree	tp_degreemap)selfr   r    r	   W/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/checkpoint/reshape_meg_2d.py__init__   s   
zmeg_2d_parallel_map.__init__c                    s$    fddt  j j D  _d S )Nc                    s(   i | ]}  | j | j |gqS r	   )	_make_keyr   ).0ir   r	   r
   
<dictcomp>   s    z3meg_2d_parallel_map.simple_init.<locals>.<dictcomp>)ranger   r   r   r   r	   r   r
   simple_init   s   
zmeg_2d_parallel_map.simple_initc                 C   sV   |  || t|tu sJ | ||}|| j vr g | j|< | j|  |7  < d S r   )_validate_indicestypelistr   r   keys)r   pp_indextp_indexdatakeyr	   r	   r
   add_data   s   
zmeg_2d_parallel_map.add_dataNc                 C   sv   |  || |d u rtt| jn|g}|d u rtt| jn|g}g }|D ]}|D ]}|| j| || 7 }q*q&|S r   )r   r   r   r   r   r   r   )r   r   r   
pp_indices
tp_indicesresultr   jr	   r	   r
   get_data   s   zmeg_2d_parallel_map.get_datac                 C   s4   t |  | j D ]\}}t | d|  q
d S )Nz = )printr   items)r   tagr   valuer	   r	   r
   
print_data+   s   
zmeg_2d_parallel_map.print_datac                 C   s4   |d u s|| j k sJ |d u s|| jk sJ d S d S r   )r   r   )r   r   r   r	   r	   r
   r   0   s   z%meg_2d_parallel_map._validate_indicesc                 C   s   | d| S )N,r	   )r   r   r   r	   r	   r
   r   4   s   zmeg_2d_parallel_map._make_key)NN)
__name__
__module____qualname__r   r   r   r    r%   r   r   r	   r	   r	   r
   r   	   s    
	r   c                 C   sZ   | j }t||}t|D ]}| j|d d}t||}t|D ]}|||||  qq|S N)r   r   )r   r   r   r    r   r   )
old_2d_mapnew_tp_degreeold_pp_degree
new_2d_mapr   ranks_for_pp_indexsplit_ranksr   r	   r	   r
   _reshape_tp_dimension8      

r1   c                 C   sZ   | j }t||}t|D ]}| jd |d}t||}t|D ]}|||||  qq|S r*   )r   r   r   r    r   r   )r+   new_pp_degreeold_tp_degreer.   r   ranks_for_tp_indexr0   r   r	   r	   r
   _reshape_pp_dimensionD   r2   r6   Fc                 C   s   || ksJ ||ksJ t | |}|  |r|d ||kr&t||}n|}|r/|d | |kr9t||}n|}|rB|d |S )Nzoriginal_2d_map:zafter_tp_reshape:zfinal_2d_map:)r   r   r%   r1   r6   )r-   r4   r3   r,   verboser+   
new_tp_map	final_mapr	   r	   r
   reshape_meg_2d_parallelP   s"   



r:   Nc              	      sF  | | | }t d|  d| d| d|  t| |}t||}|||  }|| }|| }	|| }
g }t|D ]"  |	 } d |	 }t|D ]}t|| ||}|t| qHq8t d| g }t|D ]  fdd|D }|t| qft d	| g }t|D ] t |  d | }|t| qt d
| |||fS )a  
    Initialize model data parallel groups.

    Arguments:
        tp_size: number of GPUs used to parallelize model tensor.
        pp_size: number of GPUs used to parallelize model pipeline.
        dp_size: number of GPUs used to parallelize model data.

    Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will
    create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
    and 8 data-parallel groups as:
        8 data_parallel groups:
            [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
        8 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
        4 pipeline model-parallel groups:
            [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    z	

*** tp=z, pp=z, dp=z, world=r   DPc                    s   g | ]}|  qS r	   r	   )r   data_parallel_group_ranksr   r	   r
   
<listcomp>   s    z!get_mpu_ranks.<locals>.<listcomp>PPTP)r!   minr   appendr   )tp_sizepp_sizedp_sizevirtual_pp_size
world_sizetensor_model_parallel_sizepipeline_model_parallel_sizedata_parallel_size num_tensor_model_parallel_groups"num_pipeline_model_parallel_groupsnum_data_parallel_groupsall_dp_group_ranks
start_rankend_rankr   ranksall_pp_group_ranksall_tp_group_ranksr	   r=   r
   get_mpu_ranksk   s8    





rT   c                 C   s   t d|  d|  | \}}}|\}}}t|||d\}}	}
t|||d\}}}t|||d\}}}t d t|D ]\}}t ||  d||   q:t d t|	D ]\}}t ||  d||   qTdS )zo
    reshape([tp_size_src, pp_size_src, dp_size_src],
            [tp_size_tgt, pp_size_tgt, dp_size_tgt])
    z

*** Reshaping: z => )rC   rD   rE   z
*** TP contraction:z
*** PP contraction:N)r!   rT   	enumerate)srctgttp_size_srcpp_size_srcdp_size_srctp_size_tgtpp_size_tgtdp_size_tgt	tp_ranks1	pp_ranks1	dp_ranks1	tp_ranks2	pp_ranks2	dp_ranks2	tp_ranks3	pp_ranks3	dp_ranks3r   rr	   r	   r
   reshape   s   

rh   )F)r   r   r   N)	reshape_utilsr   objectr   r1   r6   r:   rT   rh   r	   r	   r	   r
   <module>   s   /

K