o
    Ti'                     @   sb   d dl Z d dlmZ d dlmZmZmZmZmZ dddZ	dd Z
dd	 Z	
dddZdd ZdS )    N)warning_once)get_shard_sizeget_shard_size_listget_num_kv_heads
get_n_embdget_num_attention_headsc                    s8   fdd| D  fddt td D }|S )Nc                    s   g | ]
}t j| d qS )dim)torchsplit).0mat)	split_dim
split_size Z/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/module_inject/fusedqkv_utils.py
<listcomp>   s    z/split_by_qkvlist_and_refuse.<locals>.<listcomp>c                    s(   g | ] t j fd dD dqS )c                    s   g | ]}|  qS r   r   )r   qkv_sir   r   r      s    z:split_by_qkvlist_and_refuse.<locals>.<listcomp>.<listcomp>r   )r
   cat)r   )cat_dimqkv_split_listr   r   r      s    r   )rangelen)qkv_listr   r   r   tp_fusedqkv_listr   )r   r   r   r   r   split_by_qkvlist_and_refuse
   s
   r   c                 C   s0   g d}|dkr
dS |D ]	}|| v r dS qdS )N)qkv_projquery_key_valuez	attn.Wqkvzself_attn.W_packc_attn   FTr   )namemp_sizefused_qkvw_name_list
fused_namer   r   r   require_tp_fused_qkvw   s   r&   c           	         s   t |  |d u rd S dddddddddddd}dfd	d
	fddfddfddfdd fddd fdd	}fdd| D }|rit|td}|| }||||| S td ||S )Ncodegentype	bloomtypeglmtypeqwentypebigcodetypephi3type)CodeGenBlock
BloomBlockGLMBlockMPTBlockMptBlockBaichuanLayer	QWenBlockFalconDecoderLayerGPTBigCodeBlockDecoderLayerPhi3DecoderLayer   c                    s   t     dksJ d| jtd }|  d   d }ttj||jd d dd} fdd|D }t|td d dd}tj|ddd d}|| d |  S )	Nr   zCcodgen autoTP requires num_kv_heads % (mp_size*codegen_mp_num) == 0r!      r   c                    s"   g | ]}|   d d qS )r!   )reshape)r   xcodegen_mp_numr#   shaper   r   r   <   s   " zJprepare_tp_fused_qkvw.<locals>._codegen_type_transpose.<locals>.<listcomp>r:   )	r   r?   r   r;   listr
   r   r   r   )inputr#   r>   	dst_shapenum_mp_blocks	src_splitsplit_fusedqkvtp_fuseqkv_weight	gpu_indexr=   r   _codegen_type_transpose0   s   z6prepare_tp_fused_qkvw.<locals>._codegen_type_transposec                    s   t  dkra| j}t }|d | t   }| d | }| |||  }| || d  }|jt|jd |dd}|jt|jd |dd}	|jt|jd |dd}
tj|  |	  |
  fddS | j}tj| |d d dd}t|t|d d |}|  S )N   r   r   r9   )r   r?   r   r   r   r
   r   r   )rA   r#   r?   
hidden_dimkv_dimqkvq_splitk_splitv_splitrD   rE   rG   r   r   _glm_type_transposeC   s   
 z2prepare_tp_fused_qkvw.<locals>._glm_type_transposec                    s&   | j }| jt|d |dd}|  S Nr   r   )r?   r   r   )rA   r#   r?   rE   rG   r   r   _bloom_type_transposeY   s   z4prepare_tp_fused_qkvw.<locals>._bloom_type_transposec                    s2   t |dst|dd t|jj||j_ | |S )N_ds_fusedqkv_enteredT)hasattrsetattrr   attnr   )rA   r#   module)rS   r   r   _qwen_type_transpose_   s   

z3prepare_tp_fused_qkvw.<locals>._qwen_type_transposec                    sR   t  }| d | }| |d  }|j}|jt|d |dd}tj|  |fddS rT   )r   r?   r   r   r
   r   )rA   r#   n_embdrM   kvr?   split_qrG   r   r   _bigcode_type_transposef   s   z6prepare_tp_fused_qkvw.<locals>._bigcode_type_transposec                    s   t  }t }| jd }|| }| jd d| |  }| d | }| ||||   }| |||  d  }	|jt|jd |dd}
|jt|jd |dd}|	jt|	jd |dd}tj|
  |  |  fddS )Nr!   r   rJ   r   )r   r   r?   r   r   r
   r   )rA   r#   num_kv_heads	num_headshidden_sizehead_dimq_posrM   rN   rO   r^   split_ksplit_vrG   r   r   _phi3_type_transposen   s   
 z3prepare_tp_fused_qkvw.<locals>._phi3_type_transposec                    sv   |dkr	| |S |dkr| |S |dkr| |S |dkr%| ||S |dkr. | |S |dkr7| |S t d)Nr(   r'   r)   r*   r+   r,   zunknown fused_qkv_type)
ValueError)srcr#   fused_qkv_typerZ   )r_   rU   rI   rS   rg   r[   r   r   _transpose_fused_qkvw|   s   




z4prepare_tp_fused_qkvw.<locals>._transpose_fused_qkvwc                    s   g | ]}| v r|qS r   r   )r   rN   )
module_strr   r   r          z)prepare_tp_fused_qkvw.<locals>.<listcomp>)keyzUnrecognized fusedkqv weight type, default to using bloom type,please check in prepare_tp_fused_qkvw() to avoid potential calculation errors)r8   )NN)strstripkeysmaxr   r   )	rZ   ri   r#   rH   fused_type_dictrk   module_name_matchesmodule_name
fused_typer   )r_   rU   rI   rS   rg   r[   rH   rl   r   prepare_tp_fused_qkvw   s:   
rw   Tc                    s  |r
| j d }d}n| j d }d}t  |  } | dks J | d kr0td| d    | }|| }	g }
d}||k rS|
|	d  |	d7 }	|d }||k s@|
 fdd|
D  g }g }|
D ];}|r|| || |d |   |d ur||j|| |d |   qe|| d d || |d | f  qetj||d}|d ur|rtj|dd}n|t| }tj	
|tj	
|fS tj	
|d fS )	Nr   r!   rJ   zworld_size z" is larger than half of num_heads c                    s   g | ]}| d   qS )rJ   r   )r   r   ra   r   r   r      rm   z-shard_value_with_share_qk.<locals>.<listcomp>r   )r?   r   RuntimeErrorappendextenddatar
   r   floatnn	Parameter)weightbiasrank
world_sizeshard_value
total_sizeweight_cat_dimrc   head_per_rankq_head_start
v_head_idsr   sharded_weightsharded_biashead_idr   rx   r   shard_value_with_share_qk   sH   

 (r   c                 C   s   | j ddd\}}|jd }|jt||ddd}|jt||ddd}tj|| || fdd}	|d urc|j ddd\}
}|
jt||ddd}|jt||ddd}|	tj|| || fddfS |	d fS )NrJ   r   r   mlp)chunkr?   r   r   r
   r   )r   r   r   r   weight_gateweight_statesr   split_weight_gatesplit_weight_statesshard_weight	bias_gatebias_statessplit_bias_gatesplit_bias_statesr   r   r   shard_chunk_mlp   s   
r   )r   r   )T)r
   deepspeed.utils.loggingr    deepspeed.module_inject.tp_shardr   r   r   r   r   r   r&   rw   r   r   r   r   r   r   <module>   s   
 
4