o
    }oib                  	   @   sT  d dl mZmZmZmZ d dlZd dlZd dlZd dl	m
Z
mZ i addddddZg d	Zd
dgZddgZddgZddgZddgZddgZg dZddgZddgZdgZdgZdgZddgZd d!gZg d"Zd#Zd$ZdYd%d&Z d'd( Z!d)d* Z"dZd,d-Z#d[d.d/Z$d0e%d1e%fd2d3Z&d0e%d1e%fd4d5Z'd0e%d6ee% d1e(fd7d8Z)d0e%d9eeee% e%f  d1ee% fd:d;Z*d0e%d1ee% fd<d=Z+d0e%d1e%fd>d?Z,d0e%d1e(fd@dAZ-d0e%d1eee%e%f ee%e%f f fdBdCZ.dDe/d0e%dEej0dFe/fdGdHZ1dIdJ Z2dKeej3 dLe(fdMdNZ4e5 i fdOdPZ6d\dQeej3ej0f dRe7dSe7dTe7fdUdVZ8dWdX Z9dS )]    )ListOptionalTupleUnionN)mpi_commtorch_to_numpyGPTForCausalLMLlamaForCausalLMGemmaForCausalLMFalconForCausalLM)gptjgptnextllamagemmafalcon)zpost_attention_layernorm.weightzpost_attention_layernorm.biaszpost_self_attn_layernorm.weightzmlp.linear_fc2.biaszmlp.dense_4h_to_h.biaszattention.linear_proj.biaszattention.dense.biaszinput_layernorm.weightzinput_layernorm.biaszpre_mlp_layernorm.weightzpre_mlp_layernorm.biaszattention.linear_proj.weightzattention.dense.weightzmlp.linear_fc2.weightzmlp.dense_4h_to_h.weight)zmlp.dense_h_to_4h.weightzmlp.dense_h_to_4h.biaszmlp.linear_fc1.weightzmlp.linear_fc1.biaszattention.query_key_value.biaszattention.linear_qkv.biasz attention.query_key_value.weightzattention.linear_qkv.weightzmlp.router.weightzexperts.linear_fc1.weightzexperts.linear_fc2.weightzfinal_layernorm.weightzfinal_layernorm.biaszmlp.dense_h_to_4h_2.weightzmlp.dense_h_to_4h_2.bias)zattention.query.weightzattention.query.biaszattention.key_value.weightzattention.key_value.biasz.weights_scaling_factorz.activation_scaling_factorc                 C   s   |d u rdnd| d}t | rU|   } t| jdkr.| | jd d} t | dd} |tvrFt j	| 
 | j| jdd	d
t| | < t| |  j| d	d d S t| jdkrntt| | jd dddg} | t| | < d S )N ..bin   r      cpuT)dtypelayoutdevice
pin_memory)non_blocking)torch	is_tensordetach
contiguouslenshapereshape	transposeweights_dictemptysizer   r   copy_npascontiguousarray)valdirkeytp_numsuffix r0   W/home/ubuntu/.local/lib/python3.10/site-packages/nemo/export/trt_llm/converter/utils.pysave_valA   s   
$r2   c                 C   s,   t | D ]\}}t||||| |  qd S N)	enumerater2   )
split_valsr,   r-   isplit_factorjr+   r0   r0   r1   
save_splitV   s   r9   c           	      C   sJ   t | D ]\}}|| | }|d u rdnd| d}|t| | < qd S )Nr   r   r   )r4   r%   )	r5   r,   r-   r6   r7   r8   r+   r.   r/   r0   r0   r1   save_expert_split[   s
   r:   Fc              
   C   s  |r(|s(d|d  ddjdddd    }d|d  dd   }n|r0|r0tdd|d     }d|d    }d	| }d	| }td|d
    }td|d    }	t|d   d }
|	||  }|	||  }|rt||j	}t||j	}dd }|| | || | |
tj|
tj|
tj|
tj|
tj|

tjdS )a  This function has two purposes:
    - compute quantized weights, scaled either per-tensor or per-column
    - compute scaling factors.

    Depending on the GEMM API (CUTLASS/CUBLAS) the required scaling factors differ.
    CUTLASS uses two sets of scaling factors. One for the activation X, one for the weight W.
    CUBLAS only has one (we can't do per-row scaling). So we must provide pre-multiplied scaling factor.

    Here is the list of what we need (T means per-tensor, C per-column):
    - scale_x_orig_quant puts fp activation into the quantized range (i.e. [-128, 127], for int8).
    Used before the GEMM. (T)
    - scale_y_quant_orig puts quantized activation into the fp range. Used if the GEMM outputs int8. (T)
    - scale_w_quant_orig puts weights from quant range to fp range (used with CUTLASS) (T, C)
    - scale_y_accum_quant puts the GEMM result (XW) from accumulation range (int32)
    to quant range (int8) (used for CUBLAS) (T, C)

    Note that we don't do anything special about row-parallel GEMM.
    Theoretically, we could have per-GPU scaling factors too,
    but then the model would change depending on the number of GPUs used.

    For QKV projection, the behavior is special. Even if we have a single matrix to perform QKV projection,
    we consider it
    as three different matrices: Q, K, and V. So per-tensor actually means one scaling factor for each Q, K and V.
    g     _@w   r   T)dimkeepdimsr   z4Multi-query w/ int8 quant has not been supported yet      ?xyc                 S   s   |   ddtjS )Ni   )roundclipastyper)   int8)r@   r0   r0   r1   to_i8   s   zgenerate_int8.<locals>.to_i8)weight.int8weight.int8.colscale_x_orig_quantscale_w_quant_origscale_w_quant_orig.colscale_y_accum_quantscale_y_accum_quant.colscale_y_quant_orig)r#   maxr   numpy
ValueErrorr)   arrayitembroadcast_tor"   rE   float32)weights	act_rangeis_qkvmulti_query_modescale_w_orig_quant_tscale_w_orig_quant_cscale_w_quant_orig_tscale_w_quant_orig_cscale_x_orig_quant_tscale_y_orig_quant_tscale_y_quant_orig_tscale_y_accum_quant_tscale_y_accum_quant_crG   r0   r0   r1   generate_int8d   s6   *







rd   c           	      C   s   |s(t tj| d ||d|| d|| t tj| d ||d|| d|| dg}|s3|g d7 }|sf|dkr`t tj| d	 ||d|| d
|| t tj| d ||d|| d|| n|d	dg7 }|dkr||D ]}t| | || d|  qld S d S )NrH   axisz.weight.int8rI   z.weight.int8.colrO   )rJ   rK   rM   r   rL   z.scale_w_quant_orig.colrN   z.scale_y_accum_quant.colr   r   )r9   r)   splitr2   )	valsr,   base_key	split_dimtp_rankr7   kv_cache_onlysaved_keys_oncesave_keyr0   r0   r1   
write_int8   sP   ro   r-   returnc                 C   s   d|  dd  S )Nr   r   rg   r-   r0   r0   r1   
get_suffix   s   rs   c                 C   s   |  dd }d| S )Nr   r   ztransformer.layers.rq   )r-   	layer_numr0   r0   r1   get_trt_llm_prefix   s   
ru   wordsc                    s   t  fdd|D S )Nc                    s   g | ]}| v qS r0   r0   ).0wordrr   r0   r1   
<listcomp>   s    z#any_word_in_key.<locals>.<listcomp>)any)r-   rv   r0   rr   r1   any_word_in_key   s   r{   mappingc                 C   s$   |D ]\}}t | |r|  S qd S r3   )r{   )r-   r|   keywordsmappedr0   r0   r1   sequential_key_map   s
   
r   c                 C   sZ   t dftdftdftdftdftdftdftdftt	 dft
dftdftdfg}t| |S )Nz.post_layernormz	.mlp.projz.attention.densez.input_layernormz.mlp.fcz.attention.qkvz.mlp.router)post_layernorm_keysmlp_proj_bias_keysattention_dense_bias_keysinput_layernorm_keyspre_layernorm_keysattention_dense_weight_keysmlp_proj_weight_keysmlp_fc_keysattention_qkv_bias_keysattention_qkv_weight_keysmlp_router_keysmlp_fc_expert_keysmlp_proj_experts_keysr   )r-   r|   r0   r0   r1   get_trt_llm_infix   s   

r   c                 C   s:   t | tr| ddS t|  }rt| | t|  S | S )Nfinal_layernormztransformer.ln_f)r{   final_layernorm_keysreplacer   ru   rs   )r-   infixr0   r0   r1   get_trt_llm_keyname   s
   
r   c                 C   s   d| v S )N	scale_fwdr0   rr   r0   r0   r1   is_scaling_factor   s   r   c                 C   s   d | dd d d }t|}d |dd d }|t }|t }||f}t| }|d }|t }	|t }
|	|
f}||fS )Nr   .weightr   	.mlp.gate)joinrg   r   weight_scaling_suffixactivation_scaling_suffixru   )r-   corresponding_weight_key corresponding_trt_llm_weight_keyri   weight_scaleactivation_scalekeyslayer_prefix
mapped_keygate_activationgate_weight	gate_keysr0   r0   r1   get_scaling_factor_keys  s   r   scaling_factorsr+   configc                 C   s   t |s| S d|d d }d|d d }t|\\}}}|| |< || |< |dd}	|	rCt|ddgrC|\}
}|| |
< || |< | S )Nr   r   split_gated_activationFzmlp.dense_h_to_4hzmlp.linear_fc1)r   viewr   getr{   )r   r-   r+   r   activation_factorweights_factorweights_keyactivation_keyr   r   gate_activation_keygate_weight_keyr0   r0   r1   save_scaling_factor  s   r   c                    sx   |sfdd| D S t j}dd | D }|D ]}||v r2|||t    fdd| D }  nqfdd| D S )Nc                       g | ]}|  qS r0   torw   r+   storage_typer0   r1   ry   -      z%cast_val_datatype.<locals>.<listcomp>c                 S   s$   g | ]}| tr|td  qS r   )endswithr   rg   )rw   kr0   r0   r1   ry   0  s
    
c                    s   g | ]
}| tj  qS r0   )r   r   rV   r   )scaler0   r1   ry   7      c                    r   r0   r   r   r   r0   r1   ry   :  r   )r   float8_e4m3fnr   r   )rh   trt_llm_keyr   is_fp8_modelr   fp8_storage_typequantized_keysr   r0   )r   r   r1   cast_val_datatype+  s   r   rh   convert_on_devicec                 C   s<   |rdd t j| d dddD S dd | D }tt| S )Nc                 S   s   g | ]}|gqS r0   r0   )rw   nr0   r0   r1   ry   ?      z"split_val_gate.<locals>.<listcomp>r   r   r   re   c                 S   s   g | ]
}t j|d ddqS )r   r   re   )r)   rg   r   r0   r0   r1   ry   A  r   )r   chunklistzip)rh   r   splitsr0   r0   r1   split_val_gate=  s   r   c	           )   	      sV  | dd}	| dd}
| dd}| dd}| dd }| d	d}| d
|}| dd }| dd}| dd}| dd}|dkpI|dk}t|}t|tsV|g}| ddrj|d jdkrjdd |D }d|v r{| ddr{dd |D }t|||||}|rt|dksJ t|d sJ nt|d rdd |D }t	|t
t t t t t r| dks|rt|d || t/S t	|tt r|rt|d || nd}tj||d}tj|||d}t|||| | |d ur|dkr|dd}t|||d}t||||| | t/S t	|tr|
r"t||\}}|r.t|d || nd}tj||d}tj|||d}t|||| | |d urg|dkrg|dd}t|||d}t||||| | |
r|roJ t|}|d t| }|rt|d || t/S tj||d}tj|||d}t|||| | t/S t	|tr|rt|d || nd}tj||d}tj|||d}t|||| | |d ur|dkr|dd}t|||d}t||||| | t/S t	|tr|d j d }||d|   }|| } t|}!|r|d }ntj|dd}|!||! | | d |}|rVtj|| ddgdd}"tj|"d !d|"d !d|"d !dgdd}t||| t/S tj|| | d gdd}"tj|"d |ddtj|"d |ddtj|"d |ddfd dt"|D }t|||| | t/S t	|t#r|	sJ d!|d j d  |d u r | }|| } |r|d ! || | d |}tj|| ddgdd}"tj|"d ! d|"d ! d|"d ! dgdd}t||| nwt|}!tj|dd}|! ||! | | d |}tj|| | d gdd}"|"d j }#t|#dkr:|#d | dkr:t$d"%|#d tj|"d |ddtj|"d |ddtj|"d |dd fd#dt"|D }t|||| | |r|dd}t||d$|d%}t||||| ||dkd& |r|d'd}t&d(g}$t|$t'|d)  t/S t	|t(r	 t/S t	|t)rtj|dd}t||| t/S t	|t*r d}tj||d}tj|ddd\}%}&tj|%|dd}'tj|&|dd}(d*d t+|(|'D }t,|||| | t/S t	|t-r!d}tj||d}tj|||d}t,|||| | t/S t.d+| d, t/S )-Nuse_attention_nemo_shapeFr   num_attention_headsr   tp_sizer   int8_outputsrZ   num_kv_headskv_channelsr   fp8_quantizedfp8_kvcacheallrl   transpose_weightsr   c                 S   s   g | ]}|j qS r0   )Tr   r0   r0   r1   ry   ]  r   z)split_and_save_weight.<locals>.<listcomp>zlayernorm.weightapply_layernorm_1pc                 S   s   g | ]}|  d  qS )r?   )floatr   r0   r0   r1   ry   _      c                 S   s   g | ]}t | qS r0   )r   r   r   r0   r0   r1   ry   f  r   re   r   r   )rZ   r   r   r=   c                    s>   g | ]}t j| d  | d | d gddqS )r   r   re   r)   concatenater#   rw   r6   )k_splitq_splitv_splitr0   r1   ry     s    0z'Only support NEMO shape for QKV weightszNumber of query groups of the models is {0}. Please select tensor parallelism size that can split the number of query groups to equal number of query matrices in the each GPU.c              	      sD   g | ]}t j|  d |  d |  d gddqS )r   r   re   r   r   
hidden_dimr   r   r   r0   r1   ry     s    	T)rY   rZ   )rl   z.qkv.weightr?   z.kv_cache_scaling_factorc                 S   s   g | ]	}t j|d dqS )r   re   )r)   r   )rw   rT   r0   r0   r1   ry   !  s    z
[WARNING] z not handled by converter)0r   r   
isinstancer   ndimr   r!   r   r   r{   r   r   r   r   r   r   r2   r   r   r)   r   rg   r9   r   rd   ro   r   r   ru   rs   mlp_dense_2_keysr   r"   r#   ranger   	ExceptionformatFloatTensorr,   attention_not_mapped_keysr   r   r   r:   r   printr%   ))rk   	saved_dirr7   r-   rh   r   rX   r   r   r   r   r   r   r   rZ   r   size_per_headr   r   use_fp8_kv_cache	save_int8r   cat_dimr+   r5   ri   vals_i8gatesr   gate_keygateqkv_hidden_dimq_numlen_valsqkvquery_groups_shapescaling_factorw1w3	split_w1s	split_w3sr0   r   r1   split_and_save_weightG  s  
 <  . 
      
2pd
.

	
r   vr   idxr=   c                 C   sh   |dkr| S t | jdkr|nd}t| r'tj| | || |d|  S ttj| ||d| S )z9Splits the np tensor v on dim and return the idx's slice.r   r   r   re   )	r!   r"   r   r   rg   r'   r    r)   r*   )r   r   r  r=   r0   r0   r1   rg   0  s   
"rg   c                 C   s   ddl m} | }| }| }| }| }| }| r3|dkr3|| }tj	
 | }d}d}|| | }tjj|| t j||d}	ddlm}
 |	|
_|||||fS )Nr   )parallel_stater   )colorr-   )MPI)megatron.corer  &get_pipeline_model_parallel_world_size$get_tensor_model_parallel_world_sizeget_data_parallel_world_sizeget_tensor_model_parallel_rank get_pipeline_model_parallel_rankget_data_parallel_rankr   distributedget_ranktensorrt_llmbindingsMpiCommrg   r   Splitmpi4pyr  
COMM_WORLD)reshard_modelr  pp_sizer   dp_sizerk   pp_rankdp_rankmp_ranknew_commr  r0   r0   r1   init_model_parallel_from_nemo<  s$   r  r3   )FF)Fr   ):typingr   r   r   r   rQ   r)   r  r   tensorrt_llm._utilsr   r   r%   DECODER_MODEL_TYPEr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r2   r9   r:   rd   ro   strrs   ru   boolr{   r   r   r   r   r   dictTensorr   r   ndarrayr   no_gradr   intrg   r  r0   r0   r0   r1   <module>   sh   

	
>-*
*
 (i