o
    hiR                     @   sl  d dl mZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZmZ d dlmZ d dlmZmZmZmZmZ d d	lmZ eeZG d
d dZG dd dZG dd dZG dd dZG dd dZG dd dZG dd dZ G dd dZ!G dd deZ"G dd deZ#G dd de"Z$G d d! d!e"Z%G d"d# d#e"Z&G d$d% d%e"Z'G d&d' d'eZ(dS )(    )	getLoggerN)DynamoOnnxHelper)Fusion)AttentionOpTypeFusionOptions) FusionBiasSkipLayerNormalizationFusionSkipLayerNormalization)NumpyHelper)
ModelProto	NodeProtoTensorProtohelpernumpy_helper)	OnnxModelc                   @      e Zd Zdd ZdS )ProcessGemmWFuncc                 C   s   t |dS )N   r   )np	transposeselfx r   e/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/onnxruntime/transformers/onnx_model_phi.py__call__      zProcessGemmWFunc.__call__N__name__
__module____qualname__r   r   r   r   r   r          r   c                   @   r   )ProcessMatMulQFuncc                 C   s   t t |ddd dS )N   r   r   r   r   splitr   r   r   r   r         zProcessMatMulQFunc.__call__Nr   r   r   r   r   r"      r!   r"   c                   @   r   )ProcessMatMulKFuncc                 C      t t |ddd dS )Nr#   r   r   r   r$   r   r   r   r   r      r&   zProcessMatMulKFunc.__call__Nr   r   r   r   r   r'      r!   r'   c                   @   r   )ProcessMatMulVFuncc                 C   r(   )Nr#   r      r   r$   r   r   r   r   r   $   r&   zProcessMatMulVFunc.__call__Nr   r   r   r   r   r)   #   r!   r)   c                   @   r   )ProcessBiasQFuncc                 C      t |ddd }|S )Nr#   r   r   r%   r   r   r   r   r   )      zProcessBiasQFunc.__call__Nr   r   r   r   r   r+   (   r!   r+   c                   @   r   )ProcessBiasKFuncc                 C   r,   )Nr#   r-   r   r.   r   r   r   r   r   /   r/   zProcessBiasKFunc.__call__Nr   r   r   r   r   r0   .   r!   r0   c                   @   r   )ProcessBiasVFuncc                 C   r,   )Nr#   r-   r*   r.   r   r   r   r   r   5   r/   zProcessBiasVFunc.__call__Nr   r   r   r   r   r1   4   r!   r1   c                   @   r   )ProcessRotCacheFuncc                 C   s8   t |jdks	J |jd dkr|d d ddf S |S )Nr*   r       r      )lenshaper   r   r   r   r   ;   s   zProcessRotCacheFunc.__call__Nr   r   r   r   r   r2   :   r!   r2   c                       s  e Zd Zdedee f fddZdefddZdd	 Z	d
d Z
dd Zdd Zd7ddZdd Zdd Zdd Zdee dedee fddZd8dee d ee d!efd"d#Zd8dee d ee d!efd$d%Zd9dee d ee d!efd'd(Zd8dee d ee d!efd)d*Zd8dee d ee d!efd+d,Zd:dee d ee d!efd-d.Zd:dee d ee d!efd/d0Zd:dee d ee d!efd1d2Z		&	3	4d;dee d ee d!efd5d6Z  ZS )<Fissionmodelnodes_to_findc                    s   t  |d| d S )NDONOTUSEsuper__init__)r   r8   r9   	__class__r   r   r=   E   s   zFission.__init__attn_op_typec                 C   s
   || _ d S N)r@   )r   r@   r   r   r   set_attention_op_typeL   s   
zFission.set_attention_op_typec                 C   s   |d t | S )N_)str)r   layer_idnamer   r   r   	get_unameO   s   zFission.get_unamec                 C   s>   |D ]}||ks| |s||r|  S qtd| d)NzEdge z
 not found)endswith
startswith
ValueError)r   edgesrF   edger   r   r   get_edge_by_nameR   s
   zFission.get_edge_by_namec                 C      |  |j|S rA   )rM   inputr   noderF   r   r   r   get_input_by_nameX      zFission.get_input_by_namec                 C   rN   rA   )rM   outputrP   r   r   r   get_output_by_name[   rS   zFission.get_output_by_nameNc                 C   sd   | j |}t|}||}tj|d u r|d n|tj|j|	 
 dd}| j || j |jS )N
_processedT	data_typedimsvalsraw)r8   get_initializerr	   to_arrayr   make_tensorr   FLOATr6   flattentobytesadd_initializerthis_graph_namerF   )r   initializer_namefunctorcustom_namei
i_np_arrayprocessed_i_np_array
new_tensorr   r   r   process_initializer^   s   

zFission.process_initializerc                 C   &   | j  j }||_tj|jj_	d S rA   )
r8   graph
value_infoaddrF   r   r_   typetensor_type	elem_typer   rF   new_value_infor   r   r   add_fp32_value_infol      zFission.add_fp32_value_infoc                 C   rl   rA   )
r8   rm   rn   ro   rF   r   INT64rp   rq   rr   rs   r   r   r   add_int64_value_infoq   rv   zFission.add_int64_value_infoc                 C   s\   | j  jD ]}|j|kr| j  j|  nqtj|tj|d}| j  j	|g d S )Nrr   r6   )
r8   rm   rn   rF   remover   make_tensor_value_infor   r_   extend)r   rF   r6   rn   rt   r   r   r   replace_fp32_value_infov   s   
zFission.replace_fp32_value_infosubgraph_nodesrE   layer_known_edges_namesc                 C   s   |D ]_}t |jD ]\}}|dkrq	||vr'| |||j|< | |j|  q	t |jD ]\}}|dkr6q-||vrK| |||j|< | |j|  q-| ||j|_| j| | j| j	|j< qd S )N )
	enumeraterO   rG   ru   rT   rF   nodes_to_addappendrc   node_name_to_graph_name)r   r~   rE   r   new_noderg   rF   r   r   r   set_unique_name_and_add_nodes   s&   z%Fission.set_unique_name_and_add_nodesr   inputsoutputsprefixc                 C   s>   t |dksJ t |dksJ tjd|||d dd}|gS )Nr#   r   LayerNormalization_LayerNormalizationg   >)r   r   rF   epsilonr5   r   	make_noder   r   r   r   rQ   r   r   r   	layernorm      zFission.layernormc                 C   sr   t |dksJ t |dksJ tjd|d |d g|d g|d d}tjd|d |d g||d	 d}||gS )
Nr#   r   MatMulr   
matmul_outr   r   rF   Addr*   Biasr   )r   r   r   r   matmulro   r   r   r   gemm   s   zFission.gemmr3   c              	   C   sB   t |dksJ t |dksJ tjd|||d d||d}|gS )N   r   RotaryEmbeddingcom.microsoft)r   r   rF   domainrotary_embedding_dim	num_headsr   )r   r   r   r   rot_dimr   rQ   r   r   r   rotary      	zFission.rotaryc                 C   s>   t |dksJ t |dksJ tjd|||d dd}|gS )Nr   FastGelur   )r   r   rF   r   r   r   r   r   r   fastgelu   r   zFission.fastgeluc                 C   s<   t |dksJ t |dksJ tjd|||d d}|gS )Nr*   r   r   r   r   r   r   r   r   ro      s   zFission.addc              	   C   sB   t |dksJ t |dksJ tjd|||d d|dd}|gS )N   r#   MultiHeadAttentionr   r   )r   r   rF   r   r   unidirectionalr   r   r   r   r   r   rQ   r   r   r   mha   r   zFission.mhac              	   C   sB   t |dksJ t |dksJ tjd|||d d||d}|gS )N   r#   GroupQueryAttentionr   )r   r   rF   r   r   kv_num_headsr   r   r   r   r   gqa   r   zFission.gqac                 C   sF   t |dksJ t |dksJ tjd|||d d|dddd	}|gS )N   r*   	Attentionr   r   r3   )r   r   rF   r   r   r   	do_rotaryr   r   r   r   r   r   	attention   s   zFission.attentionP      %?c                 C   sF   t |dksJ t |dksJ tjd|||d d||||d	}|gS )N   r   PagedAttentionzvllm.ort.ext)r   r   rF   r   r   num_kv_heads	head_sizescaler   )r   r   r   r   r   r   r   rQ   r   r   r   
paged_attn  s   	zFission.paged_attnrA   )r   )r   r3   r3   )r   r3   )r   r3   r   r   )r   r   r    r   listrD   r=   r   rB   rG   rM   rR   rU   rk   ru   rx   r}   r   intr   r   r   r   r   ro   r   r   r   r   __classcell__r   r   r>   r   r7   D   sR    

        r7   c                       s\   e Zd Zdededef fddZdefddZd	d
 Zde	fddZ
de	fddZ  ZS )Phi2PreProcessorr8   r   hidden_sizec                    s(   t  | d| _|| _|| _d| _d S )Nr3   modeling_phi_PhiModel_model_1)r<   r=   num_hidden_layersnum_attention_headsr   	func_namer   r8   r   r   r>   r   r   r=     s
   
zPhi2PreProcessor.__init__returnc                 C   s   i }d|d< d|d< d|d< d|d< t d	| jd	D ],}d
| |d| < d| |d| < d| |d| d< d| |d| d< qdd | jjjD }d|v rbd|v rbd|d< d|d< |S d|v rjd|v slJ d|d< d|d< |S )Nlogits	lm_head_1	input_idsl_input_ids_
past_key_0
key_statespast_value_0value_statesr   	past_key_key_states_past_value_value_states_present_key_model_layers__1present_value__1_1c                 S   s   g | ]}|j qS r   rF   ).0or   r   r   
<listcomp>2  s    z7Phi2PreProcessor.get_phi2_edge_dict.<locals>.<listcomp>model_layers_0_1_1model_layers_0_1_2present_key_0present_value_0model_layers_0_1)ranger   r8   rm   rT   )r   	edge_dictrg   r   r   r   r   get_phi2_edge_dict&  s&   z#Phi2PreProcessor.get_phi2_edge_dictc                 C   s<   d}| j jjD ]}|j|}|dkr|j|d  |_qd S )N)modeling_phi_PhiDecoderLayer_model_layersr-   )r8   rm   rQ   op_typefind)r   phi2_transformer_layer_namerQ   indexr   r   r   simplify_phi2_op_type<  s   z&Phi2PreProcessor.simplify_phi2_op_typer@   c              
   C   s  |t jk| _|t jk| _| jj}g }|jD ]}d|jv rkt	j
|j| js&tjntjddgd}t	j
dtjdgd}t	j
dtjddgd}t	j
dtjddgd}t	j
d	tjdgd}	| jsc||||gn||||	g | jrd
|jv rt	j
|jd
d|jjjdd| jd| j| j gd}
||
g q| jrd
|jv rt	j
|j|jjjg dd}
||
g d|jv rt	j
|j|jjjg dd}
||
g qd
|jv sd|jv rt	j
|j|jjjd| jd| j| j gd}
||
g q|d |j| g }t|jD ]_\}}|dkr||g q| jr?d|jv r=t	j
|jdd|jjjdd| jd| j| j gd}
||
g q| jrEqt	j
|j|jjjd| jd| j| j gd}
||
g q|d |j| d S )Nr   
batch_sizeseq_lenry   stepr   position_idsattention_maskinput_metadatapast_keypastr*   past_seq_len)
num_blocksr   head_size_x
block_sizeblock_x
past_value)r   r   r   r   rO   r   present_keypresenttotal_seq_lenrT   )r   r   use_attnr   use_vllmr8   rm   rO   rF   r   r{   r   INT32rw   r|   replacerp   rq   rr   r   r   
ClearFieldr   rT   )r   r@   rm   
new_inputsvivi_iidvi_stepvi_pidvi_maskvi_metavi_cachenew_outputsrg   r   r   r   process_graph_ioC  s   














z!Phi2PreProcessor.process_graph_ioc                 C   s~   d }| j jD ]}|j| jr|j} nq|d usJ | | | |   |   | 	  |t
jkr8|   | | d S rA   )r8   	functionsrF   rH   r   unroll_functionupdate_edgesr   r   remove_dropout_layerr   r   remove_lm_head_layerr  )r   r@   function_namefuncr   r   r   preprocess_onnx  s   

z Phi2PreProcessor.preprocess_onnx)r   r   r    r
   r   r=   dictr   r   r   r  r  r   r   r   r>   r   r     s    }r   c                       *   e Zd Zdef fddZdd Z  ZS )FissionTransformerEmbeddingPhir8   c                       t  |dg d S )N6torch_nn_modules_sparse_Embedding_model_embed_tokens_1r;   r   r8   r>   r   r   r=        z'FissionTransformerEmbeddingPhi.__init__c           	      C   s   t d|j t|jdksJ t|jdksJ |jd }|jd }| |d}|||g}tjd||g|gddg}| 	|d| | j
| d	| _d S )
NOptimizing %s...r*   r   r   zembed_tokens.weightGatherEmbedding_Gatherr   T)loggerinforF   r5   rO   rT   rR   r   r   r   nodes_to_remover   prune_graph)	r   rQ   input_name_to_nodesoutput_name_to_noderO   rT   	embeddingr   r~   r   r   r   fuse  s"   


	
z#FissionTransformerEmbeddingPhi.fuser   r   r    r   r=   r   r   r   r   r>   r   r    
    r  c                       r  )FissionTransformerLayerNormPhir8   c                    r  )N@torch_nn_modules_normalization_LayerNorm_model_final_layernorm_1r;   r  r>   r   r   r=     r  z'FissionTransformerLayerNormPhi.__init__c           
      C   s   t d|j t|jdksJ t|jdksJ |jd }|jd }| |d}| |d}||||g}g }	|	| |||g|gd | 	|	d| | 
|g d	 | 
|g d	 | j| d
| _d S )Nr  r#   r   r   zfinal_layernorm.weightzfinal_layernorm.biasFinalc   r   r   r   T)r  r  rF   r5   rO   rT   rR   r|   r   r   r}   r  r   r  )
r   rQ   r  r  rO   rT   	ln_weightln_biasr   r~   r   r   r   r     s   


z#FissionTransformerLayerNormPhi.fuser!  r   r   r>   r   r#    r"  r#  c                       r  )!FissionTransformerCausalLMHeadPhir8   c                    r  )N(torch_nn_modules_linear_Linear_lm_head_1r;   r  r>   r   r   r=     r  z*FissionTransformerCausalLMHeadPhi.__init__c           
      C   s   t d|j t|jdksJ t|jdksJ |jd }|jd }| | |dt }| |d}||||g}g }	|		| 
|||g|gd | |	d	| | |g d
 | |g d | j| d| _d S )Nr  r   r   r*   r   zlm_head.weightzlm_head.biasLMHead_r&  r'  )r   r   i   T)r  r  rF   r5   rO   rT   rk   rR   r   r|   r   r   r}   r  r   r  )
r   rQ   r  r  rO   rT   	fc_weightfc_biasr   r~   r   r   r   r     s   


z&FissionTransformerCausalLMHeadPhi.fuser!  r   r   r>   r   r*    r"  r*  c                       sF   e Zd Zdedef fddZdd Zdd Zd	d
 Zdd Z	  Z
S )FissionTransformerBlockPhir8   r   c                    sT   || _ d}i | _g }t|D ]}d| d}|| || j|< qt || d S )Nr3   *modeling_phi_PhiDecoderLayer_model_layers_r   )r   func_to_layer_idr   r   r<   r=   )r   r8   r   max_num_layersr9   layerr   r>   r   r   r=   5  s   
z#FissionTransformerBlockPhi.__init__c                 C   s   | j |j S rA   )r1  r   )r   rQ   r   r   r   get_layer_idE  r   z'FissionTransformerBlockPhi.get_layer_idc                 C   s   t jddgdgdtjdt jdddgdgd	d
t jdddgdgdd
t jddgdgdtjdt jddgdgdd
t jdddgdgdddt jddgdgdtjdg}|S )NCastr   
mask_int64Cast_gqa_aux_0)r   r   rF   to	ReduceSumonemask_row_sumsReduceSum_gqa_auxr   Subseqlens_k_int64Sub_gqa_aux	seqlens_kCast_gqa_aux_1Shape
mask_shapeShape_gqa_aux_0r  total_seq_len_int64Gather_gqa_aux_0r   )r   r   rF   axistotal_sequence_lengthCast_gqa_aux_2)r   r   r   rw   r   )r   gqa_aux_nodesr   r   r   get_gqa_aux_nodesH  sV   +z,FissionTransformerBlockPhi.get_gqa_aux_nodesc	                 C   sX  | j |}	| j |}
| j |}tt|	d}tt|
d}tt|d}tj|||fdd}| j |}| j |}| j |}t|}t|}t|}tj|||fdd}|jd }tj	|t
j||d g|  dd}| j || j tj	|t
j|d g|  dd}| j || j | |j | |j ||fS )Nr   r   )rG  r   r#   TrW   )r8   r\   r   r   r	   r]   stackr6   r   r^   r   r_   r`   ra   rb   rc   ru   rF   )r   q_wk_wv_wq_bk_bv_bweight_name	bias_nameq_weightk_weightv_weightqwkwvw
qkv_weightq_biask_biasv_biasqbkbvbqkv_biasr   weightbiasr   r   r   pack_qkv_gemmv  sD   






z(FissionTransformerBlockPhi.pack_qkv_gemmc           $      C   s  t d|j t d| j  | |}|jd }| |d}| |d}|jd }| |d}	| |d}
| |d	}| |d
}d\}}}}}}d\}}d\}}| jt	j
kr| | |dt }| | |dt }| | |dt }| |d}| |d}| |d}| | |dt }| | |dt }n.| | |d| |d| |d| |d| |d| |d| |d| |d\}}| | |dt }| |d}| | |dt }| | |dt }| |d}| |d}g }||||g |||	|
g |||g | jt	j
kr2|||||||||g n|||g |||||||g |g d g }|| |||gdg || d||gd gd! || d||gd"gd# || d"gd$g || d$||gd%gd& || d d%gd'gd( || |d'g|gd) | jt	j
kr|| d||gd*gd+ || d||gd,gd- || d||gd.gd/ | jt	jkrd0nd1}|| d*|||gd2gd+ || d,|||gd3gd- | jt	jkr|| d2d3d.d4d5d4||gd|	|
g n| jt	jkri|| d2d3d.||d6d7gd|	|
g |dkrh|  } | D ]}!| j|! | j| j|!j< qD| j !t"j#t$j%d8gd9d:d;d<| j n9| jt	jkr|| &d2d3d.||d=gdg n!d>| }"d?| }#||"|#g || 'd||d5|"gd|#g | (||| | )|g d@ | )|g d@ | j*| dA| _+d S )BNr  zAttentionOpType: r   r   r   r-   r   present_valuezinput_layernorm.weightzinput_layernorm.bias)NNNNNN)NNzself_attn.q_proj.weightzself_attn.k_proj.weightzself_attn.v_proj.weightzself_attn.q_proj.biaszself_attn.k_proj.biaszself_attn.v_proj.biaszrotary_emb.cos_cachedzrotary_emb.sin_cachedattn_qkv_weightattn_qkv_biaszself_attn.dense.weightzself_attn.dense.biaszmlp.fc1.weightzmlp.fc2.weightzmlp.fc1.biaszmlp.fc2.bias)r   r   r@  rH  r   r   ln_outattn_outattn_add_outOutProj_fc1_outFC1_gelu_outfc2_outFC2_residual_1_out
Residual_1
Residual_2queryQ_keyK_valueV_r   r   	query_rotkey_rotr   r   r@  rH  r   int64)dtyper:  r   r   past_present_r'  T),r  r  rF   r@   r4  rO   rR   rT   rU   r   r   rk   r   r2   re  rG   r|   r   r   r   ro   r   r   r   r   r   r   rK  r   r   rc   r   r8   rb   r   
from_arrayr   arrayr   r   r   r}   r  r  )$r   rQ   r  r  rE   i_hidden_statesi_key_cachei_value_cacheo_hidden_stateso_key_cacheo_value_cacher(  r)  attn_q_weightattn_q_biasattn_k_weightattn_k_biasattn_v_weightattn_v_biasrg  rh  	cos_cache	sin_cacheattn_out_weightattn_out_biasmlp_fc1_weightmlp_fc2_weightmlp_fc1_biasmlp_fc2_biasr   r~   pos_ids_namerJ  r   	past_namepresent_namer   r   r   r     s  










	



zFissionTransformerBlockPhi.fuse)r   r   r    r   r   r=   r4  rK  re  r   r   r   r   r>   r   r/  4  s    .*r/  c                       sX   e Zd Zdededef fddZddedB d	ef fd
dZdd Z	dddZ
  ZS )PhiOnnxModelr8   r   r   c                    sJ   t  | t| j||| _t| || _t| | _t	| | _
t| | _d S rA   )r<   r=   r   r8   phi2_preprocessorr/  fission_transformer_blockr*  fission_causal_lm_headr#  fission_transformer_layernormr  fission_transformer_embeddingr   r>   r   r   r=   O  s   

zPhiOnnxModel.__init__NFoptionsadd_dynamic_axesc                    s   |d usJ |j }| j| | j| | j  | j  | j  | j  t	 
  t| | _t| | _| j  | j  d S rA   )attention_op_typer  rB   r  r  applyr  r  r  r<   r  r   fuse_slnr   fuse_bias_sln)r   r  r  r@   r>   r   r   optimizeW  s   







zPhiOnnxModel.optimizec                 C   s@   i }g d}|D ]}|  |}t|||< qtd|  |S )z8
        Returns node count of fused operators.
        )	r   r   r   r   GeluBiasGelur   r   SkipLayerNormalizationzOptimized operators: )get_nodes_by_op_typer5   r  r  )r   op_countopsopnodesr   r   r   get_fused_operator_statisticsl  s   
z*PhiOnnxModel.get_fused_operator_statisticsc                    s    du r|    dtf fdd}|d|d |d |d }|d	|d
 |d }|d|d }|dko@||ko@||k}|dkrJtd |dkrStd |dkr\td |S )zA
        Returns True when the model is fully optimized.
        Nop_namec                    s     | pdS )Nr   )get)r  fused_op_countr   r   r    rS   z1PhiOnnxModel.is_fully_optimized.<locals>.op_countr   r   r   r   r  r  r   r   r  r   zLayer Normalization not fusedzGelu (or FastGelu) not fusedz+Attention (or MultiHeadAttention) not fused)r  rD   r  debugwarning)r   r  r  r   gelu
layer_norm
is_perfectr   r  r   is_fully_optimized  s*   


zPhiOnnxModel.is_fully_optimized)NFrA   )r   r   r    r
   r   r=   r   boolr  r  r  r   r   r   r>   r   r  N  s
    r  ))loggingr   numpyr   dynamo_onnx_helperr   fusion_baser   fusion_optionsr   r   fusion_skiplayernormr   r   fusion_utilsr	   onnxr
   r   r   r   r   
onnx_modelr   r   r  r   r"   r'   r)   r+   r0   r1   r2   r7   r   r  r#  r*  r/  r  r   r   r   r   <module>   s:   
 Z 4"!!  