o
    ڷi                     @   sz   d dl Z d dlmZmZ d dlmZ d dlmZmZ d dl	m
Z
 d dlmZ e eZG dd deZG d	d
 d
eZdS )    N)AttentionMaskFusionAttention)NumpyHelper)	NodeProtohelper)	OnnxModel)BertOnnxModelc                       sl   e Zd ZdZdedededef fddZded	e	d
e	dededededede	dB fddZ
dd Z  ZS )FusionTnlrAttentionz
    Fuse TNLR Attention subgraph into one Attention node.
    TNLR Attention has extra addition after qk nodes and adopts [S, B, NH] as I/O shape.
    modelhidden_size	num_headsattention_maskc                    s   t  |||| d S N)super__init__)selfr
   r   r   r   	__class__ ^/home/ubuntu/vllm_env/lib/python3.10/site-packages/onnxruntime/transformers/onnx_model_tnlr.pyr      s   zFusionTnlrAttention.__init__
mask_indexmatmuladdinputoutput
add_qk_strreturnNc	                 C   s  |dksJ |dkr|| dkrt d| d|  d S | j|jd }	| j|jd p7| j|jd }
|	d u s@|
d u rBd S t|	}t|
}| jd}|	j}t	
|}t	j|d ||d| g|| dd	}	| j|	| j t	j|d
 |d| g|| dd	}
| j|
| j ||d |d
 g}|d ur|| n|d |d ur|d || t	jd||g|d}d|_|jt	d|g |S )Nr   zinput hidden size z# is not a multiple of num of heads    	Attention_qkv_weight   T)name	data_typedimsvalsraw	_qkv_bias )inputsoutputsr!   zcom.microsoftr   )loggerdebugr
   get_initializerr   r   to_arraycreate_node_namer"   r   tensor_dtype_to_np_dtypemake_tensorastypetobytesadd_initializerthis_graph_nameappend	make_nodedomain	attributeextendmake_attribute)r   r   r   r   r   r   r   r   r   weightbias
qkv_weightqkv_biasattention_node_nametensor_dtypenp_typeattention_inputsattention_noder   r   r   create_attention_node   s`   $






z)FusionTnlrAttention.create_attention_nodec                 C   s  |}|j dkr	d S | j|g dg d}|d ur"|\}}}}}	}
nd S g }t|jD ]\}}||vr4q+||d jd kr>q+|| q+t|dkrLd S |d }| j|
g dg d}|d u rbd S |\}}}}}| j|dgdg}|d }| j|
g d	g d
}|d u rd S |\}}}| j|g dg d}|d u rd S |d }|d }| j|g dg d}|d u rd S |d }|d }| j|ddgddg}|d u rd S |jd |kr^d }|}| |||| j	| j
||jd |d jd }|d u rd S | j| | j| j|j< tjdd|j g|jd gd|j g dd}| j|| j |jd |jd< d|j |jd< | j||	|
g | j| | j| | j| | j| d| _d S d S )NSkipLayerNormalization)WhereAddMatMulReshape	TransposerH   )r   r   r   r   r   r   r   r   )rJ   rI   SlicerG   rH   )r   r   r   r   r   rJ   )SoftmaxrG   rH   )r   r   r   )MulrJ   rI   rK   rG   rH   )r   r   r   r   r   r   rI   rF   back_transpose_in_back_transpose_)r   r      )permT)op_typer
   match_parent_path	enumerater   r   r5   lenrD   r   r   nodes_to_addr4   node_name_to_graph_namer!   r   r6   add_nodenodes_to_remover9   prune_graph)r   normalize_nodeinput_name_to_nodesoutput_name_to_node
start_node	qkv_nodes_matmul_belowreshape_qkvtranspose_qkv
matmul_qkvother_inputs_ir   
root_inputv_nodesr   r   upper_nodes	transposeqk_nodesadd_qk	matmul_qkq_nodesk_nodesrelative_position_bias_nodesr   attention_last_nodenew_nodeback_transposer   r   r   fuseg   s   





zFusionTnlrAttention.fuse)__name__
__module____qualname____doc__r   intr   r   strr   rD   rv   __classcell__r   r   r   r   r	      s>    		

Hr	   c                       s$   e Zd Z fddZdd Z  ZS )TnlrOnnxModelc                    s4   t  ||| t| | _t| | j| j| j| _d S r   )r   r   r   r   r	   r   r   attention_fusion)r   r
   r   r   r   r   r   r      s   
zTnlrOnnxModel.__init__c                 C   s   | j   d S r   )r   apply)r   r   r   r   fuse_attention   s   zTnlrOnnxModel.fuse_attention)rw   rx   ry   r   r   r}   r   r   r   r   r~      s    r~   )loggingfusion_attentionr   r   fusion_utilsr   onnxr   r   
onnx_modelr   onnx_model_bertr   	getLoggerrw   r*   r	   r~   r   r   r   r   <module>   s   
 L