o
    ٷi@                     @   s   d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	 d dl
mZmZ d dlmZmZ d dlmZ eeZG dd dZG dd	 d	eZG d
d deZG dd dZdd Zdd Zdd Zedkrle  dS dS )    N)AttentionInputIDsAttentionOutputIDsMultiHeadAttentionInputIDsMultiHeadAttentionOutputIDs	Operators)helper
load_model)	NodeProto	OnnxModel)SymbolicShapeInferenceHelperc                   @   s   e Zd ZdedefddZdedB fddZdedB fd	d
ZdedB fddZ	de
fddZdee dee ddfddZdee dee ddfddZdededdfddZdedB fddZdde
ddfddZdS ) PackingAttentionBasemodelattention_op_typec                 C   sD   || _ g | _g | _d| _i | _| j j jj| _|| _| j 	|| _
d S )NF)r   nodes_to_removenodes_to_addprune_graphnode_name_to_graph_namegraphnamethis_graph_namer   get_nodes_by_op_typeattention_nodes)selfr   r    r   d/home/ubuntu/.local/lib/python3.10/site-packages/onnxruntime/transformers/convert_to_packing_mode.py__init__   s   zPackingAttentionBase.__init__returnNc                 C   sr   | j tjkr	tjntj}|  }|rt|j	|krd S |j	| }| j
D ]}t|j	|ks3|j	| |kr6 d S q#|S N)r   r   	ATTENTIONr   
MASK_INDEXr   KEY_PADDING_MASK_try_getting_first_attentionleninputr   )r   
mask_indexfirst_attention_nodeattention_masknoder   r   r   _try_getting_attention_mask$   s   

z0PackingAttentionBase._try_getting_attention_maskc                 C   s   t | jdkr	d S | jd S )Nr   )r"   r   r   r   r   r   r!   8   s   
z1PackingAttentionBase._try_getting_first_attentionc                 C   s4   d }| j  D ]}|jtjks|jtjkr|}q|S r   )r   nodesop_typer   	LAYERNORMSKIPLAYERNORM)r   last_layernorm_noder'   r   r   r   _try_getting_last_layernorm>   s   z0PackingAttentionBase._try_getting_last_layernormc                 C      t  r   NotImplementedErrorr)   r   r   r   _are_attentions_supportedE      z.PackingAttentionBase._are_attentions_supportedinputsoutputsc                 C   B   t jtj||| jtjd}d|_| j| | j	| j
|j< d S Nr5   r6   r   com.microsoft)r   	make_noder   REMOVEPADDINGr   create_node_namedomainr   appendr   r   r   r   r5   r6   new_noder   r   r   _insert_removepadding_nodeH      z/PackingAttentionBase._insert_removepadding_nodec                 C   r7   r8   )r   r;   r   RESTOREPADDINGr   r=   r>   r   r?   r   r   r   r@   r   r   r   _insert_restorepadding_nodeT   rC   z0PackingAttentionBase._insert_restorepadding_nodetoken_offsetcumulative_sequence_lengthc                 C   r0   r   r1   )r   rF   rG   r   r   r   )_replace_attention_with_packing_attention`   r4   z>PackingAttentionBase._replace_attention_with_packing_attentionc                 C   s   | j tjkr|jtj S d S r   )r   r   r   r#   r   INPUT)r   r%   r   r   r   _get_input_to_remove_paddingc   s   z1PackingAttentionBase._get_input_to_remove_paddingTuse_symbolic_shape_inferc                 C   s  t d |  sd S |  }|sd S |  }|  }|sd S | |}|s(d S |d }|d }|d }|d }	| ||g||||	g | j	|| t d |j
d d }
| |
|g|j
d g | j|j
d |
 t d	|j d
 | || t d| j d| j  | j| j | j| j| j | jr| j  n| js| jr| j  | j  |rt| jjdd}|j| jjddd}|r|| j_d S d S d S )Nz$start converting to packing model..._no_padding_token_offset_cumulated_seq_len_max_seq_lenz'inserted RemovePadding before Attentionr   _restore_inputz#inserted RestorePadding after last z layerz	replaced z with PackedverboseTF)
auto_mergeguess_output_rank)loggerdebugr3   r(   r!   r/   rJ   rB   r   replace_input_of_all_nodesoutputrE   replace_output_of_all_nodesr+   rH   r   remove_nodesr   	add_nodesr   r   r   update_graphclean_shape_inferr   infer_shapes)r   rK   r&   r%   r.   input_to_remove_paddingoutput_without_paddingrF   cumulated_seq_lenmax_seq_lenrestorepadding_inputshape_infer_helperinferred_modelr   r   r   converth   sV   





zPackingAttentionBase.convertT)__name__
__module____qualname__r
   strr   r(   r	   r!   r/   boolr3   listrB   rE   rH   rJ   rf   r   r   r   r   r      s    
r   c                       sF   e Zd Zdef fddZdefddZdededd	fd
dZ  Z	S )PackingAttentionr   c                       t  |tj d S r   )superr   r   r   r   r   	__class__r   r   r         zPackingAttention.__init__r   c                 C   s   | j D ]K}t|dd ur dS t|dd ur dS t|d}|d ur,|dkr, dS t|jtjkr=|jtj s= dS t|jtjkrN|jtj sN dS qdS )Npast_present_share_bufferF	do_rotaryunidirectionalr   T)r   r
   get_node_attributer"   r#   r   PASTPAST_SEQUENCE_LENGTH)r   r'   unidirection_attrr   r   r   r3      s    

z*PackingAttention._are_attentions_supportedrF   rG   Nc              	   C   s   | j D ]f}t|jtjkr|jtj nd}tjtj|jtj	 |jtj
 |jtj |||g|jtj g| jtjd}g }|jD ]}|jdv rL|| q@|j| d|_| j| | j| | j| j|j< qtdt| j  d S )N r9   )	num_headsqkv_hidden_sizesscaler:   z0Converted %d Attention nodes to PackedAttention.)r   r"   r#   r   ATTENTION_BIASr   r;   r   PACKEDATTENTIONrI   WEIGHTSBIASrX   r   OUTPUTr   r=   	attributer   r?   extendr>   r   r   r   r   rU   info)r   rF   rG   	attentionattention_biaspacked_attention
attributesattrr   r   r   rH      s8   






z:PackingAttention._replace_attention_with_packing_attention)
rh   ri   rj   r
   r   rl   r3   rk   rH   __classcell__r   r   rr   r   rn      s    rn   c                       s|   e Zd Zdef fddZdedefddZdedefdd	Zd
e	fddZ
deded
dfddZd
edB fddZ  ZS )PackingMultiHeadAttentionr   c                    ro   r   )rp   r   r   MULTI_HEAD_ATTENTIONrq   rr   r   r   r      rt   z"PackingMultiHeadAttention.__init__indexr   c                 C   D   t |j|kr t |j| dkr td| d| d|  dS dS )'Check a node does not have given input.r   znode input  (0) is not supported in PackedMultiHeadAttention: FT)r"   r#   rU   errorr   r'   r   r   r   r   r   _check_empty_input   
   z,PackingMultiHeadAttention._check_empty_inputc                 C   r   )r   r   znode output r   r   FT)r"   rX   rU   r   r   r   r   r   _check_empty_output   r   z-PackingMultiHeadAttention._check_empty_outputr   c                 C   s   | j D ]T}|jD ]}|jdvrtd|j d|    dS q|jtj r4|jtj s4td  dS | 	|tj
drT| 	|tjdrT| |tjdrT| |tjdsW dS qdS )	Nr}   mask_filter_valuer   znode attribute z/ is not supported in PackedMultiHeadAttention: Fz=packed kv format is not supported in PackedMultiHeadAttentionpast_keypresent_keyT)r   r   r   rU   r   r#   r   KEYVALUEr   PAST_KEY
PAST_VALUEr   r   PRESENT_KEYPRESENT_VALUE)r   r'   r   r   r   r   r3      s(   



z3PackingMultiHeadAttention._are_attentions_supportedrF   rG   Nc           
   
   C   sH  d}| j D ]}t|jtjkr|jtj nd}tjtj|jtj	 |jtj
 |jtj |jtj |||g|jtj g| jtjd}g }|jD ]}|jdv rS|| qG|j| d|_| j| | j| | j| j|j< |r| j|tj}	|	r|	jdkrt|	jdkr|	j| |d7 }qtd	t| j  td
| d S )Nr   r|   r9   r   r:   GatedRelativePositionBias      zBConverted %d MultiHeadAttention nodes to PackedMultiHeadAttention.z=Converted %d GatedRelativePositionBias nodes to packing mode.)r   r"   r#   r   r   r   r;   r   PACKED_MULTI_HEAD_ATTENTIONQUERYr   r   r   rX   r   r   r   r=   r   r   r?   r   r>   r   r   r   r   
get_parentr+   rU   r   )
r   rF   rG   gated_relative_pos_bias_countmhar   
packed_mhar   r   rel_pos_bias_noder   r   r   rH     sP   




	



zCPackingMultiHeadAttention._replace_attention_with_packing_attentionc                 C   s*   | j |d}|r|jdkr|jd S d S )Nr   MatMul)r   r   r+   r#   )r   r%   matmulr   r   r   rJ   4  s   
z6PackingMultiHeadAttention._get_input_to_remove_padding)rh   ri   rj   r
   r   intrk   r   r   rl   r3   rH   rJ   r   r   r   rr   r   r      s    0r   c                   @   s.   e Zd ZdefddZd
deddfdd	ZdS )PackingModer   c                 C   s
   || _ d S r   )r   rq   r   r   r   r   =  s   
zPackingMode.__init__TrK   r   Nc                 C   sn   | j tjr| j tjrtd d S t| j }||S | j tjr0t	| j }||S td d S )NzRPacking mode does not support both Attention and MultiHeadAttention in same graph.zPPacking mode requires either Attention or MultiHeadAttention node in onnx graph.)
r   r   r   r   r   rU   r   rn   rf   r   )r   rK   packingr   r   r   rf   @  s   





zPackingMode.convertrg   )rh   ri   rj   r
   r   rl   rf   r   r   r   r   r   <  s    r   c                  C   sx   t jdd} | jddtdd | jddtdd | jd	d
ddd | jd
d | jdd
ddd | jd
d |  }|S )Nz_Convert to packing mode tool for ONNX Runtime. It converts BERT like model to use packing mode.)descriptionz--inputTzinput onnx model path)requiredtypehelpz--outputzoptimized onnx model pathz	--verboseF
store_truezshow debug information.)r   actionr   rQ   z--use_external_data_formatz4use external data format to store large model (>2GB)use_external_data_format)argparseArgumentParseradd_argumentrk   set_defaults
parse_args)parserargsr   r   r   _parse_argumentsO  s    r   c                 C   s&   | rt jddd d S t jdd d S )NDEBUGz8[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s)levelfmtz%(funcName)20s: %(message)s)r   )coloredlogsinstallrQ   r   r   r   _setup_loggerg  s   
r   c                  C   s|   t  } t| j td|   tj| jtj| j	kr#t
d t| j}tt|}|  |jj| j	| jd d S )Nz
arguments:zYSpecified the same input and output path. Note that this may overwrite the original modelr   )r   r   rR   rU   rV   ospathrealpathr#   rX   warningr   r   r
   rf   r   save_model_to_filer   )r   r   packing_moder   r   r   mainq  s   


r   __main__)r   loggingr   r   	constantsr   r   r   r   r   onnxr   r   
onnx_modelr	   r
   rd   r   	getLoggerrh   rU   r   rn   r   r   r   r   r   r   r   r   r   <module>   s(   
 
9a

