o
    ڷi&Q                     @   sh   d dl mZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZ d dlmZ eeZG dd deZdS )	    )	getLoggerN)Fusion)NumpyHelper)	NodeProtohelpernumpy_helper)	OnnxModelc                       s   e Zd ZdZdededef fddZdedefd	d
ZdedefddZ	dd Z
	d+dedededeeef fddZdededededededededededB fddZdd  Zd!d" Zdefd#d$Zd,d%d&Zded'ed(ededef
d)d*Z  ZS )-FusionMultiHeadAttentionSam2zI
    Fuse MultiHeadAttention subgraph of Segment Anything v2 (SAM2).
    modelhidden_size	num_headsc                    s.   t  |ddg || _|| _d| _d| _d S )NMultiHeadAttentionLayerNormalizationT)super__init__r   r   num_heads_warninghidden_size_warning)selfr
   r   r   	__class__ d/home/ubuntu/vllm_env/lib/python3.10/site-packages/onnxruntime/transformers/fusion_attention_sam2.pyr      s
   
z%FusionMultiHeadAttentionSam2.__init__	reshape_qreturnc                 C   s`   d}| j |jd }|dur#t|tjr#t|jdgkr#t|d }t|tr.|dkr.|S dS )Detect num_heads from a reshape node.

        Args:
            reshape_q (NodeProto): reshape node for Q
        Returns:
            int: num_heads, or 0 if not found
        r      N      )	r
   get_constant_valueinput
isinstancenpndarraylistshapeint)r   r   r   shape_valuer   r   r   get_decoder_num_heads#   s   z2FusionMultiHeadAttentionSam2.get_decoder_num_heads
reshape_inc                 C   s   d}| j |jd }|dur$t|tjr#t|jdgkr#t|d }n4| j 	|dd}|durXt
|jdkrX| j |jd }|durXt|tjrXt|jdgkrXt|d }t|trc|dkrc|S dS )r   r   r   N      Concat)r
   r   r   r    r!   r"   r#   r$   r%   match_parentlen)r   r(   r   r&   concat_shaper   r   r   get_encoder_num_heads8   s   z2FusionMultiHeadAttentionSam2.get_encoder_num_headsc                 C   s*   | j |jd }|rt|jd S dS )zDetect hidden_size from LayerNormalization node.
        Args:
            layernorm_node (NodeProto): LayerNormalization node before Q, K and V
        Returns:
            int: hidden_size, or 0 if not found
        r   r   )r
   get_initializerr   r   to_arrayr$   )r   layernorm_nodelayernorm_biasr   r   r   get_hidden_sizeT   s   z,FusionMultiHeadAttentionSam2.get_hidden_sizeFr2   
is_encoderc                 C   s   |r|  |}n| |}|dkr| j}| jdkr1|| jkr1| jr1td| j d| d d| _| |}|dkr=| j}| jdkrZ|| jkrZ| jrZtd| j d| d d| _||fS )a  Detect num_heads and hidden_size.

        Args:
            reshape_q (NodeProto): reshape node for Q
            layernorm_node (NodeProto): LayerNormalization node before Q, K, V
        Returns:
            Tuple[int, int]: num_heads and hidden_size
        r   z--num_heads is z. Detected value is z. Using detected value.Fz--hidden_size is )	r/   r'   r   r   loggerwarningr4   r   r   )r   r   r2   r5   r   r   r   r   r   get_num_heads_and_hidden_sizea   s&   

z:FusionMultiHeadAttentionSam2.get_num_heads_and_hidden_sizeq_matmulq_addk_matmulk_addv_matmulv_addoutputNc
              
   C   s*  |dkr|| dkrt d| d|  dS | j|jd }
| j|jd }| j|jd }|
r8|r8|s:dS t|
}t|}t|}t d|j d|j d|j d	|  | jd
}|j	d |j	d |j	d g}t
jd
||	g|d}d|_|jt
d|g dd}| | |S )aF  Create an Attention node.

        Args:
            q_matmul (NodeProto): MatMul node in fully connection for Q
            q_add (NodeProto): Add bias node in fully connection for Q
            k_matmul (NodeProto): MatMul node in fully connection for K
            k_add (NodeProto): Add bias node in fully connection for K
            v_matmul (NodeProto): MatMul node in fully connection for V
            v_add (NodeProto): Add bias node in fully connection for V
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
            output (str): output name

        Returns:
            Union[NodeProto, None]: the node created or None if failed.
        r   zinput hidden size z# is not a multiple of num of heads Nr   zqw=z kw=z vw=z hidden_size=r   inputsoutputsnamecom.microsoftr   MultiHeadAttention ({})zcross attention)r6   debugr
   r0   r   r   r1   r$   create_node_namer?   r   	make_nodedomain	attributeextendmake_attributeformatincrease_counter)r   r9   r:   r;   r<   r=   r>   r   r   r?   q_weightk_weightv_weightqwkwvwattention_node_nameattention_inputsattention_nodecounter_namer   r   r   create_attention_node   s8   


(

z2FusionMultiHeadAttentionSam2.create_attention_nodec                 C   s  |  |||r	d S | |}|d u r4|jd |vrd S ||jd  }|jdkr)d S | |}|d u r4d S |\	}}}}	}
}}}}|}| ||d\}}|dkrUtd d S | j|	|
|||||||jd d	}|d u rld S | j	
| | j| j|j< | j||g d| _d S )Nr   AddF*fuse_attention: failed to detect num_heads)r?   T)fuse_sam_encoder_patternmatch_attention_subgraphr   op_typer8   r6   rF   rY   r?   nodes_to_addappendthis_graph_namenode_name_to_graph_namerC   nodes_to_removerK   prune_graph)r   normalize_nodeinput_name_to_nodesoutput_name_to_node	match_qkvskip_addreshape_qkvtranspose_qkvr   matmul_qadd_qmatmul_kadd_kmatmul_vadd_vattention_last_nodeq_num_headsq_hidden_sizenew_noder   r   r   fuse   sF   




z!FusionMultiHeadAttentionSam2.fusec              	   C   sl  | j |g dg d}|du rdS |\}}}}}| j |g dg d}|du r0td dS |\}}}}	| j |ddgd	d	g}
|
durK|
\}}ntd
 dS | j |g dg d}|du ritd dS |\}}}}}| j |g dg d}|du rtd dS |\}}}}}| j |g dg d}|du s|d |krtd dS ||||||||	|f	S )z.Match Q, K and V paths exported by PyTorch 2.*rZ   MatMulReshape	Transposerx   )NNNr   r   N)rz   ry   rZ   rx   )r   r   r   Nz&fuse_attention: failed to match v pathSoftmaxrx   r   z'fuse_attention: failed to match qk path)Mulrz   ry   rZ   rx   )r   Nr   r   Nz&fuse_attention: failed to match q path)r   Nr   r   Nz&fuse_attention: failed to match k path)SqrtDivr}   CastSliceShaperz   ry   )Nr   r   r   r   r   r   r   z*fuse_attention: failed to match mul_q pathr
   match_parent_pathr6   rF   )r   node_after_output_projection	qkv_nodes_rj   rk   
matmul_qkvv_nodesrq   rp   qk_nodes_softmax_qk	matmul_qkq_nodesmul_q_transpose_qr   rm   rl   k_nodes_mul_kro   rn   mul_q_nodesr   r   r   r]      sR   





z5FusionMultiHeadAttentionSam2.match_attention_subgraphc                 C   s  | j |g dg d}|d u r| j |g dg d}|d u r*| j |dgdg}|d u r0dS |d }| j|t|d	kr?d	nd d
}|d u rIdS |\}}}	}
}}t|
d}t|trb|g dkrddS t|d}t|tru|g dkrwdS t|d}t|tr|g dkrdS | j |	g dg d}|d u rdS |\}}}| ||d\}}|dkrt	
d dS d}| j |}|d u rtjtjg ddd|d}| j || j | j d}tjd|
jd |g|
jd d g|d}| j| | j| j|j< |
}|jd |jd< |jd d |jd< t	
d|d| | ||||}|d u r,dS t| j ||d	ks:J |jd |jd< | j| | j| j|j< | j|g d| _dS )N)rZ   ry   rz   ry   r   Nr   r   )rZ   r   r   ry   rz   ry   )r   Nr   r   r   r   rZ   r   Fr   r   )input_indexperm)r   r   r   r*   )r   r   r*   r   )ry   rZ   rx   )r   r   NTr[   bsnh_to_bsd_reshape_dims)r   r   r   int64)dtype)rC   ry   _BSDr@   _BNSHzFound MHA: q_num_heads=z q_hidden_size=) r
   r   $match_sam_encoder_attention_subgraphr-   r   get_node_attributer    r#   r8   r6   rF   r0   r   
from_arrayr!   arrayadd_initializerra   rG   r   rH   r   r_   r`   rb   rC   r?   create_mha_nodeget_childrenrc   rK   rd   )r   re   rf   rg   nodesr   matched_sdpareshape_outtranspose_out	split_qkvtranspose_qtranspose_ktranspose_vpermutation_qpermutation_kpermutation_vinput_projection_nodesr(   add_in	matmul_inrs   rt   new_dims_namenew_dimsreshape_q_namer   transpose_k_bnshru   r   r   r   r\   1  s   


z5FusionMultiHeadAttentionSam2.fuse_sam_encoder_patternc              	   C   sj  | j |g d|ddddg}|du rdS |\}}}}}| j |g dg d}|du r3td dS |\}	}}
}| j |ddgddg}|durN|\}}ntd	 dS | j |g d
g d}|du r|| j |g dg d}|du r|td dS |d |
krdS |d }| j |g d
g d}|du rtd dS |d |
krdS |\}}}}|||
|||	fS )z%Match SDPA pattern in SAM2 enconder.*rw   Nr   )rz   SqueezeSplitry   )r   r   r   r   zfailed to match v pathr{   rx   zfailed to match qk path)r|   rz   r   r   r   )	r|   rz   ry   rz   MaxPoolrz   ry   r   r   )	r   Nr   r   r   r   r   r   r   zfailed to match q pathr   r   )r   Nr   r   zfailed to match k pathr   )r   r   r   	out_nodesr   r   r   matmul_qk_vr   r   r   rj   r   r   r   r   r   r   mul_kr   
_squeeze_kr   r   r   r     sN   




zAFusionMultiHeadAttentionSam2.match_sam_encoder_attention_subgraphr   r   c           
      C   sx   | j d}|jd |jd |jd g}|d }tjd||g|d}d|_|jtd|g d	d}	| 
|	 |S )	a  Create a MultiHeadAttention node for SAM2 encoder.

        Args:
            reshape_q (NodeProto): Reshape node for Q, output is 3D BxSxNH format
            transpose_k (NodeProto): Transpose node for K, output is BNSH format
            transpose_v (NodeProto): Transpose node for V, output is BNSH format
            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.

        Returns:
            NodeProto: the MultiHeadAttention node created.
        r   r   _outr@   rD   r   rE   zself attention)r
   rG   r?   r   rH   rI   rJ   rK   rL   rM   rN   )
r   r   r   r   r   rU   rA   r?   rW   rX   r   r   r   r     s"   

z,FusionMultiHeadAttentionSam2.create_mha_node)F)N)__name__
__module____qualname____doc__r   r%   r   r   r'   r/   r4   booltupler8   strrY   rv   r]   r\   r   r   __classcell__r   r   r   r   r	      sv    

$	

@2: 
7r	   )loggingr   numpyr!   fusion_baser   fusion_utilsr   onnxr   r   r   
onnx_modelr   r   r6   r	   r   r   r   r   <module>   s   