o
    ٷiE*                  	   @   s   d dl Z d dlZd dlZd dlm  mZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ e eZG dd	 d	ejZ	
	
ddedededefddZ	
ddedefddZdS )    N)SAM2ImageEncoderrandom_sam2_input_image)SAM2MaskDecoder)SAM2PromptEncoder)SAM2Base)compare_tensors_with_tolerance)nnc                       s   e Zd Z			ddededededed	d
f fddZe 	ddej	dej	dej	dej	dej	dej	dej	dej	defddZ
  ZS )SAM2ImageDecoderTF        	sam_modelmultimask_outputdynamic_multimask_via_stabilityreturn_logitsmask_thresholdreturnNc                    s2   t    t|| _t|||| _|| _|| _d S )N)super__init__r   prompt_encoderr   mask_decoderr   r   )selfr   r   r   r   r   	__class__ f/home/ubuntu/.local/lib/python3.10/site-packages/onnxruntime/transformers/models/sam2/image_decoder.pyr      s
   


zSAM2ImageDecoder.__init__image_features_0image_features_1image_embeddingspoint_coordspoint_labelsinput_maskshas_input_masksoriginal_image_sizeenable_nvtx_profilec
                 C   s  d}
|	rddl m} |g d}
|
dur|
jddd | ||||\}}}|
dur6|
d |
jdd	d | ||||||\}}|
durR|
d |
jd
dd tj||d |d fddd}t	|dd}| j
sp|| jk}|
dur}|
d
 |
  |||fS )a  
        Decode masks from image features and prompts. Batched images are not supported. H=W=1024.

        Args:
            image_features_0 (torch.Tensor): [1, 32, H/4, W/4]. high resolution features of level 0 from image encoder.
            image_features_1 (torch.Tensor): [1, 64, H/8, W/8]. high resolution features of level 1 from image encoder.
            image_embeddings (torch.Tensor): [1, 256, H/16, W/16]. image embedding from image encoder.
            point_coords (torch.Tensor): [L, P, 2] shape and float32 dtype and contains the absolute pixel
                                         coordinate in (x, y) format of the P input points in image of size 1024x1024.
            point_labels (torch.Tensor): shape [L, P] and int32 dtype, where 1 means
                                         positive (foreground), 0 means negative (background), -1 means padding,
                                         2 (box left upper corner), 3 (box right bottom corner).
            input_masks (torch.Tensor): [L, 1, H/4, W/4]. Low resolution mask input to the model.
                                        Typically coming from a previous iteration.
            has_input_masks (torch.Tensor): [L]. 1.0 if input_masks is used, 0.0 otherwise.
            original_image_size(torch.Tensor): [2]. original image size H_o, W_o.
            enable_nvtx_profile (bool): enable NVTX profiling.

        Returns:
            masks (torch.Tensor): [1, M, H_o, W_o] where M=3 or 1. Masks of original image size.
            iou_predictions (torch.Tensor): [1, M]. scores for M masks.
            low_res_masks (torch.Tensor, optional): [1, M, H/4, W/4]. low resolution masks.
        Nr   )
NvtxHelper)r   r   post_processr   blue)colorr   redr$   green   bilinearF)modealign_cornersg      @g      @@)nvtx_helperr#   start_profiler   stop_profiler   Finterpolatetorchclampr   r   print_latency)r   r   r   r   r   r   r   r    r!   r"   r-   r#   sparse_embeddingsdense_embeddingsimage_pelow_res_masksiou_predictionsmasksr   r   r   forward#   s>   $





zSAM2ImageDecoder.forward)TFr
   F)__name__
__module____qualname__r   boolfloatr   r2   no_gradTensorr;   __classcell__r   r   r   r   r	      sL    	
r	   F
sam2_modelonnx_model_pathr   verbosec                 C   s0  d}t |}t|  }||\}}}	td|j td|j td|	j t| |dd }
d}d}tjd	d
||dftj	d}tjd	d||ftj
d}tj|dddtj	d}tjdtj	d}tjddgtj
d}|||	|||||f}td|j td|j td|j td|j td|j |r|
| \}}}td|j td|j td|j g d}g d}ddddddd	did	diddddd	did	did }t ) |stjd!tjjd" tjd!td" tjj|
||dd#d|||d$	 W d    n	1 sw   Y  td%| d S )&Nr)   zimage_features_0.shape: %szimage_features_1.shape: %szimage_embeddings.shape: %sTr   r         r      lowhighsizedtype   rP   i  i  zpoint_coords.shape: %szpoint_labels.shape: %szinput_masks.shape: %szhas_input_masks.shape: %szoriginal_image_size.shape: %szmasks.shape: %sziou_predictions.shape: %szlow_res_masks.shape: %s)r   r   r   r   r   r   r    r!   )r:   r9   r8   
num_labels
num_points)r   r)   original_image_heightoriginal_image_width)r   rI   rJ   )r   r   r   r    r:   r8   r9   ignore)category   )export_paramsopset_versiondo_constant_foldinginput_namesoutput_namesdynamic_axeszdecoder onnx model saved to %s)r   r   cpuloggerinfoshaper	   r2   randintrA   int32zerosonestensorwarningscatch_warningsfilterwarningsjitTracerWarningUserWarningonnxexport)rE   rF   r   rG   
batch_sizeimagesam2_encoderr   r   r   sam2_decoderrS   rT   r   r   r   r    r!   example_inputsr:   r9   r8   r]   r^   r_   r   r   r   export_decoder_onnxs   s   


rv   c                    s  d}t |}t|  }||\}}}t| |dd }	d}
d}tjdd|
|dftjd}tjdd|
|ftjd}tj|
dd	d	tjd
}tjdtjd
}tj	ddgtjd
}||||||||f |	  \}}}dd l
}|j|dgd}| fddttD }td| | fddttD }td|  fddttD }|||}t|D ]\}}t| d|| j q|\}}}td| t	| rtd|t	|rtd|t	|rtd| d S td| d S )Nr)   TrH      r   rK   rI   rL   rQ   rR   i  CPUExecutionProvider)	providersc                       g | ]} | j qS r   name.0i)model_inputsr   r   
<listcomp>       z%test_decoder_onnx.<locals>.<listcomp>zinput_names: %sc                    rz   r   r{   r}   )model_outputsr   r   r      r   zoutput_names: %sc                    s    i | ]}| j  |  qS r   )r|   numpyr}   )ru   r   r   r   
<dictcomp>  s     z%test_decoder_onnx.<locals>.<dictcomp>z
.shape: %sr:   r9   r8   zonnx model has been verified:zonnx model verification failed:)r   r   r`   r	   r2   rd   rA   re   rf   rh   onnxruntimeInferenceSession
get_inputsrangelenra   rb   get_outputsrun	enumeraterc   r   print)rE   rF   r   rq   rr   rs   r   r   r   sam2_image_decoderrS   rT   r   r   r   r    r!   r:   r9   r8   r   ort_sessionr]   r^   inputsoutputsr   output_name	ort_masksort_iou_predictionsort_low_res_masksr   )ru   r   r   r   test_decoder_onnx   sb   
r   )FFr<   )loggingri   r2   torch.nn.functionalr   
functionalr0   image_encoderr   r   r   r   r   r   sam2.modeling.sam2_baser   
sam2_utilsr   	getLoggerr=   ra   Moduler	   strr@   rv   r   r   r   r   r   <module>   s:   
b
b