o
    ߥi9A                     @   s   d dl Z d dlZd dlZd dlmZ d dlm  mZ G dd dejZ	G dd dejZ
dddZd	d
 ZdddZG dd dejZdd ZdS )    Nc                       sL   e Zd ZdZ									d fdd	Zdd Zejjd	d
 Z	  Z
S )DetectorzO This is a combination of "Swin with RAM" and a "Neck-free Deformable Decoder" FN   c              
      s  t    | _| _|j}t|| _t||dd _	| _
| _| _|	 _ jr@td|
 ddd t||
 jjd _| _| _| _|du rt|j}g }t|D ]}|j| }|ttj||dd	td
| qXt| _ jD ]}tjj|d jdd tj |d j!d q|d _"n| _"ttj|jd |dd	td
| _#ttj||dd	td
| _$d}t%&d| |  }t'(||  jj!_)tj  j	j*d jj)d tj  j	j*d j!j)d tjj j#d jdd tj  j#d j!d tjj j$d jdd tj  j$d j!d  jr<tj  jj*d jj)d tj  jj*d j!j)d |j+j,d }|rmt- j| _t- j	| _	tj  j	d j*d j!j)dd d  j	 jj+_	n4tj  j	j*d j!j)dd d t fddt|D  _t fddt|D  _	d jj+_	 jrtj  jj*d j!j)dd d t fddt|D  _ jrt||dd _.|rt- j.| _.dS t fddt|D  _.dS dS )a   Initializes the model.
        Args:
            backbone: torch module of the backbone to be used. See backbone.py
            transformer: torch module of the transformer architecture. See transformer.py
            num_classes: number of object classes
            num_queries: number of object queries (i.e., det tokens). This is the maximal number of objects
                         DETR can detect in a single image. For COCO, we recommend 100 queries.
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
            with_box_refine: iterative bounding box refinement
            epff: None or fusion module available
            iou_aware: True if iou_aware is to be used.
              see the original paper https://arxiv.org/abs/1912.05992
            token_label: True if token_label is to be used.
              see the original paper https://arxiv.org/abs/2104.10858
            distil: whether to use knowledge distillation with token matching
              z Training with vector_hidden_dim .T)flushN   )kernel_size    r   )gaing{Gz?   g       c                       g | ]} j qS  )class_embed.0_selfr   R/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/vidt/head.py
<listcomp>       z%Detector.__init__.<locals>.<listcomp>c                    r   r   )
bbox_embedr   r   r   r   r      r   c                    r   r   )vector_embedr   r   r   r   r      r   c                    r   r   )	iou_embedr   r   r   r   r      r   )/super__init__num_queriestransformerd_modelnnLinearr   MLPr   aux_losswith_box_refinewith_vectorprocessor_dctprintn_keepr   	iou_awaretoken_labeldistillennum_channelsrangeappend
SequentialConv2d	GroupNorm
ModuleList
input_projinitxavier_uniform_weight	constant_biasfusiontgt_projquery_pos_projmathlogtorchonesdatalayersdecoder
num_layers_get_clonesr   )r   backboner    num_classesr   r%   r&   epffr'   r(   vector_hidden_dimr+   r,   r-   
hidden_dimnum_backbone_outsinput_proj_listr   in_channelsproj
prior_prob
bias_valuenum_pred	__class__r   r   r      s   
!






 "
"
zDetector.__init__c           #      C   sn  ||||g}|  |ddddd}| |ddddd}g }	t|D ]\}
}|	|jdd  q,g }| jdu rVt|D ]\}
}|| j	|
 | qFn| |}g }t|D ]/\}
}|	|jdd  t
j|d  |jdd dtjd }|| |dusJ qag }g }| ||||\}}}}t|jd D ]S}|dkr|n||d  }t|}| j| || }| j| || }|jd dkr||7 }n|jd dksJ |d	ddf  |7  < | }|| || qt|}t|}d}| jr,g }t|jd D ]}| j| || }|| qt|}|d |d d
}| jrB|d|d i | jrW| jjjdkrW| ||||d< | jrg }t|jd D ]}|| j| ||  qdt|}|d |d< | jrt|d D ]\}} || | d< q| j rd|i|d< | j!r|||d|d< |d }!|d }"|!|"fS )a?   The forward step of ViDT

        Args:
            The forward expects a NestedTensor, which consists of:
            - features_0: images feature
            - features_1: images feature
            - features_2: images feature
            - features_3: images feature
            - det_tgt: images det logits feature
            - det_pos: images det position feature
            - mask: images mask
        Returns:
            A dictionary having the key and value pairs below:
            - "out_pred_logits": the classification logits (including no-object) for all queries.
                            Shape= [batch_size x num_queries x (num_classes + 1)]
            - "out_pred_boxes": The normalized boxes coordinates for all queries, represented as
                           (center_x, center_y, height, width). These values are normalized in [0, 1],
                           relative to the size of each individual image (disregarding possible padding).
                           See PostProcess for information on how to retrieve the unnormalized bounding box.
        r   r   r   r   r   N)sizer   .pred_logits
pred_boxespred_vectorsaux_outputs	pred_iousrX   
enc_tokens)patch_tokenbody_det_tokenneck_det_tokendistil_tokensrY   )"r=   	unsqueezesqueezepermuter>   	enumerater1   shaper<   r6   FinterpolatefloattorA   boolr    r0   inverse_sigmoidr   r   sigmoidstackr'   r   updater%   rE   rF   _set_aux_lossr+   r   r,   r-   )#r   
features_0
features_1
features_2
features_3det_tgtdet_posmaskfeaturesshapeslesrcsrcsmasks_maskoutputs_classesoutputs_coordshsinit_referenceinter_referencesenc_token_class_unflatlvl	referenceoutputs_classtmpoutputs_coordoutputs_vectoroutputs_vectorsoutoutputs_iousoutputs_iouiauxout_pred_logitsout_pred_boxesr   r   r   forward   s   










zDetector.forwardc                 C   sZ   |d u rdd t |d d |d d D S dd t |d d |d d |d d D S )Nc                 S   s   g | ]	\}}||d qS )rW   r   )r   abr   r   r   r   0  s
    
z*Detector._set_aux_loss.<locals>.<listcomp>r   c                 S      g | ]\}}}|||d qS ))rX   rY   rZ   r   )r   r   r   cr   r   r   r   5      
)zip)r   r   r   r   r   r   r   rp   )  s   
zDetector._set_aux_loss)	FFNFNr   FFF)__name__
__module____qualname____doc__r   r   rA   jitunusedrp   __classcell__r   r   rT   r   r      s"      r   c                       s(   e Zd ZdZ fddZdd Z  ZS )r$   z5 Very simple multi-layer perceptron (also called FFN)c                    sJ   t    || _|g|d  }tdd t|g| ||g D | _d S )Nr   c                 s   s     | ]\}}t ||V  qd S N)r"   r#   )r   nkr   r   r   	<genexpr>D  s    
zMLP.__init__.<locals>.<genexpr>)r   r   rF   r"   r5   r   rD   )r   	input_dimrL   
output_dimrF   hrT   r   r   r   @  s   

zMLP.__init__c                 C   s<   t | jD ]\}}|| jd k rt||n||}q|S )Nr   )re   rD   rF   rg   relu)r   xr   layerr   r   r   r   G  s   &zMLP.forward)r   r   r   r   r   r   r   r   r   rT   r   r$   =  s    r$   h㈵>c                 C   s8   | j ddd} | j |d}d|  j |d}t|| S )Nr   r   )minmax)r   )clamprA   r@   )r   epsx1x2r   r   r   rl   M  s   rl   c                 C   sL   |  d\}}}}|d|  |d|  |d|  |d|  g}tj|ddS )Nr   g      ?dim)unbindrA   rn   )r   x_cy_cwr   r   r   r   r   box_cxcywh_to_xyxyT  s   ,r   皙?c           
      C   s   g }| D ]H}g }t t|d D ]6}t|d |  }t|d |  }g }|d |  D ]	}	|t|	 q0||krF||||g q|| q|S )Nscoreslabelsboxes)r0   r.   ri   cpuintr1   )
post_resultsbbox_thubatch_final_resper_img_resper_img_final_resr   scorelabelbboxitr   r   r   get_predictions[  s   r   c                       s2   e Zd ZdZd fdd	Ze dd Z  ZS )PostProcesszQ This module converts the model's output into the format expected by the coco apiNc                    s   t    || _d S r   )r   r   r(   )r   r(   rT   r   r   r   n  s   

zPostProcess.__init__c              	   C   s   t |t |ks
J |jd dksJ | }tj||jd dddd\}}|}||jd  }||jd  }	t|}
t|
d|d	ddd}
|
d\}}tj||||gddtj}|
|dddddf  }
d	d
 t||	|
D }|S )a   Perform the computation

        Args:
            out_logits: raw logits outputs of the model
            out_bbox: raw bbox outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
                          For evaluation, this must be the original image size (before any data augmentation)
                          For visualization, this should be the image size after data augment, but before padding
        r   r   r   r   d   r   r   Nc                 S   r   ))r   r   r   r   )r   slr   r   r   r   r     r   z'PostProcess.forward.<locals>.<listcomp>)r.   rf   rm   rA   topkviewr   gatherrb   repeatr   rn   rj   float32r   )r   
out_logitsout_bboxtarget_sizesprobtopk_valuestopk_indexesr   
topk_boxesr   r   img_himg_w	scale_fctresultsr   r   r   r   s  s0   

zPostProcess.forwardr   )	r   r   r   r   r   rA   no_gradr   r   r   r   rT   r   r   k  s
    r   c                    s   t  fddt|D S )z Clone a moudle N times c                    s   g | ]}t  qS r   )copydeepcopy)r   r   moduler   r   r     s    z_get_clones.<locals>.<listcomp>)r"   r5   r0   )r   Nr   r   r   rG     s   rG   )r   )r   )r   r?   rA   torch.nnr"   torch.nn.functional
functionalrg   Moduler   r$   rl   r   r   r   rG   r   r   r   r   <module>   s     2

/