o
    ߥin&                     @   s   d dl Z d dlZd dlmZ d dlm  mZ G dd dejZG dd dejZ	G dd dejZ
G dd	 d	ejZG d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZdddZdS )    Nc                       (   e Zd ZdZ fddZdd Z  ZS )Q2VRankerStage1z
        Used to calculate the qv_ctx_score with query embedding and multi anchor context embeddings as input.
        The qv_ctx_score is used to pre-rank and retain top-k related anchors.
    c                    "   t    t||| _|| _d S Nsuper__init__nnLinearfcnscalesselfr   
hidden_dim	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/soonet/blocks.pyr         

zQ2VRankerStage1.__init__c              
   C   sX   |  |}t }t| jD ]}tdtj|| dddtj|ddd}|| q|S Nz
bld,bd->bl   pdim   )	r   listranger   torcheinsumF	normalizeappend)r   	ctx_featsqfeatqv_ctx_scoresiscorer   r   r   forward   s   
zQ2VRankerStage1.forward__name__
__module____qualname____doc__r   r'   __classcell__r   r   r   r   r   	   s    r   c                       r   )V2QRankerStage1zt
        Used to calculate the vq_ctx_score with anchor context embeddings and multi query embeddings as input.
    c                    r   r   r   r   r   r   r   r   %   r   zV2QRankerStage1.__init__c              
   C   sT   t  }t| jD ]}tdtj| || dddtj|ddd}|| q|S r   )	r   r   r   r   r   r   r    r   r!   )r   r"   r#   vq_ctx_scoresr%   r&   r   r   r   r'   *   s   zV2QRankerStage1.forwardr(   r   r   r   r   r.           r.   c                       *   e Zd ZdZd fdd	Zdd Z  ZS )Q2VRankerStage2z
        Used to calculate the qv_ctn_score with query embedding and video sequence embedding as input.
        The qv_ctn_score is used to re-rank anchors.
    
   c                    s0   t    || _|| _t||| _t | _d S r   )	r   r   r   snippet_lengthr	   r
   qfcV2VAttentionencoder)r   r   r   r4   r   r   r   r   ;   s
   
zQ2VRankerStage2.__init__c              
   C   s.  |  |}t }t }| \}}}	t }
t| jD ]w}| jd|  }|| || dks0J t|| d|| }||| ||		 }t|d|| }| 
|tj| d d |jd}|
| tdtj|ddddtj|ddd}tj|dd\}}|| |||  q|||
fS )	Nr   r   r   )devicebkld,bd->bkl   r   r   )r5   r   sizer   r   r4   r   index_selectviewdetachr7   onesr8   r!   r   r   r    	unsqueezemax)r   vfeatsr#   hit_indicesr$   qv_ctn_scoresqv_merge_scores_LD	ctn_featsr%   anchor_lengthqv_ctx_scorectn_featqv_ctn_scorer   r   r   r'   B   s<   



zQ2VRankerStage2.forward)r3   r(   r   r   r   r   r2   5   s    r2   c                       r   )V2QRankerStage2zt
        Used to calculate the vq_ctn_score with anchor content embeddings and multi query embeddings as input.
    c                    r   r   r   r   r   r   r   r   g   r   zV2QRankerStage2.__init__c              
   C   sh   t  }t| jD ])}tdtj| || ddddtj|ddd}tj	|dd}|
| q|S )Nr9   r   r   r:   r   r   r;   )r   r   r   r   r   r   r    r   rA   meanr!   )r   rJ   r#   vq_ctn_scoresr%   r&   r   r   r   r'   l   s   zV2QRankerStage2.forwardr(   r   r   r   r   rO   b   r0   rO   c                       r   )r6   z`
        Self-attention encoder for anchor frame sequence to encode intra-anchor knowledge.
    c                    s:   t    tdddd| _tdddd| _td| _d S )Ni  i           )max_lenr   dropout   g?)r   n_headsrT   )	r   r   PositionEncodingposembMultiHeadAttentionr7   r	   DropoutrT   )r   r   r   r   r   ~   s   
zV2VAttention.__init__c                 C   sX   t d||d}|}|| | }| j||||d}| || |d  }|S )Nz
bm,bn->bmnr   )querykeyvaluemaskr   )r   r   rA   rX   r7   rT   float)r   video_featsvideo_masksr^   residualoutr   r   r   r'      s    zV2VAttention.forwardr(   r   r   r   r   r6   y       r6   c                       r1   )BboxRegressorzK
        Predict the offset of bounding box for each candidate anchor.
    Fc                    s   t    t||| _t||| _|r6t||| _t|| _t	td| |t
 t|d| _nt	t||t
 t|d| _|| _d S )Nr   )r   r   r	   r
   fc_ctxfc_qfc_ctnSelfAttentionattn
SequentialReLU	predictorenable_stage2)r   r   rn   r   r   r   r      s   




zBboxRegressor.__init__c           	      C   s   |  |}tj|dd}t| |t|d }| jr`|r`t }t	t
|D ]$}t| || dt|dd }| |}|| q*tj|dd}tj||gdd}n|}| |}|S )Nr   r;   r   )rg   r   catr   relurf   rA   rn   r   r   lenrh   rj   r!   rm   )	r   r"   rJ   r#   ctx_fuse_featsctn_fuse_featsr%   rc   
fuse_featsr   r   r   r'      s$   



zBboxRegressor.forward)Fr(   r   r   r   r   re      s    re   c                       r   )ri   z?
        Obtain pooled features by self-attentive pooling.
    c                    s<   t    t||d | _t | _t|d d| _d S )Nr   r   )r   r   r	   r
   fc1rl   rq   fc2)r   r   r   r   r   r      s   

zSelfAttention.__init__c                 C   sF   |  | | |d}tj|ddd}tj|| dd}|S )Nr:   r   r;   )	rw   rq   rv   squeezer   softmaxrA   r   sum)r   xattrc   r   r   r   r'      s   zSelfAttention.forwardr(   r   r   r   r   ri      rd   ri   c                       r1   )rW   a!  
        An implementation of trainable positional embedding which is added to
        sequence features to inject time/position information.

        Args:
            max_len: The max number of trainable positional embeddings.
            dim: the dimension of positional embedding.
    rR   c                    s6   t t|   t||| _t | _t|| _	d S r   )
r   rW   r   r	   	Embeddingembedrl   rq   rZ   rT   )r   rS   r   rT   r   r   r   r      s   
zPositionEncoding.__init__c                 C   sR   |j d d \}}tj|tj|jd}|d|d}| | | 	|}|S )Nr   )dtyper8   r   r   )
shaper   arangelongr8   rA   repeatrT   rq   r~   )r   r{   
batch_sizeseq_lenpos_idspos_embr   r   r   r'      s
   zPositionEncoding.forwardrR   r(   r   r   r   r   rW      s    	rW   c                       s2   e Zd ZdZd	 fdd	Zdd Zdd Z  ZS )
rY   a  
        An implementation of multi-head attention module, as described in
        'Attention Is All You Need <https://arxiv.org/abs/1706.03762>'

        Args:
            dim: the dimension of features of hidden layers.
            n_heads: the number of head.
    rR   c                    sl   t t|   || _|| _|| | _t||| _t||| _	t||| _
t|| _tjdd| _d S )Nro   r;   )r   rY   r   r   rV   head_dimr	   r
   to_qto_kto_vrZ   rT   Softmaxry   )r   r   rV   rT   r   r   r   r      s   
zMultiHeadAttention.__init__c                 C   s6   |  d d | j| jf }|j| }|ddddS )Nro   r   r   r   r:   )r<   rV   r   r>   permute)r   r{   new_x_shaper   r   r   transpose_for_scores  s   
z'MultiHeadAttention.transpose_for_scoresc                 C   s   |  |}| |}| |}| |}| |}	| |}
t||	dd}|t| j	 }t
||}| |}| |}t||
}|dddd }| d d | jf }|j| }|S )Nro   r   r   r   r:   )r   r   r   r   r   matmul	transposemathsqrtr   mask_logitsry   rT   r   
contiguousr<   r   r>   )r   r[   r\   r]   r^   qkvq_transk_transv_transr|   ctx_vr   r   r   r   r'     s$   









zMultiHeadAttention.forwardr   )r)   r*   r+   r,   r   r   r'   r-   r   r   r   r   rY      s
    	rY   ꌠ9Y>)c                 C   s   | tj}| d| |  S )Ng      ?)typer   float32)inputsr^   
mask_valuer   r   r   r     s   r   )r   )r   r   torch.nnr	   torch.nn.functional
functionalr   Moduler   r.   r2   rO   r6   re   ri   rW   rY   r   r   r   r   r   <module>   s   --4