o
    ߥi*3                     @   sh   d dl Z d dlZd dlmZ d dlm  mZ G dd dejZG dd dejZ	G dd dejZ
dS )    Nc                       s@   e Zd Z					d fdd	Zdd Zd	d
 Zdd Z  ZS )SelfAttention   '     Nc                    s  t t|   ddg| _|dur$| }|| jv s$J dg | jR  || _|| _|| _|| _|| _	t
 t
 t
 | _| _| _t| jD ],}| jt
j||| dd | jt
j||| dd | jt
j||| dd qIt
j||dd| _t
jdd| _t
jd	d
| _dS )a   The basic (multi-head) Attention 'cell' containing the learnable parameters of Q, K and V

        :param int input_size: Feature input size of Q, K, V.
        :param int output_size: Feature -hidden- size of Q, K, V.
        :param int freq: The frequency of the sinusoidal positional encoding.
        :param int heads: Number of heads for the attention module.
        :param str | None pos_enc: The type of the positional encoding [supported: Absolute, Relative].
        absoluterelativeNzSupported encodings: F)in_featuresout_featuresbiasdim      ?p)superr   __init__permitted_encodingslower
input_sizeoutput_sizeheadspos_encfreqnn
ModuleListWkWqWvrangeappendLinearoutSoftmaxsoftmaxDropoutdrop)selfr   r   r   r   r   _	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/video_summarization/pgl_sum.pyr      sP   
 zSelfAttention.__init__c                 C   s   | j }| j}tjdd t|D | jjjd}tjdd t|d D | jjjd}||j	d d}|j
|j	d dd}||j	d d}tj||| jjjd}t||d| |   ||d| f< t||d| |   ||d| d f< |S )	a.  Calculate the sinusoidal positional encoding based on the absolute position of each considered frame.
        Based on 'Attention is all you need' paper (https://arxiv.org/abs/1706.03762)

        :param int T: Number of frames contained in Q, K and V
        :return: Tensor with shape [T, T]
        c                 S      g | ]}|qS r+   r+   .0kr+   r+   r,   
<listcomp>I       z5SelfAttention.getAbsolutePosition.<locals>.<listcomp>devicec                 S   r-   r+   r+   r.   r+   r+   r,   r1   K   r2      r   r   r   )r   r   torchtensorr   r"   weightr4   reshapeshaperepeat_interleaverepeatzerossincos)r'   Tr   dposiAPr+   r+   r,   getAbsolutePosition?   s   &*z!SelfAttention.getAbsolutePositionc           
   	   C   s  | j }d| }|d  }tjdd t|D | jjjd}tjdd t|D | jjjd}||jd d}|j	|jd dd}|
|jd d}|| | }tj||| jjjd}tjd	d t|d D | jjjd}	t|d
d
d|	 f ||d
d
d|	 f |d
d
d|	 f  |   |d
d
d|	 f< t|d
d
d|	 d f ||d
d
d|	 d f |d
d
d|	 d f  |   |d
d
d|	 d f< |S )a+  Calculate the sinusoidal positional encoding based on the relative position of each considered frame.
        r_pos calculations as here: https://theaisummer.com/positional-embeddings/

        :param int T: Number of frames contained in Q, K and V
        :return: Tensor with shape [T, T]
        r5   r   c                 S   r-   r+   r+   r.   r+   r+   r,   r1   c   r2   z5SelfAttention.getRelativePosition.<locals>.<listcomp>r3   c                 S   r-   r+   r+   r.   r+   r+   r,   r1   d   r2   r   r   c                 S   r-   r+   r+   r.   r+   r+   r,   r1   o   r2   N)r   r6   r7   r   r"   r8   r4   r9   r:   r;   r<   r=   r>   r?   )
r'   r@   r   rA   min_rposrC   jr_posRPidxr+   r+   r,   getRelativePositionX   s,   
""B6z!SelfAttention.getRelativePositionc                 C   s   g }t | jD ]`}| j| |}| j| |}| j| |}t||dd}| jdurR| jdkr@| j	|j
d d}|| }n| jdkrR| j|j
d d}	||	 }| |}
| |
}t||}|| q| tj|dd}||
 fS )a   Compute the weighted frame features, based on either the global or local (multi-head) attention mechanism.

        :param torch.tensor x: Frame features with shape [T, input_size]
        :return: A tuple of:
                    y: Weighted features based on the attention weights, with shape [T, input_size]
                    att_weights : The attention weights (before dropout), with shape [T, T]
        r   r   Nr   )r@   r   r   )r   r   r   r   r   r6   matmul	transposer   rE   r:   rK   r$   r&   r    r"   catclone)r'   xoutputsheadKQVenergiesrD   rI   att_weights_att_weightsyr+   r+   r,   forwardx   s&   





zSelfAttention.forward)r   r   r   r   N)__name__
__module____qualname__r   rE   rK   rZ   __classcell__r+   r+   r)   r,   r      s    2 r   c                       4   e Zd Z							d	 fdd	Zdd Z  ZS )
MultiAttentionr   r   Nr   c           	   
      s   t t|   t|||||d| _|| _| jdur>| jdks"J dt | _t	| jD ]}| j
t||| ||dd q,g d| _|| _| jdurb| j | _| j| jv sdJ dg | jR  dS dS )a}   Class wrapping the MultiAttention part of PGL-SUM; its key modules and parameters.

        :param int input_size: The expected input feature size.
        :param int output_size: The hidden feature size of the attention mechanisms.
        :param int freq: The frequency of the sinusoidal positional encoding.
        :param None | str pos_enc: The selected positional encoding [absolute, relative].
        :param None | int num_segments: The selected number of segments to split the videos.
        :param int heads: The selected number of global heads.
        :param None | str fusion: The selected type of feature fusion.
        )r   r   r   r   r   Nr5   znum_segments must be None or 2+   )addmultavgmaxzFusion method must be: )r   r`   r   r   	attentionnum_segmentsr   r   local_attentionr   r    permitted_fusionsfusionr   )	r'   r   r   r   r   rg   r   rj   r(   r)   r+   r,   r      s:   



"zMultiAttention.__init__c                 C   sT  |  |\}}| jdur| jdurt|jd | j }t| jD ]}|| }|d | }||| }| j| |\}	}
tj	||| 
 ddd|||< tj	|	ddd}	| jdkrf|||  |	7  < q!| jdkrv|||  |	9  < q!| jdkr|||  |	7  < |||  d  < q!| jd	krt||| 
 |	|||< q!||fS )
a   Compute the weighted frame features, based on the global and locals (multi-head) attention mechanisms.

        :param torch.Tensor x: Tensor with shape [T, input_size] containing the frame features.
        :return: A tuple of:
            weighted_value: Tensor with shape [T, input_size] containing the weighted frame features.
            attn_weights: Tensor with shape [T, T] containing the attention weights.
        Nr   r   r5   )r   r   rb   rc   rd   re   )rf   rg   rj   mathceilr:   r   rh   F	normalizerO   r6   re   )r'   rP   weighted_valueattn_weightssegment_sizesegmentleft_pos	right_poslocal_xweighted_local_valueattn_local_weightsr+   r+   r,   rZ      s@   



zMultiAttention.forwardr   r   r   NNr   Nr[   r\   r]   r   rZ   r^   r+   r+   r)   r,   r`      s    /r`   c                       r_   )
PGL_SUMr   r   Nr   c              	      s   t t|   t|||||||d| _tj||d| _tj| jjdd| _	tj
dd| _tj|dd| _tj| jjdd| _t | _t | _dS )	al   Class wrapping the PGL-SUM model; its key modules and parameters.

        :param int input_size: The expected input feature size.
        :param int output_size: The hidden feature size of the attention mechanisms.
        :param int freq: The frequency of the sinusoidal positional encoding.
        :param None | str pos_enc: The selected positional encoding [absolute, relative].
        :param None | int num_segments: The selected number of segments to split the videos.
        :param int heads: The selected number of global heads.
        :param None | str fusion: The selected type of feature fusion.
        )r   r   r   r   rg   r   rj   )r   r	   r   r   r   gư>)normalized_shapeepsN)r   rz   r   r`   rf   r   r!   linear_1r	   linear_2r%   r&   	LayerNormnorm_ynorm_linearReLUreluSigmoidsigmoid)r'   r   r   r   r   rg   r   rj   r)   r+   r,   r      s.   
zPGL_SUM.__init__c                 C   s   | d|jd }|}| |\}}|| }| |}| |}| |}| |}| |}| |}| |}| 	|}|
dd}||fS )a   Produce frames importance scores from the frame features, using the PGL-SUM model.

        :param torch.Tensor frame_features: Tensor of shape [T, input_size] containing the frame features produced by
        using the pool5 layer of GoogleNet.
        :return: A tuple of:
            y: Tensor with shape [1, T] containing the frames importance scores in [0, 1].
            attn_weights: Tensor with shape [T, T] containing the attention weights.
        r   r   )r9   r:   rf   r&   r   r}   r   r   r~   r   view)r'   frame_featuresresidualro   rp   rY   r+   r+   r,   rZ     s   	







zPGL_SUM.forwardrx   ry   r+   r+   r)   r,   rz      s    (rz   )rk   r6   torch.nnr   torch.nn.functional
functionalrm   Moduler   r`   rz   r+   r+   r+   r,   <module>   s    X