o
    ei5                     @   s   d Z ddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZmZ ddlmZ eeZeeG dd deZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#G dd dejZ$G dd dejZ%G dd  d eZ&G d!d" d"ejZ'G d#d$ d$ejZ(eG d%d& d&eZ)G d'd( d(ejZ*G d)d* d*ejZ+e*e+d+Z,ed,d-G d.d/ d/e)Z-G d0d1 d1ejZ.ed2d-G d3d4 d4e)Z/g d5Z0dS )6zPyTorch TVP Model    N)	dataclass)nn   )initialization)ACT2FN)load_backbone)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)PreTrainedModel)auto_docstringlogging   )	TvpConfigc                   @   sj   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ejdf dB ed< dZe
ejdf dB ed< dS )TvpVideoGroundingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Temporal-Distance IoU loss for video grounding.
    logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
        input texts.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    Nlosslogits.hidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   tupler    r   r   b/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/tvp/modeling_tvp.pyr   #   s   
 r   c                       s@   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Z  Z	S )TvpLossa~  
    This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
    hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
    ground-truth / prediction (supervise class and box).

    Args:
        losses (`list[str]`):
            List of all the losses to be applied.
    c                    sL   t    | j| j| jd| _|D ]}|| jvr td| dq|| _d S )NioudistancedurationzLoss z not supported)super__init__loss_iouloss_distanceloss_durationloss_map
ValueErrorlosses)selfr,   r   	__class__r   r   r&   B   s   


zTvpLoss.__init__c           	      C   sH   t ||t || }t ||t || }d|jdd|  }|S )z6
        Measure the intersection over union.
        r   r   min)r   r1   maxclamp)	r-   
start_timeend_timecandidates_start_timecandidates_end_timer$   interunionr"   r   r   r   r'   O   s   zTvpLoss.loss_iouc           	      C   sT   t t ||d}t t ||d}t t ||t || |jdd}|S )z5
        Measure the distance of mid points.
        g       @g?r0   )r   divaddr2   r1   r3   )	r-   r4   r5   r6   r7   r$   mid_candidatesmid_groundtruthdistance_diffr   r   r   r(   Y   s   zTvpLoss.loss_distancec           	      C   sB   t ||}t ||}t t t |||}|jdd}|S )z5
        Measure the difference of duration.
        g?r0   )r   subsquarer:   r3   )	r-   r4   r5   r6   r7   r$   duration_candidatesduration_groundtruthduration_diffr   r   r   r)   e   s
   zTvpLoss.loss_durationc              
   C   st   |\}}}t ||}|dddf  |dddf  }}i }	| jD ]}
|	|
| j|
 |||||i q%|	S )am  
        This performs the loss computation.

        Args:
            logits (`torch.FloatTensor`):
                The output logits of head module.
            labels (`list[torch.FloatTensor]`):
                List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
        Nr   r   )r   mulfloatr,   updater*   )r-   r   labelsr$   r4   r5   
candidatesr6   r7   losses_dictr   r   r   r   forwardp   s   

*
zTvpLoss.forward)
r   r   r   r   r&   r'   r(   r)   rJ   __classcell__r   r   r.   r   r    7   s    

r    c                       $   e Zd Z fddZdd Z  ZS )TvpVisionModelc              	      s   t    t|| _|jd ur|jjd }n,t| jdr+t| jjdr+| jjjd }nt| jdr>t| jjdr>| jjj}nt	dt
j||jdddddd	| _d S )
Nconfighidden_sizeshidden_sizezBackbone config not foundr   r   F)kernel_sizestridepaddinggroupsbias)r%   r&   r   backbonebackbone_configrP   hasattrrO   rQ   r+   r   Conv2dgrid_encoder_conv)r-   rO   in_channelsr.   r   r   r&      s$   


zTvpVisionModel.__init__c                 C   s   |j \}}}}}||| |||}| |d d }| |}tjj|ddd}tjj|dd}|j dd  \}	}
}||||	|
|}|ddd	d
d}|S )Nfeature_mapsr      )rR   rS   T)inplacer   r      )	shapeviewrW   r[   r   
functional
max_pool2drelupermute)r-   pixel_values
batch_size
num_framesnum_channelsheightwidthgrid_feat_outputsgridnew_channel
new_height	new_widthr   r   r   rJ      s   
zTvpVisionModel.forwardr   r   r   r&   rJ   rK   r   r   r.   r   rM      s    rM   c                       s^   e Zd ZdZ fddZdejdededejfdd	Zdde	fddZ
dde	fddZ  ZS )TvpVisualInputEmbeddingz;
    Takes input of both image and video (multi-frame)
    c                    s   t    t|j|j| _t|j|j| _t|j	|j| _
td|j| _tj|j|jd| _t|j| _|j| _|j	| _	d S )Nr   eps)r%   r&   r   	Embeddingmax_position_embeddingsrQ   position_embeddings max_grid_row_position_embeddingsrow_position_embeddings max_grid_col_position_embeddingscol_position_embeddingstoken_type_embeddings	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutr-   rO   r.   r   r   r&      s   
z TvpVisualInputEmbedding.__init__	embeddingrl   rm   returnc                 C   sl   d }}|| j kr|| j  }|| jkr|| j }|dddd}tjj|||fddd}|dddd}|S )z
        This method allows to interpolate the pre-trained pad weights , to be able to use the model on collection of high
        resolution images (high resolution videos).

        r   r   r   r^   bicubicFscale_factormodealign_corners)rz   r|   rg   r   rd   interpolate)r-   r   rl   rm   h0w0r   r   r   interpolate_pos_encoding   s   



z0TvpVisualInputEmbedding.interpolate_pos_encodingFr   c                 C   s   |j \}}}}t| j|}tj|tj|jd}| |}	dt|j d  |d|f }
|	j	|
 }	t| j
|}tj|tj|jd}| |}|d||f}|j	| }|	| }|rj|| jks_|| j
krj|| ||| }|S || }|S )af  
        Args:
            grid: (batch_size, height, width, hidden_dim)
            interpolate_pos_encoding: (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.
        Returns:
            grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
        dtypedevice)r   r   r   )rb   r1   rz   r   arangelongr   r{   lenrc   r|   r}   r   )r-   ro   r   ri   rl   rm   
hidden_dim
row_heightrow_position_idsr{   	row_shape	row_widthcol_position_idsr}   	col_shapepositional_embeddingsr   r   r   add_2d_positional_embeddings   s$   	



z4TvpVisualInputEmbedding.add_2d_positional_embeddingsc                 C   s   |j \}}}}}|d}| j||d}||d|}|j dd }	|j}
tj|	tj|
d}| |}|| }| 	|}| 
|}|S )a  
        Args:
            grid: Array of shape (batch_size, num_frames, height, width, num_channels).
                It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
                num_frames can be 1
            interpolate_pos_encoding: (bool, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.

        Returns:
            embeddings: The embedding of grid with size (batch_size, height*width, num_channels)

        r   r   rN   Nr   )rb   meanr   rc   r   r   zerosr   r~   r   r   )r-   ro   r   ri   rj   rl   rm   rk   visual_tokensvisual_tokens_shaper   token_type_idsr~   
embeddingsr   r   r   rJ      s   



zTvpVisualInputEmbedding.forwardF)r   r   r   r   r&   r   Tensorintr   boolr   rJ   rK   r   r   r.   r   rt      s    )rt   c                       s*   e Zd ZdZ fddZdddZ  ZS )TvpTextInputEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    sl   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _d S )N)padding_idxru   )r%   r&   r   rw   
vocab_sizerQ   pad_token_idword_embeddingsrx   ry   type_vocab_sizer~   r   r   r   r   r   r   r   r.   r   r   r&   #  s   
zTvpTextInputEmbeddings.__init__Nc                 C   s   |d ur	|  }n|  d d }|d }|d ur|jn|j}|d u r4tj|tj|d}|d|}|d u rAtj|tj|d}|d u rJ| |}| 	|}| 
|}	|| |	 }
| |
}
| |
}
|
S )NrN   r   r   r   )sizer   r   r   r   	unsqueezeexpandr   r   ry   r~   r   r   )r-   	input_idsr   position_idsinputs_embedsinput_shape
seq_lengthr   ry   r~   r   r   r   r   rJ   +  s$   





zTvpTextInputEmbeddings.forwardNNNNr   r   r   r   r&   rJ   rK   r   r   r.   r   r      s    r   c                       sL   e Zd Z fddZdejdedefddZ		dd	edB fd
dZ	  Z
S )TvpAttentionc                    s   t    |j|j dkrt|dstd|j d|j |j| _t|j|j | _| j| j | _t	
|j| j| _t	
|j| j| _t	
|j| j| _t	|j| _t	
|j|j| _t	j|j|jd| _t	|j| _d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads ru   )r%   r&   rQ   num_attention_headsrY   r+   r   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probattn_dropoutdenser   r   r   r   r   r   r.   r   r   r&   E  s   
zTvpAttention.__init__tensorsequence_lengthri   c                 C   s    | ||| j| jdd S )Nr   r^   )rc   r   r   	transpose
contiguous)r-   r   r   ri   r   r   r   _reshapeY  s   zTvpAttention._reshapeNoutput_attentionsc                 C   s  |j d d \}}| |}| |}| |}| |||}	| |||}
| |||}t|	|
dd}|t	| j
 }|d urG|| }tjj|dd}| |}t||}|dd }|||| j}| |}| |}| || }|r||f}|S |f}|S )Nr^   rN   dimr   )rb   r   r   r   r   r   matmulr   mathsqrtr   r   rd   softmaxr   r   reshaper   r   r   r   )r-   r   attention_maskr   ri   r   mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probsattn_outputoutputsr   r   r   rJ   `  s.   





zTvpAttention.forwardNN)r   r   r   r&   r   r   r   r   r   rJ   rK   r   r   r.   r   r   D  s    
r   c                       2   e Zd Z fddZdejdejfddZ  ZS )TvpIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S N)r%   r&   r   r   rQ   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   r.   r   r   r&     s
   
zTvpIntermediate.__init__r   r   c                 C   s   |  |}| |}|S r   )r   r   )r-   r   r   r   r   rJ     s   

zTvpIntermediate.forwardr   r   r   r&   r   r   rJ   rK   r   r   r.   r   r     s    r   c                       s8   e Zd Z fddZdejdejdejfddZ  ZS )TvpOutputLayerc                    sB   t    t|j|j| _tj|j|jd| _	t
|j| _d S )Nru   )r%   r&   r   r   r   rQ   r   r   r   r   r   r   r   r   r.   r   r   r&     s   
zTvpOutputLayer.__init__r   input_tensorr   c                 C   s&   |  |}| |}| || }|S r   )r   r   r   )r-   r   r   r   r   r   rJ     s   

zTvpOutputLayer.forwardr   r   r   r.   r   r     s    $r   c                       s4   e Zd Z fddZ		ddedB fddZ  ZS )TvpEncodeLayerc                    s,   t    t|| _t|| _t|| _d S r   )r%   r&   r   	attentionr   intermediater   outputr   r.   r   r   r&     s   


zTvpEncodeLayer.__init__Nr   c           	      C   sH   | j |||d}|d }|dd  }| |}| ||}|f| }|S )N)r   r   r   )r   r   r   )	r-   r   r   r   self_attention_outputsattention_outputr   intermediate_outputlayer_outputr   r   r   rJ     s   

zTvpEncodeLayer.forwardr   )r   r   r   r&   r   rJ   rK   r   r   r.   r   r     s    	r   c                
       sP   e Zd Z fddZ				d
dedB dedB dedB deeB fdd	Z  ZS )
TvpEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r   )r   ).0_rO   r   r   
<listcomp>  s    z'TvpEncoder.__init__.<locals>.<listcomp>F)	r%   r&   rO   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r.   r   r   r&     s   
 
zTvpEncoder.__init__Nr   output_hidden_statesreturn_dictr   c                 C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}d}d}t| jD ]\}}	|r2||f }|	|||}
|
d }|rE||
d f }q'|rM||f }|sb|f}|rY||f }|r`||f }|S t||rh|nd |ro|dS d dS )Nr   r   r   )last_hidden_stater   r   )rO   r   r   r   	enumerater   r	   )r-   r   r   r   r   r   all_hidden_statesall_attentionsilayer_modulelayer_outputsr   r   r   r   rJ     s<   




zTvpEncoder.forwardr   )	r   r   r   r&   r   r   r	   rJ   rK   r   r   r.   r   r     s    	r   c                       r   )	TvpPoolerc                    s*   t    t|j|j| _t | _d S r   )r%   r&   r   r   rQ   r   Tanh
activationr   r.   r   r   r&     s   
zTvpPooler.__init__r   r   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r	  )r-   r   first_token_tensorpooled_outputr   r   r   rJ     s   

zTvpPooler.forwardr   r   r   r.   r   r    s    r  c                   @   s:   e Zd ZU eed< dZdZdZe	 de
jfddZdS )	TvpPreTrainedModelrO   model)videotextTmodulec                 C   s   t |tjtjfrtj|jd| jjd n:t |tj	r(t
|j t|j n't |tjrDtj|jddd |jdurCt|jd nt |trOt|j t |tjr`|jdur`t
|j t|drkt|j t|d	rvt|j t|d
rt|j t|drt|j dS dS )zInitialize the weights        )r   stdfan_outrf   )r   nonlinearityNr   pad_uppad_downpad_left	pad_right)r   r   r   rw   initnormal_weightrO   initializer_ranger   zeros_rV   ones_rZ   kaiming_normal_	constant_TvpModeltext_promptrY   r  r  r  r  )r-   r  r   r   r   _init_weights  s.   





z TvpPreTrainedModel._init_weightsN)r   r   r   r   r   base_model_prefixinput_modalitiessupports_gradient_checkpointingr   no_gradr   Moduler#  r   r   r   r   r    s   
 r  c                       s(   e Zd ZdZ fddZdd Z  ZS )TvpFrameDownPadPrompterz>
    Pad frames extracted from videos only at the bottom.
    c              	      sb   |j dvr	tdt   |j| _|j| _|j| _|j | _ tt	
d|jd|j|jg| _d S )Nr;   replaceremove9`visual_prompter_apply` must be in (add, replace, remove)r   r   )visual_prompter_applyr+   r%   r&   visual_prompt_size	frame_nummax_img_sizer   	Parameterr   randnr  r   r.   r   r   r&   ,  s   


z TvpFrameDownPadPrompter.__init__c                 C   s   | j dkr&tj| j| jg|j|jd}d|| j| j | jd d f< ||9 }| j dkrctj|jd |jd d| j| jg|jd}| j| j }| j	|d d d d d d || jd d f< ||
|j7 }|S )	Nr;   r   r  r,  r   r   r   r   )r.  r   onesr1  r   r   r/  r   rb   r  to)r-   rh   visual_prompt_maskpromptstart_pointr   r   r   rJ   :  s   

*zTvpFrameDownPadPrompter.forwardr   r   r   r.   r   r)  '  s    r)  c                       sN   e Zd ZdZ fddZdejdededejfdd	Zdde	fddZ
  ZS )TvpFramePadPrompterz?
    Pad frames extracted from videos in the surroundings.
    c              
      s   |j dvr	tdt   |j| _|j| _|j | _ |j|jd  | _t	t
d|jd|j|jg| _t	t
d|jd|j|jg| _t	t
d|jd|j|jd  |jg| _t	t
d|jd|j|jd  |jg| _d S )Nr*  r-  r^   r   r   )r.  r+   r%   r&   rj   r1  r/  	base_sizer   r2  r   r3  r  r  r  r  r   r.   r   r   r&   Q  sB   


zTvpFramePadPrompter.__init__r8  rl   rm   r   c                 C   sh   || j  || j  }}|j\}}}}	}
||| ||	|
}tjj|||fddd}||||||}|S )z
        This method allows to interpolate the pre-trained pad weights, to be able to use the model on collection of high
        resolution images (high resolution videos).

        r   Fr   )r1  rb   r   r   rd   r   )r-   r8  rl   rm   r   r   batchrj   channelsprompt_heightprompt_widthr   r   r   interpolate_pad_encodingw  s   z,TvpFramePadPrompter.interpolate_pad_encodingFr@  c                 C   s   |r|j d |j d fn| j| jf\}}| jdvr!td| j | jdv r6tj||g|j|jd}||9 }| jdv r~tjd| j	d	| j
| j
|jd
}tj| j|| jgdd}tj| j|| jgd	d}t|d|g }|rv| |||}|||j }|S )Nr   rN   )r;   r,  r+  z$Invalid visual_prompter_apply value )r+  r,  r   )r+  r;   r   r   r4  ra   r   r   )rb   r1  r.  r+   r   r5  r   r   r   rj   r;  catr  r  r  r  r   r@  r6  )r-   rh   r@  rl   rm   r7  baser8  r   r   r   rJ     s$   



zTvpFramePadPrompter.forwardr   )r   r   r   r   r&   r   r   r   r@  r   rJ   rK   r   r   r.   r   r:  L  s
    &r:  )framedownpadframepadzw
    The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on top.
    )custom_introc                       s   e Zd Z fddZdd Zdd Ze							dd	ejdB d
ej	dB dejdB de
dB de
dB de
dB de
deeB fddZ  ZS )r!  c                    s   t  | || _t|| _t|| _t|| _t	|| _
t|| _ttdd|jg| _t|j| _|jtvr?tdt|j || _|   d S )Nr   
   z:`visual_prompter_type` must be in (framedownpad, framepad))r%   r&   rO   rM   vision_modelr   r   rt   visual_embeddingsr   encoderr  poolerr   r2  r   r3  rQ   r"  r   r   r   visual_prompter_typeTVP_PROMPTER_CLASSES_MAPPINGr+   visual_prompter	post_initr   r.   r   r   r&     s   





zTvpModel.__init__c                 C   s   | j jS r   r   r   )r-   r   r   r   get_input_embeddings  s   zTvpModel.get_input_embeddingsc                 C   s   || j _d S r   rO  )r-   r   r   r   r   set_input_embeddings  s   zTvpModel.set_input_embeddingsNFr   rh   r   r   r   r   r   r   c                 K   sD  |dur|n| j j}| | j||d}| j|d}	| j||d}
|durU||
jdd }t	|jd dj
|j|jd}tj|||gd	d
}| || 
|j}| j|	jd d	d	}tj||	|
gdd
}| j|||||d}|ry|jn|d }| |}| |}| |}|s||f|dd  S t|||j|jdS )a  
        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpModel

        >>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```N)r@  )r   r   r^   r   rF  )r   r   rN   r   r   )r   r   r   r   )r   pooler_outputr   r   )rO   r   rG  rM  r   rH  new_onesrb   r   r5  r6  r   r   rA  get_extended_attention_maskr   r"  r   rI  r   rJ  r   r
   r   r   )r-   r   rh   r   r   r   r   r   kwargstext_embedding_outputvisual_embedding_outputvisual_attention_maskpt_maskr"  embedding_outputencoder_outputsr   r  r   r   r   rJ     sH   


zTvpModel.forward)NNNNNNF)r   r   r   r&   rP  rQ  r   r   
LongTensorr   r   r   r
   rJ   rK   r   r   r.   r   r!    s:    
r!  c                       rL   )TvpVideoGroundingHeadc                    sL   t    t|j|jd | _t|jd d| _t | _t	 | _
d S )Nr^   )r%   r&   r   r   rQ   layer_0layer_1ReLUactivation_0Sigmoidactivation_1r   r.   r   r   r&     s
   

zTvpVideoGroundingHead.__init__c                 C   s$   |  | |}| | |}|S r   )ra  r^  rc  r_  )r-   rR  r   r   r   r   rJ     s   zTvpVideoGroundingHead.forwardrs   r   r   r.   r   r]    s    r]  zb
    Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
    c                       s   e Zd Z fddZe								ddejdB dejdB dejdB deej	 dB d	e
dB d
e
dB de
dB de
deeB fddZ  ZS )TvpForVideoGroundingc                    s2   t  | || _t|| _t|| _|   d S r   )r%   r&   rO   r!  r  r]  video_grounding_headrN  r   r.   r   r   r&   %  s
   

zTvpForVideoGrounding.__init__NFr   rh   r   rG   r   r   r   r   r   c	              	   K   s   |dur|n| j j}| j|||||||d}
|
d }| |}d}|durJtg d}|| j |||}|d | j j|d   | j j|d   }|s`|f|
dd  }
|dur^|f|
 }
|
S t	|||
j
|
jd	S )
a  
        labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
            The labels contains duration, start time, and end time of the video corresponding to the text.

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding

        >>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```N)r   r   r   r   r   r!   r"   r#   r$   r^   )r   r   r   r   )rO   r   r  re  r    r6  r   distance_loss_weightduration_loss_weightr   r   r   )r-   r   rh   r   rG   r   r   r   r   rU  r   rR  r   r   	criterion	loss_dictr   r   r   rJ   -  sD   	


zTvpForVideoGrounding.forward)NNNNNNNF)r   r   r   r&   r   r   r\  r   r   r   r   r   rJ   rK   r   r   r.   r   rd    s<    	rd  )r!  r  rd  )1r   r   dataclassesr   r   r    r   r  activationsr   backbone_utilsr   modeling_layersr   modeling_outputsr	   r
   r   modeling_utilsr   utilsr   r   configuration_tvpr   
get_loggerr   loggerr   r(  r    rM   rt   r   r   r   r   r   r   r  r  r)  r:  rL  r!  r]  rd  __all__r   r   r   r   <module>   sX   
P(q$F5!%[`L