o
    ß¥µi±× ã                   @   s<  d Z ddlZddlZddlmZ ddlZddlm  mZ	 ddl
ZddlZddlmZmZmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZmZ dd	lmZmZmZ dd
lmZmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ej# .¡  e# /¡ Z0dZ1dZ2dZ3dd„ Z4dd„ Z5G dd„ dej6ƒZ7G dd„ dej6ƒZ8G dd„ dej6ƒZ9G dd„ dej6ƒZ:G dd „ d ej6ƒZ;G d!d"„ d"ej6ƒZ<G d#d$„ d$ej6ƒZ=G d%d&„ d&ej6ƒZ>G d'd(„ d(ej6ƒZ?G d)d*„ d*ej6ƒZ@G d+d,„ d,ej6ƒZAG d-d.„ d.ej6ƒZBG d/d0„ d0ej6ƒZCG d1d2„ d2ej6ƒZDG d3d4„ d4ej6ƒZEG d5d6„ d6ej6ƒZFG d7d8„ d8eƒZGd9ZHd:ZIed;eHƒG d<d=„ d=eGƒƒZJG d>d?„ d?eGƒZKed@eHƒG dAdB„ dBeGƒƒZLG dCdD„ dDeGƒZMG dEdF„ dFeƒZNG dGdH„ dHeNƒZOG dIdJ„ dJeNƒZPG dKdL„ dLeNƒZQG dMdN„ dNeƒZRG dOdP„ dPeRƒZSG dQdR„ dReRƒZTdS )SzPyTorch MPLUG model. é    N)ÚTuple)ÚTensorÚdeviceÚnn)ÚCrossEntropyLoss)Ú
BertConfigÚBertTokenizer)ÚACT2FN)Úadd_code_sample_docstringsÚadd_start_docstringsÚ%add_start_docstrings_to_model_forwardÚreplace_return_docstrings)Ú)BaseModelOutputWithPastAndCrossAttentionsÚ,BaseModelOutputWithPoolingAndCrossAttentionsÚ!CausalLMOutputWithCrossAttentions)ÚPreTrainedModelÚapply_chunking_to_forwardÚ find_pruneable_heads_and_indicesÚprune_linear_layer)Úlogging)ÚHiTeAConfigÚMPlugConfig)ÚMViTv2ÚMViTv2_Base_config)ÚTextGenerator)Ú	ModelFilezconfig.yamlr   r   c                 C   s¦  zddl }ddl}ddl}W n ty   t d¡ ‚ w tj |¡}t 	d 
|¡¡ |j |¡}g }g }	|D ]\}
}t 	d 
|
|¡¡ |j ||
¡}| |
¡ |	 |¡ q6t||	ƒD ]ö\}
}|
 d¡}
tdd„ |
D ƒƒrxt 	d	 
d |
¡¡¡ qZ| }|
D ]|}| d
|¡r‹| d|¡}n|g}|d dksš|d dkr t|dƒ}nH|d dks¬|d dkr²t|dƒ}n6|d dkr¾t|dƒ}n*|d dkrÊt|dƒ}nz	t||d ƒ}W n tyç   t 	d	 
d |
¡¡¡ Y q|w t|ƒdkrøt|d ƒ}|| }q||dd… dkrt|dƒ}n
|dkr| |¡}z|j|jks'J d|j› d|j› dƒ‚W n tyA } z| j|j|jf7  _‚ d}~ww t 	d 
|
¡¡ t |¡|_qZ| S )z'Load tf checkpoints in a pytorch model.r   Nz™Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z(Converting TensorFlow checkpoint from {}z"Loading TF weight {} with shape {}ú/c                 s   s    | ]}|d v V  qdS ))Úadam_vÚadam_mÚAdamWeightDecayOptimizerÚAdamWeightDecayOptimizer_1Úglobal_stepN© )Ú.0Únr"   r"   úf/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/mplug/modeling_mplug.pyÚ	<genexpr>X   s   € ýz*load_tf_weights_in_bert.<locals>.<genexpr>zSkipping {}z[A-Za-z]+_\d+z_(\d+)ÚkernelÚgammaÚweightÚoutput_biasÚbetaÚbiasÚoutput_weightsÚsquadÚ
classifieré   é   iõÿÿÿÚ_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight {})ÚreÚnumpyÚ
tensorflowÚImportErrorÚloggerÚerrorÚosÚpathÚabspathÚinfoÚformatÚtrainÚlist_variablesÚload_variableÚappendÚzipÚsplitÚanyÚjoinÚ	fullmatchÚgetattrÚAttributeErrorÚlenÚintÚ	transposeÚshapeÚAssertionErrorÚargsÚtorchÚ
from_numpyÚdata)ÚmodelÚconfigÚtf_checkpoint_pathr3   ÚnpÚtfÚtf_pathÚ	init_varsÚnamesÚarraysÚnamerL   ÚarrayÚpointerÚm_nameÚscope_namesÚnumÚer"   r"   r%   Úload_tf_weights_in_bert;   s„   ÿû

ýþ€

ÿ€þrb   c                 C   sB   | j tjkrt | ¡ ¡ rt | j ¡jd }tj| | |d} | S )Niè  )ÚminÚmax)ÚdtyperO   Úfloat16ÚisinfrD   Úfinford   Úclamp)ÚtensorÚclamp_valuer"   r"   r%   Ú	clamp_inf…   s   rl   c                       s4   e Zd ZdZ‡ fdd„Z					ddd„Z‡  ZS )	ÚBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    sš   t ƒ  ¡  tj|j|j|jd| _t |j|j¡| _	t |j
|j¡| _tj|j|jd| _t |j¡| _|  dt |j¡ d¡¡ t|ddƒ| _|| _d S )N)Úpadding_idx©ÚepsÚposition_ids)r1   éÿÿÿÿÚposition_embedding_typeÚabsolute)ÚsuperÚ__init__r   Ú	EmbeddingÚ
vocab_sizeÚhidden_sizeÚpad_token_idÚword_embeddingsÚmax_position_embeddingsÚposition_embeddingsÚtype_vocab_sizeÚtoken_type_embeddingsÚ	LayerNormÚlayer_norm_epsÚDropoutÚhidden_dropout_probÚdropoutÚregister_bufferrO   ÚarangeÚexpandrG   rs   rS   ©ÚselfrS   ©Ú	__class__r"   r%   rv      s2   
ýÿÿÿþþ
zBertEmbeddings.__init__Nr   c                 C   sÂ   |d ur	|  ¡ }n|  ¡ d d… }|d }|d u r&| jd d …||| …f }|d u r5tj|tj| jjd}|d u r>|  |¡}|  |¡}|| }	| jdkrU|  	|¡}
|	|
7 }	|  
|	¡}	|  |	¡}	|	S )Nrr   r1   ©re   r   rt   )Úsizerq   rO   ÚzerosÚlongr   r{   r   rs   r}   r€   r„   )r‰   Ú	input_idsÚtoken_type_idsrq   Úinputs_embedsÚpast_key_values_lengthÚinput_shapeÚ
seq_lengthr   Ú
embeddingsr}   r"   r"   r%   Úforwardª   s0   

ÿÿÿ





zBertEmbeddings.forward)NNNNr   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__rv   r—   Ú__classcell__r"   r"   rŠ   r%   rm   Œ   s    ûrm   c                       sZ   e Zd Z‡ fdd„Zdd„ Zdd„ Zdd„ Zd	d
„ Zdd„ Z						ddd„Z	‡  Z
S )ÚBertSelfAttentionc                    s"  t ƒ  ¡  || _|j|j dkrt|dƒstd|j|jf ƒ‚|j| _t|j|j ƒ| _| j| j | _	t
 |j| j	¡| _|rQt
 |j| j	¡| _t
 |j| j	¡| _nt
 |j| j	¡| _t
 |j| j	¡| _t
 |j¡| _t|ddƒ| _| jdks{| jdkrŒ|j| _t
 d|j d	 | j¡| _d
| _d S )Nr   Úembedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)rs   rt   Úrelative_keyÚrelative_key_queryr0   r1   F)ru   rv   rS   ry   Únum_attention_headsÚhasattrÚ
ValueErrorrJ   Úattention_head_sizeÚall_head_sizer   ÚLinearÚqueryÚencoder_widthÚkeyÚvaluer‚   Úattention_probs_dropout_probr„   rG   rs   r|   rw   Údistance_embeddingÚsave_attention©r‰   rS   Úis_cross_attentionrŠ   r"   r%   rv   Ð   sB   
ÿ
þÿÿþþ
zBertSelfAttention.__init__c                 C   ó
   || _ d S ©N©Úattn_gradients)r‰   r³   r"   r"   r%   Úsave_attn_gradientsò   ó   
z%BertSelfAttention.save_attn_gradientsc                 C   ó   | j S r±   r²   ©r‰   r"   r"   r%   Úget_attn_gradientsõ   ó   z$BertSelfAttention.get_attn_gradientsc                 C   r°   r±   ©Úattention_map)r‰   r»   r"   r"   r%   Úsave_attention_mapø   rµ   z$BertSelfAttention.save_attention_mapc                 C   r¶   r±   rº   r·   r"   r"   r%   Úget_attention_mapû   r¹   z#BertSelfAttention.get_attention_mapc                 C   s6   |  ¡ d d… | j| jf }|j|Ž }| dddd¡S )Nrr   r   r0   r1   é   )r   r¡   r¤   ÚviewÚpermute)r‰   ÚxÚnew_x_shaper"   r"   r%   Útranspose_for_scoresþ   s
   ÿ
z&BertSelfAttention.transpose_for_scoresNFc                 C   sŠ  |   |¡}|d u}	|	r|  |  |¡¡}
|  |  |¡¡}|}n;|d urI|  |  |¡¡}
|  |  |¡¡}tj|d |
gdd}
tj|d |gdd}n|  |  |¡¡}
|  |  |¡¡}|  |¡}|
|f}t ||
 dd¡¡}t|ƒ}| j	dksz| j	dkrÛ| 
¡ d }tj|tj|jd	 dd¡}tj|tj|jd	 dd¡}|| }|  || j d ¡}|j|jd
}| j	dkrÂt d||¡}|| }n| j	dkrÛt d||¡}t d|
|¡}|| | }|t | j¡ }|d urë|| }tjdd|ƒ}|	r| jr|  |¡ | | j¡ |  |¡}|d ur|| }t ||¡}| dddd¡ ¡ }| 
¡ d d… | j f }|j|Ž }|r;||fn|f}||f }|S )Nr   r0   ©Údimr1   rr   éþÿÿÿrŸ   r    rŒ   ©re   zbhld,lrd->bhlrzbhrd,lrd->bhlrr¾   )!r§   rÃ   r©   rª   rO   ÚcatÚmatmulrK   rl   rs   r   r†   r   r   r¿   r¬   r|   Útore   ÚeinsumÚmathÚsqrtr¤   r   ÚSoftmaxr­   r¼   Úregister_hookr´   r„   rÀ   Ú
contiguousr¥   )r‰   Úhidden_statesÚattention_maskÚ	head_maskÚencoder_hidden_statesÚencoder_attention_maskÚpast_key_valueÚoutput_attentionsÚmixed_query_layerr¯   Ú	key_layerÚvalue_layerÚquery_layerÚattention_scoresr•   Úposition_ids_lÚposition_ids_rÚdistanceÚpositional_embeddingÚrelative_position_scoresÚrelative_position_scores_queryÚrelative_position_scores_keyÚattention_probsÚattention_probs_droppedÚcontext_layerÚnew_context_layer_shapeÚoutputsr"   r"   r%   r—     s¦   

ÿÿ

ÿþþþþÿÿ
ÿ

ÿÿÿ


ÿ
ÿÿÿ
zBertSelfAttention.forward©NNNNNF)r˜   r™   rš   rv   r´   r¸   r¼   r½   rÃ   r—   rœ   r"   r"   rŠ   r%   r   Î   s    "	ør   c                       ó$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚBertSelfOutputc                    sB   t ƒ  ¡  t |j|j¡| _tj|j|jd| _t |j	¡| _
d S ©Nro   )ru   rv   r   r¦   ry   Údenser€   r   r‚   rƒ   r„   rˆ   rŠ   r"   r%   rv   j  ó   
ÿzBertSelfOutput.__init__c                 C   s&   |   |¡}|  |¡}|  || ¡}|S r±   )rí   r„   r€   ©r‰   rÑ   Úinput_tensorr"   r"   r%   r—   q  s   

zBertSelfOutput.forward©r˜   r™   rš   rv   r—   rœ   r"   r"   rŠ   r%   rë   h  ó    rë   c                       s<   e Zd Zd	‡ fdd„	Zdd„ Z						d
dd„Z‡  ZS )ÚBertAttentionFc                    s,   t ƒ  ¡  t||ƒ| _t|ƒ| _tƒ | _d S r±   )ru   rv   r   r‰   rë   ÚoutputÚsetÚpruned_headsr®   rŠ   r"   r%   rv   z  s   

zBertAttention.__init__c                 C   s²   t |ƒdkrd S t|| jj| jj| jƒ\}}t| jj|ƒ| j_t| jj|ƒ| j_t| jj	|ƒ| j_	t| j
j|dd| j
_| jjt |ƒ | j_| jj| jj | j_| j |¡| _d S )Nr   r1   rÄ   )rI   r   r‰   r¡   r¤   rö   r   r§   r©   rª   rô   rí   r¥   Úunion)r‰   ÚheadsÚindexr"   r"   r%   Úprune_heads€  s   
þ
ÿzBertAttention.prune_headsNc              	   C   s<   |   |||||||¡}|  |d |¡}	|	f|dd …  }
|
S )Nr   r1   )r‰   rô   )r‰   rÑ   rÒ   rÓ   rÔ   rÕ   rÖ   r×   Úself_outputsÚattention_outputrè   r"   r"   r%   r—   “  s   
ù	
ÿzBertAttention.forward)Fré   )r˜   r™   rš   rv   rú   r—   rœ   r"   r"   rŠ   r%   ró   x  s    øró   c                       rê   )ÚBertIntermediatec                    sD   t ƒ  ¡  t |j|j¡| _t|jt	ƒrt
|j | _d S |j| _d S r±   )ru   rv   r   r¦   ry   Úintermediate_sizerí   Ú
isinstanceÚ
hidden_actÚstrr	   Úintermediate_act_fnrˆ   rŠ   r"   r%   rv   ®  s
   
zBertIntermediate.__init__c                 C   ó   |   |¡}|  |¡}|S r±   )rí   r  ©r‰   rÑ   r"   r"   r%   r—   ¶  ó   

zBertIntermediate.forwardrñ   r"   r"   rŠ   r%   rý   ¬  s    rý   c                       rê   )Ú
BertOutputc                    sB   t ƒ  ¡  t |j|j¡| _tj|j|jd| _t 	|j
¡| _d S rì   )ru   rv   r   r¦   rþ   ry   rí   r€   r   r‚   rƒ   r„   rˆ   rŠ   r"   r%   rv   ¾  rî   zBertOutput.__init__c                 C   s6   |   |¡}t|ƒ}|  |¡}t|ƒ}|  || ¡}|S r±   )rí   rl   r„   r€   rï   r"   r"   r%   r—   Å  s   

zBertOutput.forwardrñ   r"   r"   rŠ   r%   r  ¼  rò   r  c                       s<   e Zd Z‡ fdd„Z							d	dd„Zdd„ Z‡  ZS )
ÚFusionLayerc                    s^   t ƒ  ¡  || _t| jddƒ| _|j| _d| _t|ƒ| _t|dd| _	t
|ƒ| _t|ƒ| _d S )NÚstride_layeréd   r1   T©r¯   )ru   rv   rS   rG   r  Úchunk_size_feed_forwardÚseq_len_dimró   Ú	attentionÚcrossattentionrý   Úintermediater  rô   ©r‰   rS   Ú	layer_numrŠ   r"   r%   rv   Ð  s   


zFusionLayer.__init__NFc	                 C   s2  |d ur
|d d… nd }	|dks|| j  dkrO| j|||||	d}
|
d }|
dd… }|
d }|d us7J dƒ‚| j||||||d}|d }||dd…  }n/|dkr~|| j  dkr~| jt ||gd¡t ||gd¡|||	d}
|
d }|
dd… }|
d }t| j| j| j|ƒ}|f| }||d |d f }|S )	Nr0   r   ©r×   rÖ   r1   rr   ú>encoder_hidden_states must be given for cross-attention layers©r×   r¾   )	r  r  r  rO   rÈ   r   Úfeed_forward_chunkr  r  )r‰   rÑ   rÒ   rÓ   rÔ   rÕ   Ú
layer_numsrÖ   r×   Úself_attn_past_key_valueÚself_attention_outputsrü   rè   Úpresent_key_valueÚcross_attention_outputsÚlayer_outputr"   r"   r%   r—   Ü  sb   ÿÿÿûúÿûý
zFusionLayer.forwardc                 C   ó   |   |¡}|  ||¡}|S r±   ©r  rô   ©r‰   rü   Úintermediate_outputr  r"   r"   r%   r    ó   
zFusionLayer.feed_forward_chunk)NNNNNNF©r˜   r™   rš   rv   r—   r  rœ   r"   r"   rŠ   r%   r  Î  s    
÷=r  c                       s:   e Zd Z‡ fdd„Z						d	dd„Zdd„ Z‡  ZS )
Ú	BertLayerc                    sd   t ƒ  ¡  || _|j| _d| _t|ƒ| _t| jddƒ| _| jr&t|dd| _	t
|ƒ| _t|ƒ| _d S )Nr1   Úadd_cross_attentionFTr
  )ru   rv   rS   r  r  ró   r  rG   Úhas_cross_attentionr  rý   r  r  rô   r  rŠ   r"   r%   rv   !  s   

ÿÿ
zBertLayer.__init__NFc              	   C   s*  |d ur
|d d… nd }| j |||||d}	|	d }
|	dd… }|	d }| jrz|d us/J dƒ‚t|ƒtkrc| j|
|||| j| jj t|ƒ  || j| jj t|ƒ  |d}|d }
||dd…  }n| j|
|||||d}|d }
||dd…  }t	| j
| j| j|
ƒ}|f| }||d |d f }|S )Nr0   r  r   r1   rr   r  r  )r  r$  ÚtypeÚlistr  r  rS   Úfusion_layerrI   r   r  r  r  )r‰   rÑ   rÒ   rÓ   rÔ   rÕ   rÖ   r×   r  r  rü   rè   r  r  r  r"   r"   r%   r—   0  sv   ÿÿÿûÿþÿþöúÿÿý
zBertLayer.forwardc                 C   r  r±   r  r  r"   r"   r%   r  s  r   zBertLayer.feed_forward_chunkré   r!  r"   r"   rŠ   r%   r"    s    
øCr"  c                       ó8   e Zd Z‡ fdd„Z									ddd„Z‡  ZS )	ÚFusionEncoderc                    sH   t ƒ  ¡  ˆ | _t ‡ fdd„tˆ jƒD ƒ¡| _tdˆ jˆ j	 ƒ| _
d S )Nc                    ó   g | ]}t ˆ |ƒ‘qS r"   )r  ©r#   Úi©rS   r"   r%   Ú
<listcomp>  ó    z*FusionEncoder.__init__.<locals>.<listcomp>r   )ru   rv   rS   r   Ú
ModuleListÚrangeÚnum_hidden_layersÚlayerrd   Úfusion_layersÚstart_layerrˆ   rŠ   r-  r%   rv   {  s   
ÿ

ÿzFusionEncoder.__init__NFTc                    sˆ  |	rdnd }ˆ r
dnd }|rdnd }t | jddƒ| _|jd }|jd }t| jt| jƒƒD ]‹}| j| }|	r;||f }|d urC|| nd }|d urM|| nd ‰t | jddƒr|| jr||rbt	 
d¡ d}‡ ‡fdd	„}tjj ||ƒ||||||| j ¡}n|||||||| j ˆˆ ƒ}|d
 }|r—||d f7 }ˆ r ||d f }|jd || kr¸t |||fd¡\}}||7 }q-|	rÀ||f }||gS )Nr"   r  r	  r1   Úgradient_checkpointingFúh`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting `use_cache=False`...c                    ó   ‡ ‡‡fdd„}|S )Nc                     ó   t ˆ g | ¢ˆ‘ˆ‘R Ž ƒS r±   ©Útuple©Úinputs©Úmoduler×   rÖ   r"   r%   Úcustom_forwardª  ó   ÿzLFusionEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr"   ©r?  r@  r  ©r?  r%   Úcreate_custom_forward¨  ó   z4FusionEncoder.forward.<locals>.create_custom_forwardr   rr   )rG   rS   r  rL   r1  r5  rI   r3  Útrainingr7   ÚwarningrO   ÚutilsÚ
checkpointrC   )r‰   rÑ   rÒ   rÓ   rÔ   rÕ   Úpast_key_valuesÚ	use_cacher×   Úoutput_hidden_statesÚreturn_dictÚall_hidden_statesÚall_self_attentionsÚnext_decoder_cacheÚimage_lengthÚtext_lengthr,  Úlayer_moduleÚlayer_head_maskrD  Úlayer_outputsÚencoder_hidden_states_newr"   r  r%   r—   ƒ  s~   



ÿÿÿÿÿÿù
øÿ
ÿ€
zFusionEncoder.forward©	NNNNNNFFTrñ   r"   r"   rŠ   r%   r)  y  s    
ör)  c                       r(  )	ÚBertEncoderc                    s4   t ƒ  ¡  ˆ | _t ‡ fdd„tˆ jƒD ƒ¡| _d S )Nc                    r*  r"   )r"  r+  r-  r"   r%   r.  Û  r/  z(BertEncoder.__init__.<locals>.<listcomp>)ru   rv   rS   r   r0  r1  r2  r3  rˆ   rŠ   r-  r%   rv   ×  s
   

ÿzBertEncoder.__init__NFTc              	      s^  |	rdnd }ˆ r
dnd }ˆ r| j jrdnd }|rdnd }tt| jƒƒD ]k}| j| }|	r1||f }|d ur9|| nd }|d urC|| nd ‰t| j ddƒrn| jrn|rXt d¡ d}‡ ‡fdd„}t	j
j ||ƒ|||||¡}n
||||||ˆˆ ƒ}|d }|r…||d f7 }ˆ rŽ||d	 f }q#|	r–||f }|
s¦td
d„ |||||fD ƒƒS t|||||dS )Nr"   r6  Fr7  c                    r8  )Nc                     r9  r±   r:  r<  r>  r"   r%   r@    rA  zJBertEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr"   rB  r  rC  r%   rD     rE  z2BertEncoder.forward.<locals>.create_custom_forwardr   rr   r1   c                 s   s    | ]	}|d ur|V  qd S r±   r"   )r#   Úvr"   r"   r%   r&   !  s   € úúz&BertEncoder.forward.<locals>.<genexpr>)Úlast_hidden_staterJ  rÑ   Ú
attentionsÚcross_attentions)rS   r#  r1  rI   r3  rG   rF  r7   rG  rO   rH  rI  r;  r   )r‰   rÑ   rÒ   rÓ   rÔ   rÕ   rJ  rK  r×   rL  rM  rN  rO  Úall_cross_attentionsrP  r,  rS  rT  rD  rU  r"   r  r%   r—   Ý  sŒ   ÿÿÿ

ÿÿÿÿÿÿýù
ÿ€

ûûzBertEncoder.forwardrW  rñ   r"   r"   rŠ   r%   rX  Õ  s    örX  c                       rê   )Ú
BertPoolerc                    s*   t ƒ  ¡  t |j|j¡| _t ¡ | _d S r±   )ru   rv   r   r¦   ry   rí   ÚTanhÚ
activationrˆ   rŠ   r"   r%   rv   3  s   
zBertPooler.__init__c                 C   s(   |d d …df }|   |¡}|  |¡}|S )Nr   )rí   r`  )r‰   rÑ   Úfirst_token_tensorÚpooled_outputr"   r"   r%   r—   8  s   

zBertPooler.forwardrñ   r"   r"   rŠ   r%   r^  1  ó    r^  c                       rê   )ÚBertPredictionHeadTransformc                    sV   t ƒ  ¡  t |j|j¡| _t|jtƒrt	|j | _
n|j| _
tj|j|jd| _d S rì   )ru   rv   r   r¦   ry   rí   rÿ   r   r  r	   Útransform_act_fnr€   r   rˆ   rŠ   r"   r%   rv   C  s   
ÿz$BertPredictionHeadTransform.__init__c                 C   s"   |   |¡}|  |¡}|  |¡}|S r±   )rí   re  r€   r  r"   r"   r%   r—   M  s   


z#BertPredictionHeadTransform.forwardrñ   r"   r"   rŠ   r%   rd  A  s    
rd  c                       rê   )ÚBertLMPredictionHeadc                    sL   t ƒ  ¡  t|ƒ| _tj|j|jdd| _t 	t
 |j¡¡| _| j| j_d S )NF)r,   )ru   rv   rd  Ú	transformr   r¦   ry   rx   ÚdecoderÚ	ParameterrO   rŽ   r,   rˆ   rŠ   r"   r%   rv   V  s   


ÿzBertLMPredictionHead.__init__c                 C   r  r±   )rg  rh  r  r"   r"   r%   r—   d  r  zBertLMPredictionHead.forwardrñ   r"   r"   rŠ   r%   rf  T  s    rf  c                       rê   )ÚBertOnlyMLMHeadc                    s   t ƒ  ¡  t|ƒ| _d S r±   )ru   rv   rf  Úpredictionsrˆ   rŠ   r"   r%   rv   l  s   
zBertOnlyMLMHead.__init__c                 C   ó   |   |¡}|S r±   )rk  )r‰   Úsequence_outputÚprediction_scoresr"   r"   r%   r—   p  ó   
zBertOnlyMLMHead.forwardrñ   r"   r"   rŠ   r%   rj  j  ó    rj  c                       rê   )ÚBertOnlyNSPHeadc                    s   t ƒ  ¡  t |jd¡| _d S ©Nr0   )ru   rv   r   r¦   ry   Úseq_relationshiprˆ   rŠ   r"   r%   rv   w  s   
zBertOnlyNSPHead.__init__c                 C   rl  r±   )rs  )r‰   rb  Úseq_relationship_scorer"   r"   r%   r—   {  ro  zBertOnlyNSPHead.forwardrñ   r"   r"   rŠ   r%   rq  u  rp  rq  c                       rê   )ÚBertPreTrainingHeadsc                    s(   t ƒ  ¡  t|ƒ| _t |jd¡| _d S rr  )ru   rv   rf  rk  r   r¦   ry   rs  rˆ   rŠ   r"   r%   rv   ‚  s   

zBertPreTrainingHeads.__init__c                 C   s   |   |¡}|  |¡}||fS r±   )rk  rs  )r‰   rm  rb  rn  rt  r"   r"   r%   r—   ‡  s   

zBertPreTrainingHeads.forwardrñ   r"   r"   rŠ   r%   ru  €  rc  ru  c                   @   s*   e Zd ZdZeZeZdZdgZ	dd„ Z
dS )ÚBertPreTrainedModelz†
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    Úbertrq   c                 C   s~   t |tjtjfƒr|jjjd| jjd nt |tj	ƒr(|j
j ¡  |jj d¡ t |tjƒr;|j
dur=|j
j ¡  dS dS dS )z Initialize the weights g        )ÚmeanÚstdç      ð?N)rÿ   r   r¦   rw   r)   rQ   Únormal_rS   Úinitializer_ranger€   r,   Úzero_Úfill_)r‰   r?  r"   r"   r%   Ú_init_weights˜  s   ÿÿz!BertPreTrainedModel._init_weightsN)r˜   r™   rš   r›   r   Úconfig_classrb   Úload_tf_weightsÚbase_model_prefixÚ_keys_to_ignore_on_load_missingr  r"   r"   r"   r%   rv    s    rv  a  
    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
    pruning heads etc.)
    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
    general usage and behavior.
    Parameters:
        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
            weights.
a  
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
            Indices of input sequence tokens in the vocabulary.
            Indices can be obtained using :class:`~transformers.BertTokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            details.
            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            `What are attention masks? <../glossary.html#attention-mask>`__
        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
            1]``:
            - 0 corresponds to a `sentence A` token,
            - 1 corresponds to a `sentence B` token.
            `What are token type IDs? <../glossary.html#token-type-ids>`_
        position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
            config.max_position_embeddings - 1]``.
            `What are position IDs? <../glossary.html#position-ids>`_
        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
            vectors than the model's internal embedding lookup matrix.
        output_attentions (:obj:`bool`, `optional`):
            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
            tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`):
            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
            more detail.
        return_dict (:obj:`bool`, `optional`):
            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
z^The bare Bert Model transformer outputting raw hidden-states without any specific head on top.c                       ó¢   e Zd ZdZd‡ fdd„	Zdd„ Zdd„ Zd	d
„ Zee	 
d¡ƒeedeeddedee dededef
dd„ƒƒZ															ddd„Z‡  ZS )Ú	BertModelá=  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
    argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
    input to the forward pass.
    Tc                    sD   t ƒ  |¡ || _t|ƒ| _t|ƒ| _|rt|ƒnd | _|  	¡  d S r±   )
ru   rv   rS   rm   r–   rX  Úencoderr^  ÚpoolerÚinit_weights©r‰   rS   Úadd_pooling_layerrŠ   r"   r%   rv   ì  s   

zBertModel.__init__c                 C   ó   | j jS r±   ©r–   r{   r·   r"   r"   r%   Úget_input_embeddingsø  ó   zBertModel.get_input_embeddingsc                 C   ó   || j _d S r±   r  ©r‰   rª   r"   r"   r%   Úset_input_embeddingsû  ó   zBertModel.set_input_embeddingsc                 C   ó*   |  ¡ D ]\}}| jj| j |¡ qdS ©z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N©Úitemsr‡  r3  r  rú   ©r‰   Úheads_to_pruner3  rø   r"   r"   r%   Ú_prune_headsþ  ó   ÿzBertModel._prune_headsúbatch_size, sequence_lengthúbert-base-uncased©Úprocessor_classrI  Úoutput_typer€  rÒ   r”   r   Ú
is_decoderÚreturnc                 C   óX  |  ¡ dkr|dd…ddd…dd…f }nˆ|  ¡ dkr”|r‡|\}}tj||d}|dddd…f  ||d¡|ddd…df k}	|	 |j¡}	|	jd |jd k rl|jd |	jd  }
tjtj|||
f||	jd|	gdd}	|	dd…ddd…dd…f |dd…dddd…f  }n|dd…dddd…f }n	t	d	 
||j¡ƒ‚|j| jd
}d| d }|S ©aW  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (:obj:`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (:obj:`Tuple[int]`):
                The shape of the input to the model.
            device: (:obj:`torch.device`):
                The device of the input to the model.

        Returns:
            :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`.
        r¾   Nr0   ©r   r1   )r   re   rr   )ÚaxiszAWrong shape for input_ids (shape {}) or attention_mask (shape {})rÇ   rz  g     ˆÃÀ©rÅ   rO   r†   ÚrepeatrÊ   re   rL   rÈ   Úonesr£   r=   ©r‰   rÒ   r”   r   r¡  Úextended_attention_maskÚ
batch_sizer•   Úseq_idsÚcausal_maskÚprefix_seq_lenr"   r"   r%   Úget_extended_attention_mask  s\   ÿÿÿÿýûøÿþÿ
þ	ÿz%BertModel.get_extended_attention_maskNFc                    sŠ  |dur|nˆ j j}|dur|nˆ j j}|dur|nˆ j j}|r+|dur&|nˆ j j}nd}|dur9|dur9tdƒ‚|durI| ¡ }|\}}|j}n,|dur]| ¡ dd… }|\}}|j}n|durq| ¡ dd… }|\}}|j}ntdƒ‚|
dur‚|
d d jd nd}|du r“t	j
||| f|d}|du r t	j|t	j|d	}ˆ  ||||¡}|durît|ƒtkr¼|d  ¡ \}}}n| ¡ \}}}||f}t|	ƒtkr×‡ fd
d„|	D ƒ}n|	du rèt	j
||d}	ˆ  |	¡}nˆ  |	¡}nd}ˆ  |ˆ j j¡}|du rˆ j|||||d}n|}ˆ j||||||
||||d
}|d }ˆ jdur(ˆ  |¡nd}|s7||f|dd…  S t|||j|j|j|jdS )á~  
        encoder_hidden_states
        (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values
        (:obj:`tuple(tuple(torch.FloatTensor))` of length
         :obj:`config.n_layers` with each tuple having 4 tensors of shape
         :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        NFúDYou cannot specify both input_ids and inputs_embeds at the same timerr   úGYou have to specify either input_ids or inputs_embeds or encoder_embedsr   r0   r¥  rŒ   c                    ó   g | ]}ˆ   |¡‘qS r"   ©Úinvert_attention_mask©r#   Úmaskr·   r"   r%   r.  º  ó    ÿÿz%BertModel.forward.<locals>.<listcomp>©r   rq   r‘   r’   r“   ©	rÒ   rÓ   rÔ   rÕ   rJ  rK  r×   rL  rM  r1   ©rZ  Úpooler_outputrJ  rÑ   r[  r\  ©rS   r×   rL  Úuse_return_dictrK  r£   r   r   rL   rO   r©  rŽ   r   r°  r%  r&  r¶  Úget_head_maskr2  r–   r‡  rˆ  r   rJ  rÑ   r[  r\  ©r‰   r   rÒ   r‘   rq   rÓ   r’   Úencoder_embedsrÔ   rÕ   rJ  rK  r×   rL  rM  r¡  r”   r¬  r•   r   r“   r«  Úencoder_batch_sizeÚencoder_sequence_lengthÚ_Úencoder_hidden_shapeÚencoder_extended_attention_maskÚembedding_outputÚencoder_outputsrm  rb  r"   r·   r%   r—   T  sà   (þÿÿÿÿÿ
þÿÿÿ
ÿÿ
þÿÿÿÿ
ûöÿÿÿúzBertModel.forward©T©NNNNNNNNNNNNNNF©r˜   r™   rš   r›   rv   rŽ  r’  rš  r   ÚBERT_INPUTS_DOCSTRINGr=   r
   Ú_TOKENIZER_FOR_DOCr   Ú_CONFIG_FOR_DOCr   r   rJ   r   Úboolr°  r—   rœ   r"   r"   rŠ   r%   r…  Þ  sP    	ÿüÿÿþþHðr…  c                       r„  )ÚFusionModelr†  Tc                    s:   t ƒ  |¡ || _t|ƒ| _|rt|ƒnd | _|  ¡  d S r±   )ru   rv   rS   r)  r‡  r^  rˆ  r‰  rŠ  rŠ   r"   r%   rv     s
   
zFusionModel.__init__c                 C   rŒ  r±   r  r·   r"   r"   r%   rŽ  
  r  z FusionModel.get_input_embeddingsc                 C   r  r±   r  r‘  r"   r"   r%   r’    r“  z FusionModel.set_input_embeddingsc                 C   r”  r•  r–  r˜  r"   r"   r%   rš    r›  zFusionModel._prune_headsrœ  r  rž  rÒ   r”   r   r¡  r¢  c                 C   r£  r¤  r§  rª  r"   r"   r%   r°    s\   ÿÿÿÿýûøÿþÿ
þ	ÿz'FusionModel.get_extended_attention_maskNFc                    s~  |dur|nˆ j j}|dur|nˆ j j}|dur|nˆ j j}|r+|dur&|nˆ j j}nd}|dur9|dur9tdƒ‚|durI| ¡ }|\}}|j}n,|dur]| ¡ dd… }|\}}|j}n|durq| ¡ dd… }|\}}|j}ntdƒ‚|
dur‚|
d d jd nd}|du r“t	j
||| f|d}|du r t	j|t	j|d	}ˆ  ||||¡}|durît|ƒtkr¼|d  ¡ \}}}n| ¡ \}}}||f}t|	ƒtkr×‡ fd
d„|	D ƒ}n|	du rèt	j
||d}	ˆ  |	¡}nˆ  |	¡}nd}ˆ  |ˆ j j¡}|du rˆ j|||||d}n|}ˆ j||||||
||||d
}|\}}ˆ jdur(ˆ  |¡nd}|s1||gS t|||j|j|j|jdS )r±  NFr²  rr   r³  r   r0   r¥  rŒ   c                    r´  r"   rµ  r·  r·   r"   r%   r.  Ë  r¹  z'FusionModel.forward.<locals>.<listcomp>rº  r»  r¼  r¾  rÁ  r"   r·   r%   r—   g  sà   &þÿÿÿÿÿ
þÿÿÿ
ÿÿ
þÿÿÿÿ
ûöÿÿÿúzFusionModel.forwardrÊ  rË  rÌ  r"   r"   rŠ   r%   rÑ  ø  sP    	ÿûÿÿþþGñrÑ  zGBert Model with a `language modeling` head on top for CLM fine-tuning. c                       sš   e Zd ZdgZddgZ‡ fdd„Zdd„ Zdd	„ Zee	 
d
¡ƒeeed																			ddd„ƒƒZ		ddd„Zdd„ Z‡  ZS )ÚBertLMHeadModelrˆ  rq   úpredictions.decoder.biasc                    ó0   t ƒ  |¡ t|dd| _t|ƒ| _|  ¡  d S ©NF©r‹  ©ru   rv   r…  rw  rj  Úclsr‰  rˆ   rŠ   r"   r%   rv     ó   
zBertLMHeadModel.__init__c                 C   ó
   | j jjS r±   ©rØ  rk  rh  r·   r"   r"   r%   Úget_output_embeddings  rµ   z%BertLMHeadModel.get_output_embeddingsc                 C   ó   || j j_d S r±   rÛ  ©r‰   Únew_embeddingsr"   r"   r%   Úset_output_embeddings  ó   z%BertLMHeadModel.set_output_embeddingsrœ  )r   r€  NTrx  r   Fc                 C   sŽ  |dur|n| j j}|	durd}| j|||||||||
|||||d}|d }|  |¡}|r=|dd…dd…dd…f  ¡ S d}|	dur}|dd…dd…dd…f  ¡ }|	dd…dd…f  ¡ }	t|d}|| d| j j¡|	 d¡ƒ}| | d¡d¡ 	d¡}|dur£t
j	tj|dd| dd }||	d	k  	d¡}d| | ||  }|s¹|f|d
d…  }|dur·|f| S |S t|||j|j|j|jdS )av
  
        encoder_hidden_states
        (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
            ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``
        past_key_values
        (:obj:`tuple(tuple(torch.FloatTensor))` of length
         :obj:`config.n_layers` with each tuple having 4 tensors of shape
         :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        Returns:

        Example:
            >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
            >>> import torch
            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
            >>> config = BertConfig.from_pretrained("bert-base-cased")
            >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)
            >>> prediction_logits = outputs.logits
        NF©rÒ   r‘   rq   rÓ   r’   rÔ   rÕ   rJ  rK  r×   rL  rM  r¡  r   rr   r1   )Ú	reductionrÄ   éœÿÿÿr0   ©ÚlossÚlogitsrJ  rÑ   r[  r\  )rS   r¿  rw  rØ  rÐ   r   r¿   rx   r   ÚsumrO   ÚFÚlog_softmaxr   rJ  rÑ   r[  r\  ©r‰   r   rÒ   r‘   rq   rÓ   r’   rÔ   rÕ   ÚlabelsrJ  rK  r×   rL  rM  r¡  rã  Úsoft_labelsÚalphaÚreturn_logitsrè   rm  rn  Úlm_lossÚshifted_prediction_scoresÚloss_fctÚloss_distillrô   r"   r"   r%   r—   "  sr   @ò

ÿÿÿ
þþúzBertLMHeadModel.forwardc                 K   sV   |j }|d u r| |¡}|d ur|d d …dd …f }|||| dd ¡| dd ¡ddœS )Nrr   rÔ   rÕ   T)r   rÒ   rJ  rÔ   rÕ   r¡  )rL   Únew_onesÚget)r‰   r   ÚpastrÒ   Úmodel_kwargsr”   r"   r"   r%   Úprepare_inputs_for_generation  s   


ôz-BertLMHeadModel.prepare_inputs_for_generationc                    s.   d}|D ]}|t ‡ fdd„|D ƒƒf7 }q|S )Nr"   c                 3   s    | ]	}|  d ˆ ¡V  qdS )r   N)Úindex_select)r#   Ú
past_state©Úbeam_idxr"   r%   r&   ½  s
   € 
ÿ
ÿz1BertLMHeadModel._reorder_cache.<locals>.<genexpr>r:  )r‰   rö  rü  Úreordered_pastÚ
layer_pastr"   rû  r%   Ú_reorder_cacheº  s   þzBertLMHeadModel._reorder_cache©NNNNNNNNNNNNNNTrx  Nr   F)NN)r˜   r™   rš   Ú"_keys_to_ignore_on_load_unexpectedrƒ  rv   rÜ  rà  r   rÍ  r=   r   r   rÏ  r—   rø  rÿ  rœ   r"   r"   rŠ   r%   rÒ  
  sL    ÿÿþìx
ýrÒ  c                       sˆ   e Zd ZdgZddgZ‡ fdd„Zdd„ Zdd	„ Zee	 
d
¡ƒeedeed																			ddd„ƒƒZ‡  ZS )ÚBertPrefixModelrˆ  rq   rÓ  c                    rÔ  rÕ  r×  rˆ   rŠ   r"   r%   rv   Ê  rÙ  zBertPrefixModel.__init__c                 C   rÚ  r±   rÛ  r·   r"   r"   r%   rÜ  Ò  rµ   z%BertPrefixModel.get_output_embeddingsc                 C   rÝ  r±   rÛ  rÞ  r"   r"   r%   rà  Õ  rá  z%BertPrefixModel.set_output_embeddingsrœ  r  rž  NTrx  r   Fc                 C   sp  |d ur|n| j j}|	d urd}| j|||||||||
|||||d}|d }|  |¡}|r=|d d …d d…d d …f  ¡ S d }|	d uro|d d …d d…d d …f  ¡ }|	d d …dd …f  ¡ }	tƒ }|| d| j j¡|	 d¡ƒ}|d ur”tj	t
j|dd| dd }||	dk  ¡ }d| | ||  }|sª|f|dd …  }|d ur¨|f| S |S t|||j|j|j|jd	S )
NFrâ  r   rr   r1   rÄ   rä  r0   rå  )rS   r¿  rw  rØ  rÐ   r   r¿   rx   rO   rè  ré  rê  rx  r   rJ  rÑ   r[  r\  rë  r"   r"   r%   r—   Ø  sp   ò

ÿÿÿþþúzBertPrefixModel.forwardr   )r˜   r™   rš   r  rƒ  rv   rÜ  rà  r   rÍ  r=   r
   rÎ  r   rÏ  r—   rœ   r"   r"   rŠ   r%   r  Ã  sH    ÿÿüìr  c                       sŠ   e Zd ZeZ‡ fdd„Zeddd„ƒZeddd	„ƒZ	d
d„ Z
dd„ Zdd„ Ze ¡ dd„ ƒZe ¡ dd„ ƒZddd„Zedd„ ƒZ‡  ZS )ÚMPlugc                    sf   t ƒ  |¡ || _t tj |jt	j
¡¡| _|  |¡ |  |¡| _t| jdd| _t| jdd| _d S rÕ  )ru   rv   rS   r   Úfrom_pretrainedr9   r:   rE   Ú	model_dirr   Ú
VOCAB_FILEÚ	tokenizerÚmodule_settingÚ_initialize_clipÚvisual_encoderr…  Úconfig_encoderÚtext_encoderrÑ  Úconfig_fusionÚfusion_encoderrˆ   rŠ   r"   r%   rv   3  s   ÿ
ÿÿzMPlug.__init__NTc                 C   sÒ   ddl m} |jt|jt|jti}| j 	t
j |t¡¡}||_|d u r&|j}|| |ƒ}|rgt
j |tj¡}tj|dd}	d|	v rE|	d }	d|	v rM|	d }	dd„ |	 ¡ D ƒ}	|j|	d	d
}
td| ƒ t|
ƒ |S )Nr   ©ÚTasksÚcpu©Úmap_locationrR   r?  c                 S   ó   i | ]\}}|  d d¡|“qS ©zmodel.Ú ©Úreplace©r#   ÚkrY  r"   r"   r%   Ú
<dictcomp>V  ó    ÿÿz)MPlug.from_pretrained.<locals>.<dictcomp>F©Ústrictzload checkpoint from %s)Úmodelscope.utils.constantr  Úvisual_question_answeringÚMPlugForVisualQuestionAnsweringÚimage_captioningÚMPlugForImageCaptionÚimage_text_retrievalÚMPlugForImageTextRetrievalr€  Úfrom_yaml_filer9   r:   rE   ÚCONFIG_NAMEr  Útaskr   ÚTORCH_MODEL_BIN_FILErO   Úloadr—  Úload_state_dictÚprint)rØ  r  r(  Úload_checkpointr  Útask_mappingrS   rR   Úcheckpoint_pathrI  Úmsgr"   r"   r%   r  ?  s8   ýÿÿþzMPlug.from_pretrainedéð   c                 C   s®   dd„ }ddl m } | | ¡}d| jv r,t| j| j d ƒ}t t |d d¡ 	¡ ¡}nt| j| j d ƒ}t t |d d	¡ 	¡ ¡}||j
j d
¡| d
¡ƒ|_||j
_|S )Nc                 S   sÐ   |j d }| d d …d d…f | ddd …f }}|d8 }tt t|ƒ¡ƒ}tt |¡ƒ}| d||d¡ dddd¡}|j}tj	| 
¡ ||fdd}| |¡}| dddd¡ d|| d¡}tj||gdd} | S )	Nr1   r   rr   r¾   r0   Úbilinear)r   ÚmoderÄ   )rL   rJ   rÌ   rÍ   rI   ÚreshaperÀ   re   ré  ÚinterpolateÚfloatrÊ   rO   rÈ   )ÚposembÚ
posemb_newÚntok_newÚ
posemb_tokÚposemb_gridÚgs_oldÚgs_newÚorigr"   r"   r%   Úresize_pos_embedc  s(   
&
ÿÿÿ

ÿz0MPlug._initialize_clip.<locals>.resize_pos_embedr1   )ÚclipzViT-B-16é   i   éÄ   i   r   )r@  Úload_from_configÚ	clip_namerJ   Ú	image_resr   ri  rO   rŽ   r6  Úvisualrà   Ú	unsqueezer)   )rS   Únum_patchesr?  r@  Ú
clip_modelÚ	pos_embedr"   r"   r%   r	  `  s    

ÿþzMPlug._initialize_clipc                 C   sæ   |j | _ | j rq|  |¡| _t| jdd| _t| jdd| _t	| j
ƒ| _| j| jg| j| jg| j| jgg| _| jj|jkrht |j| jj¡| _tj| jjdd| _t | jj¡| _| j | j| jg| j| jgg¡ |  ¡  d| _d S d S ©NFrÖ  çê-™—q=ro   ç×£p=
×ï?)Údistillr	  Úvisual_encoder_mr…  r  Útext_encoder_mrÑ  r  Úfusion_encoder_mrÒ  Úconfig_decoderÚtext_decoder_mr
  r  Útext_decoderÚmodel_pairsry   Úvision_widthr   r¦   Ú	visn_fc_mr€   Úvisn_layer_norm_mr‚   rƒ   Ú	dropout_mÚextendÚvisn_fcÚvisn_layer_normÚcopy_paramsÚmomentumrˆ   r"   r"   r%   Úinit_distillˆ  s@   ÿÿ


ýÿÿÿ

ÿÿ
ézMPlug.init_distillc                 O   ó   t ‚r±   ©ÚNotImplementedError©r‰   rN   Úkwargsr"   r"   r%   r—   £  ó   zMPlug.forwardc                 C   s°   t j |j|j¡}t |¡| _| jj| j_	t |¡| _
t |¡| _d| j_| jj| j_	d| _| jj|jkrVt |j| jj¡| _tj| jjdd| _t | jj¡| _d| _d S d S )NTFrL  ro   )r9   r:   rE   r  Úbert_configr   Úfrom_json_filer  Útext_encoder_layersr2  r  rR  r#  Útext_decode_layersÚlargery   rV  r   r¦   r[  r€   r\  r‚   rƒ   r„   ©r‰   rS   Úbert_config_pathr"   r"   r%   r  ¦  s$   ÿÿ
úzMPlug.module_settingc                 C   óH   | j D ]}t|d  ¡ |d  ¡ ƒD ]\}}|j |j¡ d|_qqd S ©Nr   r1   F©rU  rB   Ú
parametersrQ   Úcopy_Úrequires_grad©r‰   Ú
model_pairÚparamÚparam_mr"   r"   r%   r]  ·  ó   

ÿýÿzMPlug.copy_paramsc                 C   óR   | j D ]#}t|d  ¡ |d  ¡ ƒD ]\}}|j| j |jd| j   |_qqd S ©Nr   r1   rz  ©rU  rB   rp  rQ   r^  rs  r"   r"   r%   Ú_momentum_update¿  ó   

ÿ
ÿþÿzMPlug._momentum_updater1   c                 C   ó$   ||g}| j j||d\}}||fS ©N©Úout_size©Úbeam_generatorÚtranslate_batch©r‰   Úquestion_statesÚquestion_attsr€  Úencoder_inputsÚtopk_idsÚtopk_scoresr"   r"   r%   Ú
generationÇ  ó
   
ÿzMPlug.generationc                    ón   dd l ‰|  |¡‰ dg|  ¡  }ˆ||< | j|Ž } t ˆ ‡ ‡‡fdd„tˆ ƒD ƒ¡¡}t | || 	| j
¡¡S )Nr   r1   c                    ó   g | ]}ˆ ˆ  ˆ¡ | ‘qS r"   ©r†   r+  ©Úinit_dimÚn_tilerU   r"   r%   r.  Ö  ó    zMPlug._tile.<locals>.<listcomp>©r4   r   rÅ   r¨  rO   Ú
LongTensorÚconcatenater1  rù  rÊ   r   ©rÁ   rÅ   r‘  Ú
repeat_idxÚorder_indexr"   r  r%   Ú_tileÍ  ó   

ÿÿzMPlug._tile©NT)r1  ©r1   )r˜   r™   rš   r   r€  rv   Úclassmethodr  Ústaticmethodr	  r_  r—   r  rO   Úno_gradr]  r{  rŠ  r™  rœ   r"   r"   rŠ   r%   r  0  s"     '


r  c                       ó0   e Zd Z‡ fdd„Z					ddd„Z‡  ZS )	r!  c                    ó4   t ƒ  |¡ t| jƒ| _t|| jƒ| _|  |¡ d S r±   ©ru   rv   rÒ  rR  rT  r   r‚  r_  rˆ   rŠ   r"   r%   rv   Ü  ó   z(MPlugForVisualQuestionAnswering.__init__Nr   Tc           !      C   s.  |j t|  ¡ ƒjd}| jj|dd}| jr!|  |  |  	|¡¡¡}t
j| ¡ d d… t
jd  |j¡}	|r^	 |j |j| jjkd¡}
| j|j|jdd}|j}| j||j||	dd}|\}}t
 ||gd	¡}t
 |	|jgd	¡}|d u r|d	g|jd
  }g }g }t|ƒD ]\}}||| g| 7 }||| g| 7 }q„t
 |d
¡}t
 |d
¡}| jr9t
 ¡ l |  ¡  | jj|dd}| jrÊ|  |   |  !|¡¡¡}| j"|j|jdd}|j}| j#||j||	dd}|\}}t
 ||gd	¡}g }t|ƒD ]\}}||| g| 7 }qôt
 |d
¡}| j$|j|j||dd}W d   ƒ n	1 sw   Y  | j%|j|j|||
dt&j'|dddd}n| j%|j|j|||
ddd}|d u rNd	}||j( }| )¡ | d
¡ }|S | j|j|jdd}|j}| j||j||	dd}|\}}t
 ||gd	¡}t
 |	|jgd	¡}|  *||¡\}} || fS )NrÇ   T©Úskip_last_layerrr   rä  ©rÒ   rM  F©rÂ  rÒ   rÔ   rÕ   rM  r1   r   ©rÒ   rÔ   rÕ   rï  rÄ   Únone©rÒ   rÔ   rÕ   rì  rM  rí  rã  ©rÒ   rÔ   rÕ   rì  rM  rã  )+rÊ   Únextrp  re   r
  rF  rj  r„   r\  r[  rO   r©  r   r   r   r   Úmasked_fillr  rz   r  rÒ   rZ  r  rÈ   rL   Ú	enumerateÚstackrN  rŸ  r{  rO  rY  rX  rW  rP  rQ  rS  rT  ré  Úsoftmaxræ  rè  rŠ  )!r‰   ÚimageÚquestionÚanswerrî  r  Úweightsr>   Úimage_embedsÚ
image_attsÚanswer_targetsÚtext_outputÚtext_embedsÚfusion_outputÚimage_outputÚquestion_outputÚmerge_text_attentionr…  r†  Úbr$   Úimage_embeds_mÚtext_output_mÚtext_embeds_mÚfusion_output_mÚimage_output_mÚquestion_output_mÚquestion_states_mÚlogits_mÚanswer_outputræ  rˆ  Ú
topk_probsr"   r"   r%   r—   â  s  ÿÿÿÿýû
ÿ
ÿÿÿýûÿûã%øù
	
ýû
ÿÿz'MPlugForVisualQuestionAnswering.forward©Nr   NNTrñ   r"   r"   rŠ   r%   r!  Ú  ó    	ùr!  c                       ó>   e Zd Z‡ fdd„Z			ddd„Z				dd	d
„Z‡  ZS )r#  c                    ó*   t ƒ  |¡ t| jƒ| _t|| jƒ| _d S r±   ©ru   rv   r  rR  rT  r   r‚  rˆ   rŠ   r"   r%   rv   m  ó   zMPlugForImageCaption.__init__NTé   c                 C   sÆ   | j j|dd}| jr|  |  |  |¡¡¡}tj| ¡ d d… tj	d 
|j¡}| j|j|jdd}|j}	| j|	|j||dd}
|
\}}t ||gd¡}t ||jgd¡}| j|||d	\}}||fS )
NTr¤  rr   rÇ   r¦  Fr§  r1   r  )r
  rF  rj  r„   r\  r[  rO   r©  r   r   rÊ   r   r  r   rÒ   rZ  r  rÈ   rŠ  )r‰   r±  r²  r³  r>   r€  rµ  r¶  r¸  r¹  rº  r»  r¼  r½  rˆ  rÈ  r"   r"   r%   Úbeam_searchr  s@   ÿÿÿýûÿ
ÿz MPlugForImageCaption.beam_searchFc              	   C   sØ   |r| j |||d|dS |jt|  ¡ ƒjd}| jj|dd}| jr-|  |  	|  
|¡¡¡}tj| ¡ d d… tjd |j¡}|r`|j |j| jjkd¡}	| j|j|j|||	ddd}
|
j}|S |  ||¡\}}||fS )	NT©r>   r€  rÇ   r¤  rr   rä  r©  r«  )rÐ  rÊ   r¬  rp  re   r
  rF  rj  r„   r\  r[  rO   r©  r   r   r   r   r­  r  rz   rT  rÒ   ræ  rŠ  )r‰   r±  r²  r³  r>   r€  Úscstrµ  r¶  r·  rÇ  ræ  rˆ  rÈ  r"   r"   r%   r—   ‘  s@   
ÿÿÿÿÿùzMPlugForImageCaption.forward©NTrÏ  ©NTrÏ  F©r˜   r™   rš   rv   rÐ  r—   rœ   r"   r"   rŠ   r%   r#  k  s    
û"úr#  c                       s>   e Zd Z‡ fdd„Zdd„ Ze ¡ dd„ ƒZdd	d
„Z‡  Z	S )r%  c                    s&  t ƒ  |¡ |j| _t t g ¡|j ¡| _|j| _|j	| _	|j
| _
|j| _| jj| _|j| _t | j| j¡| _t | j| j¡| _t | jd¡| _|  dt | j| j¡¡ |  dt | j| j¡¡ |  dt d| jfd¡¡ |  dtjdtjd¡ tj| jd	d
| _tj| jd	d
| _|  |¡ d S )Nr0   Úimage_queueÚ
text_queueÚ	idx_queuer1   rä  Ú	queue_ptrrÇ   r   rÄ   )ru   rv   Ú	embed_dimr   ri  rO   r©  ÚtempÚ
queue_sizer^  rî  r  ry   Ú
text_widthr¦   Úvision_projÚ	text_projÚitm_headr…   ÚrandnÚfullrŽ   r   ré  Ú	normalizerÖ  r×  r_  rˆ   rŠ   r"   r%   rv   ¸  s2   
ÿÿÿz#MPlugForImageTextRetrieval.__init__c                 C   s  |j | _ | j r‚|  |¡| _t| jdd| _t| jdd| _t	 
| j| j¡| _t	 
| j| j¡| _| j| jg| j| jg| j| jg| j| jgg| _| jj|jkryt	 
|j| jj¡| _t	j| jjdd| _t	 | jj¡| _| j | j| jg| j| jgg¡ |  ¡  d| _ d S d S rK  )!rN  r	  rO  r…  r  rP  rÑ  r  rQ  r   r¦   rÝ  rÚ  Úvision_proj_mÚtext_proj_mr
  r  rß  rÞ  rU  ry   rV  rW  r€   rX  r‚   rƒ   rY  rZ  r[  r\  r]  r^  rˆ   r"   r"   r%   r_  Ô  sD   ÿÿ


üÿÿÿ

ÿÿ
çz'MPlugForImageTextRetrieval.init_distillc           
      C   s¤   dd„ }||ƒ}||ƒ}||ƒ}|j d }t| jƒ}	|j| jd d …|	|	| …f< |j| jd d …|	|	| …f< |j| jd d …|	|	| …f< |	| | j }	|	| jd< d S )Nc                    sN   t j ¡ sˆ S ‡ fdd„tt j ¡ ƒD ƒ}t jj|ˆ dd t j|dd}|S )z›
            Performs all_gather operation on the provided tensors.
            *** Warning ***: torch.distributed.all_gather has no gradient.
            c                    s   g | ]}t  ˆ ¡‘qS r"   )rO   Ú	ones_like)r#   rÅ  ©rj   r"   r%   r.  û  r¹  z^MPlugForImageTextRetrieval._dequeue_and_enqueue.<locals>.concat_all_gather.<locals>.<listcomp>F)Úasync_opr   rÄ   )rO   ÚdistributedÚis_initializedr1  Úget_world_sizeÚ
all_gatherrÈ   )rj   Útensors_gatherrô   r"   rç  r%   Úconcat_all_gatherô  s   

þÿzJMPlugForImageTextRetrieval._dequeue_and_enqueue.<locals>.concat_all_gatherr   )rL   rJ   rÙ  ÚTrÖ  r×  rØ  rÜ  )
r‰   Ú
image_featÚ	text_featÚidxrî  Úimage_featsÚ
text_featsÚidxsr¬  Úptrr"   r"   r%   Ú_dequeue_and_enqueueñ  s   

z/MPlugForImageTextRetrieval._dequeue_and_enqueueNTc           6   	   C   s(  |rÄ| j j|dd}| jr|  |  |  |¡¡¡}tj| ¡ d d… tj	d 
|j¡}tj|  |d d …dd d …f ¡dd}| j|j|jdd}|j}	tj|  |	d d …dd d …f ¡dd}
| dd¡}tj| ¡ | j ¡  ¡ gdd}t ||¡ ¡ }||jddd	 }t ¡ § |  ¡  | jj|dd}| jr¤|   |  !|  "|¡¡¡}tj|  #|d d …dd d …f ¡dd}tj| ¡ | j$ ¡  ¡ gdd}| j%|j|jdd}tj|  &|jd d …dd d …f ¡dd}tj| ¡ | j' ¡  ¡ gdd}| j(r)|| | j) }|| | j) }| j*tj+|dd d| j* |  }| j*tj+|dd d| j* |  }W d   ƒ n	1 s4w   Y  || | j) }|
| | j) }| j(rntjtj,|dd| dd -¡  }tjtj,|dd| dd -¡  }n"tjtj,|dd| dd -¡  }tjtj,|dd| dd -¡  }|| d
 }|  .|||¡ | j/|	|j||dd\}}t ¡ > | d¡}tj+|d d …d |…f dd} tj+|d d …d |…f dd}!t ||j0¡}"|  1|"d¡ |! 1|"d¡ W d   ƒ n	1 sðw   Y  g }#t2|ƒD ]}$t 3|!|$ d¡ 4¡ }%|# 5||% ¡ qûtj6|#dd}#g }&g }'t2|ƒD ]}$t 3| |$ d¡ 4¡ }%|& 5|	|% ¡ |' 5|j|% ¡ qtj6|&dd}&tj6|'dd}'tj|	|&gdd}(tj|j|'gdd})tj|#|gdd}*tj||gdd}+| j/|(|)|*|+dd\}},tj|d d …dd d …f |,d d …dd d …f gdd}-|  7|-¡}.tj|tj	d}/tj8d
| tj	d}0tj|/|0gdd 
|j¡}1t 9|.|1¡}2||2 S | j|j|jd}|j}
| j j|dd}|  |  |¡¡}tj| ¡ d d… tj	|jd}3| j/|
|j||3dd\}}4|  7|4d d …dd d …f ¡}5tj+|5dd}5|5S )NTr¤  rr   rÇ   r   rÄ   r¦  r1   )Úkeepdimr0   Fr§  )rÒ   rŒ   ):r
  rF  rj  r„   r\  r[  rO   r©  r   r   rÊ   r   ré  rã  rÞ  r  r   rÒ   rZ  rß  r¿   rÈ   ÚtrØ  ÚcloneÚdetachÚeqr6  rè  rŸ  r{  rO  rY  rX  rW  rä  rÖ  rP  rå  r×  rN  rÛ  rî  r°  rê  rx  r÷  r  rï  Úmasked_fill_r1  ÚmultinomialÚitemrA   r¯  rà  rŽ   Úcross_entropy)6r‰   r±  Útextrò  r>   rµ  r¶  rð  r¸  r¹  rñ  Úidx_allÚpos_idxÚsim_targetsr¿  Úimage_feat_mÚimage_feat_allrÀ  Útext_feat_mÚtext_feat_allÚ	sim_i2t_mÚ	sim_t2i_mÚsim_i2t_targetsÚsim_t2i_targetsÚsim_i2tÚsim_t2iÚloss_i2tÚloss_t2iÚloss_itarÅ  Ú
output_posÚbsÚweights_i2tÚweights_t2ir¸  Úimage_embeds_negr¾  Úneg_idxÚtext_embeds_negÚtext_atts_negÚtext_embeds_allÚtext_atts_allÚimage_embeds_allÚimage_atts_allÚ
output_negÚvl_embeddingsÚ	vl_outputÚones_tmpÚ	zeros_tmpÚ
itm_labelsÚloss_itmÚ	image_attrô   Úscoresr"   r"   r%   r—   	  sr  ÿÿÿÿÿýÿÿ
ÿÿÿÿýýþÿþÿÿÿÿ€â!þþþþÿÿÿÿ
û

ù
ÿ
ÿ
û,ÿ

ÿÿÿÿý
ûz"MPlugForImageTextRetrieval.forwardr›  )
r˜   r™   rš   rv   r_  rO   rŸ  r÷  r—   rœ   r"   r"   rŠ   r%   r%  ¶  s    
%r%  c                       s|   e Zd ZeZ‡ fdd„Zeddd„ƒZdd„ Zdd	„ Z	d
d„ Z
e ¡ dd„ ƒZe ¡ dd„ ƒZddd„Zedd„ ƒZ‡  ZS )ÚHiTeAc                    sn   t ƒ  |¡ || _t tj |jt	j
¡¡| _|  |¡ t|jt|jd| _t| jdd| _t| jdd| _d S )N©Úimg_sizerS   Ú
num_framesFrÖ  )ru   rv   rS   r   r  r9   r:   rE   r  r   r  r  r  r   rE  r   r*  r
  r…  r  r  rÑ  r  r  rˆ   rŠ   r"   r%   rv   ¾	  s"   ÿ
ýÿÿzHiTeA.__init__Tc           	      C   s¬   ddl m} |jt|jti}| j tj	 
|t¡¡}||_||j |ƒ}|rTtj	 
|tj¡}tj|dd}d|v r<|d }d|v rD|d }dd„ | ¡ D ƒ}|j|d	d
 |S )Nr   r  r  r  rR   r?  c                 S   r  r  r  r  r"   r"   r%   r  á	  r  z)HiTeA.from_pretrained.<locals>.<dictcomp>Fr  )r  r  Úvideo_question_answeringÚHiTeAForVideoQuestionAnsweringÚvideo_captioningÚHiTeAForVideoCaptionr€  r&  r9   r:   rE   r'  r  r(  r   r)  rO   r*  r—  r+  )	rØ  r  r-  r  r.  rS   rR   r/  rI  r"   r"   r%   r  Í	  s.   þÿÿþzHiTeA.from_pretrainedc                 C   sˆ   |j | _ | j rBt|jt|jd| _t| jdd| _t	| j
dd| _t| jƒ| _| j| jg| j| jg| j| jgg| _|  ¡  d| _d S d S )Nr(  FrÖ  rM  )rN  r   rE  r   r*  rO  r…  r  rP  rÑ  r  rQ  rÒ  rR  rS  r
  r  rT  rU  r]  r^  rˆ   r"   r"   r%   r_  é	  s*   ýÿÿ


ý
ðzHiTeA.init_distillc                 O   r`  r±   ra  rc  r"   r"   r%   r—   ý	  re  zHiTeA.forwardc                 C   sZ   t j |j|j¡}t |¡| _| jj| j_	t |¡| _
t |¡| _d| j_| jj| j_	d S r›  )r9   r:   rE   r  rf  r   rg  r  rh  r2  r  rR  r#  ri  rk  r"   r"   r%   r   
  s   zHiTeA.module_settingc                 C   rm  rn  ro  rs  r"   r"   r%   r]  	
  rw  zHiTeA.copy_paramsc                 C   rx  ry  rz  rs  r"   r"   r%   r{  
  r|  zHiTeA._momentum_updater1   c                 C   r}  r~  r  r„  r"   r"   r%   rŠ  
  r‹  zHiTeA.generationc                    rŒ  )Nr   r1   c                    r  r"   rŽ  r+  r  r"   r%   r.  (
  r’  zHiTeA._tile.<locals>.<listcomp>r“  r–  r"   r  r%   r™  
  rš  zHiTeA._tilerÊ  rœ  )r˜   r™   rš   r   r€  rv   r  r  r_  r—   r  rO   rŸ  r]  r{  rŠ  rž  r™  rœ   r"   r"   rŠ   r%   r'  »	  s    	


r'  c                       r   )	r,  c                    r¡  r±   r¢  rˆ   rŠ   r"   r%   rv   .
  r£  z'HiTeAForVideoQuestionAnswering.__init__Nr   Tc           !      C   sè  |j t|  ¡ ƒjd}|  |¡}tj| ¡ d d… tjd  |j	¡}	|r;	 |j
 |j
| jjkd¡}
| j|j
|jdd}|j}| j||j||	dd}|\}}t ||gd¡}t |	|jgd¡}|d u rkdg|jd	  }g }g }t|ƒD ]\}}||| g| 7 }||| g| 7 }qst |d	¡}t |d	¡}| jrt ¡ [ |  ¡  |  |¡}| j|j
|jdd}|j}| j||j||	dd}|\}}t ||gd¡}g }t|ƒD ]\}}||| g| 7 }qÒt |d	¡}| j|j
|j||dd
}W d   ƒ n1 süw   Y  | j|j
|j|||
dtj|dddd}n| j|j
|j|||
ddd}|d u r+d}||j  }| !¡ | d	¡ }|S | j|j
|jdd}|j}| j||j||	dd}|\}}t ||gd¡}t |	|jgd¡}|  "||¡\}} || fS )NrÇ   rr   rä  Tr¦  Fr§  r1   r   r¨  rÄ   r©  rª  r«  )#rÊ   r¬  rp  re   r
  rO   r©  r   r   r   r   r­  r  rz   r  rÒ   rZ  r  rÈ   rL   r®  r¯  rN  rŸ  r{  rO  rP  rQ  rS  rT  ré  r°  ræ  rè  rŠ  )!r‰   Úvideor²  r³  rî  r  r´  r>   Úvideo_embedsÚ
video_attsr·  r¸  r¹  rº  Úvideo_outputr¼  r½  r…  r†  r¾  r$   Úvideo_embeds_mrÀ  rÁ  rÂ  rÃ  rÄ  rÅ  rÆ  rÇ  ræ  rˆ  rÈ  r"   r"   r%   r—   4
  sì   
ÿÿÿýû
ÿ

ýûÿûè øù
	
ýû
ÿÿz&HiTeAForVideoQuestionAnswering.forwardrÉ  rñ   r"   r"   rŠ   r%   r,  ,
  rÊ  r,  c                       rË  )r.  c                    rÌ  r±   rÍ  rˆ   rŠ   r"   r%   rv   ·
  rÎ  zHiTeAForVideoCaption.__init__NTrÏ  c                 C   s¤   |   |¡}tj| ¡ d d… tjd |j¡}| j|j|j	dd}|j
}	| j|	|j	||dd}
|
\}}t ||gd¡}t ||j	gd¡}| j|||d\}}||fS )	Nrr   rÇ   Tr¦  Fr§  r1   r  )r
  rO   r©  r   r   rÊ   r   r  r   rÒ   rZ  r  rÈ   rŠ  )r‰   r/  r²  r³  r>   r€  r0  r1  r¸  r¹  rº  r2  r¼  r½  rˆ  rÈ  r"   r"   r%   rÐ  ¼
  s8   
ÿÿýûÿ
ÿz HiTeAForVideoCaption.beam_searchFc              	   C   s¶   |r| j |||d|dS |jt|  ¡ ƒjd}|  |¡}tj| ¡ d d… tj	d |j
¡}|rO|j |j| jjkd¡}	| j|j|j|||	ddd}
|
j}|S |  ||¡\}}||fS )NTrÑ  rÇ   rr   rä  r©  r«  )rÐ  rÊ   r¬  rp  re   r
  rO   r©  r   r   r   r   r­  r  rz   rT  rÒ   ræ  rŠ  )r‰   r/  r²  r³  r>   r€  rÒ  r0  r1  r·  rÇ  ræ  rˆ  rÈ  r"   r"   r%   r—   Ø
  s8   
ÿ
ÿÿÿùzHiTeAForVideoCaption.forwardrÓ  rÔ  rÕ  r"   r"   rŠ   r%   r.  µ
  s    
ûúr.  )Ur›   rÌ   r9   Útypingr   rO   Útorch.nn.functionalr   Ú
functionalré  Útorch.utils.checkpointÚtransformersr   r   Útorch.nnr   r   r   Útransformers.activationsr	   Útransformers.file_utilsr
   r   r   r   Útransformers.modeling_outputsr   r   r   Útransformers.modeling_utilsr   r   r   r   Útransformers.utilsr   Ú7modelscope.models.multi_modal.mplug.configuration_mplugr   r   Ú(modelscope.models.multi_modal.mplug.mvitr   r   Ú-modelscope.models.multi_modal.mplug.predictorr   r  r   Úset_verbosity_errorÚ
get_loggerr7   r'  rÏ  rÎ  rb   rl   ÚModulerm   r   rë   ró   rý   r  r  r"  r)  rX  r^  rd  rf  rj  rq  ru  rv  ÚBERT_START_DOCSTRINGrÍ  r…  rÑ  rÒ  r  r  r!  r#  r%  r'  r,  r.  r"   r"   r"   r%   Ú<module>   s˜   
JB 4QZ\\*þ    þ 7m + K  q 