o
    ߥi/A                     @   s6  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZmZ d d	lmZmZm Z  d d
l!m"Z" e" Z#dgZ$G dd de%Z&G dd deZ'ej(e j)ej*dG dd deZ+			dddZ,dS )    N)	roi_align)Models)
TorchModel)MODELS)FPNTrans)LayoutRobertaModelLayoutRobertaPreTrainedModel)TransformerDecoderTransformerDecoderLayer)ModeKeys	ModelFileTasks)
get_loggerVLDocForDocVLEmbeddingc                   @   s   e Zd Zdd ZdS )GeoVLDocModelOutputsc                 C   s"   || _ || _|| _|| _|| _d S )Ntext_featurestext_mm_featuresblock_vis_featuresblock_vis_mm_featuresimage_mm_features)selfr   r   r   r   r    r   ]/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/vldoc/model.py__init__"   s
   	
zGeoVLDocModelOutputs.__init__N)__name__
__module____qualname__r   r   r   r   r   r       s    r   c                       sZ   e Zd Zd
 fdd	ZdefddZ																		ddd	Z  ZS )GeoVLDocModelFc                    s   t  | || _|| _t| jdd r'| jjd dkr!t|| _nt|| _nt|| _t| jj	dd| _
tddg| _td| jj| _t| jj| jj| jjdd	}t|d| _t| jj| jj| jjdd	}t|d| _|   d S )
Narchitecturesr   r   F)img_size	inner_vit      T)	self_attn)superr   confighard_negtive_samplinggetattrr   r   text_encoderr   
image_sizevisual_encodernnAdaptiveAvgPool2dpoolLinearhidden_size
vis_linearr
   num_attention_headsintermediate_sizer	   cross_modal_textcross_modal_visualinit_weights)r   r&   r'   cross_modal_text_layercross_modal_visual_layer	__class__r   r   r   8   s<   
zGeoVLDocModel.__init__	ckpt_pathc                 C   sF   t j|dd}i }| D ]\}}|dd}|||< q| | d S )Ncpumap_locationgeo_vl_doc_model. )torchloaditemsreplaceload_state_dict)r   r;   
state_dictstate_dict_newkvr   r   r   from_pretrainedZ   s   
zGeoVLDocModel.from_pretrainedNc           +      K   s  |j \}}|d ur|n| jj}||d< | jjd dkr/| j|f||||	|
||||d	|}n| j|f||||	|
||||d	|}|d d \}}|j \}}tjd||jd|d	||}|||f }|j \}}}| 
|}tjd||jd|d	||d}t||fd|| d	j|d
 jd} |d
 jtjkrt|d
 tj| tjd|d
 dd d}!|!j|d
 jd}!nt|d
 | tjd|d
 dd d}!|!dd||d}!| |!}!|!|d }!| |d
 dd}"| |"d}"t|"|!fd}#t|df|j}$t|$|fd}%d| dk}&d|% dk}'| j|dd|#dd|&|'d}(| j|#dd|dd|'|&d})|(dd}(|)dd})|)d d dd f }*t||(|!|*|)dS )N	line_bboxr   r   )	bboxattention_masktoken_type_idsposition_ids	head_maskinputs_embedsoutput_attentionsoutput_hidden_statesreturn_dict   )devicer"      feat_ms)dtypeg     @@)spatial_scaler#   )tgtmemorytgt_key_padding_maskmemory_key_padding_maskr   )shaper&   use_return_dictr   r)   rA   arangerV   reshapeexpandr+   	unsqueezecattorZ   float16r   float32sizesqueezer1   r.   onesr4   	transposer5   r   )+r   	input_idsimagerL   bbox_4p_normalizedrM   first_token_idxesfirst_token_idxes_maskrN   rO   rP   rQ   encoder_hidden_statesencoder_attention_maskpast_key_values	use_cacherR   rS   rT   kwargs
batch_sizeseq_lenoutputssequence_outputpooled_output_	num_firstB_batch_dimfeature_bbox	block_num
visual_out
batch_idxsbatch_idx_with_bboxblk_vis_featuresfull_img_featuresvis_inpsglb_feat_attnvis_masknew_attention_masknew_vis_masktext_mm_featvis_mm_featr   r   r   r   forwardb   s   










zGeoVLDocModel.forward)FNNNNNNNNNNNNNNNNNN)r   r   r   r   strrJ   r   __classcell__r   r   r9   r   r   6   s,    "	r   )module_namec                       sT   e Zd ZdZdef fddZ																		dddZ  ZS )	r   z
    Generate multi-modal document embeddings in segment-level and token-level.

    Args:
        model_dir:
            the path in model hub, e.g., 'damo/multi-modal_convnext-roberta-base_vldoc-embedding'
    	model_dirc           	   	      sF  t  j|d|i| ddlm} tj|d}td	| tj
|s(J ||| _t| j| _tj|tj}tj
|sDJ | j| td	| ddlm} tj|tj}||| _tj rwd	ttjd	dnd
| _tj r| j| j td	ttjd	d d S | j  td d S )Nr   r   )LayoutRobertaConfigzconfig.jsonzLoading config file from {}zLoading model from {})VLDocXLMTokenizerzcuda:{}
LOCAL_RANKr<   z%Use GPU {} for finetuning & inferencez"Use CPU for finetuning & inference)r%   r   ;modelscope.models.multi_modal.vldoc.modeling_layout_robertar   ospathjoinloggerinfoformatexistsfrom_json_filer&   r   	doc_modelr   TORCH_MODEL_FILErJ   0modelscope.models.multi_modal.vldoc.tokenizationr   TOKENIZER_FOLDER	tokenizerrA   cudais_availableintenvirongetrV   rg   float)	r   r   argsrw   r   model_cfg_path
model_pathr   tokenizer_pathr9   r   r   r   	  s6   


zVLDocForDocVLEmbedding.__init__Nc                 K   s   | j di d|d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d||}t|j|jdS )a  
        Args:
            - input_ids: :math:`(B, T, E)`, the input tokens, where B is the batch size,
              T is the max token size, E is the embedding dimension.
            - image: :math:`(B, C, H, W)`, normalized images.
            - bbox: :math:`(B, T, 4)`, segment boxes denoted by top-left and bottom-right
              vertexes whose values are normalized to [0, 1000).
            - bbox_4p_normalized: :math:`(B, T, 8)`, word boxes denoted by 4 vertexes, whose
              values are normalized to [0, 1).
            - attention_mask: :math:`(B, T)`, mask for input tokens, where 0 means masked.
            - first_token_idxes: :math:`(B, S)`, indexes of the corresponding first tokens
              of all segments, where S is the max segment size.
            - first_token_idxes_mask: :math:`(B, S)`, mask for segments, where 0 means masked.
        Optional:
            - line_rank_id: :math:`(B, T)`, orders of segments.
            - line_rank_inner_id: :math:`(B, T)`, BIE-like tags.

        To be more specific, please refer to the class `TextLayoutSerializer` in
          `modelscope/models/multi_modal/vldoc/processing.py`.
        rn   ro   rL   rp   rM   rq   rr   rN   rO   rP   rQ   rs   rt   ru   rv   rR   rS   rT   )img_embeddingtext_embeddingNr   )r   dictr   r   )r   rn   ro   rL   rp   rM   rq   rr   rN   rO   rP   rQ   rs   rt   ru   rv   rR   rS   rT   rw   vldoc_outputsr   r   r   r   *  sT   
)	
zVLDocForDocVLEmbedding.forwardr   )r   r   r   __doc__r   r   r   r   r   r   r9   r   r      s,    "robertac                    s  d u rt j|ddg }g }t }|dkrFtt|D ]&}|| }	d }
|	dr7|	dd}
t	|
}	|
rE|
||  |
|
 qt||D ]\}}
||
< qKg g g  tdd  d urn_d fdd		d}t| d
stdd  D rd}| |d tdkrtd| jj tdkrtd| jj t dkrtd| jjd | S )Nr<   r=   r   zroberta.zgeo_vl_doc_model.text_encoder.	_metadatar@   c              	      sh   d u ri n	 |d d i }| ||d  | j D ]\}}|d ur1||| d  q d S )NrW   T.)r   _load_from_state_dict_modulesrC   )moduleprefixlocal_metadatanamechild
error_msgsrB   metadatamissing_keysrF   unexpected_keysr   r   rB     s   z$init_pretrained_weight.<locals>.loadgeo_vl_doc_modelc                 s   s    | ]}| d V  qdS )r?   N)
startswith).0sr   r   r   	<genexpr>  s    

z)init_pretrained_weight.<locals>.<genexpr>r?   )r   r   z7Weights of {} not initialized from pretrained model: {}z0Weights from pretrained model not used in {}: {}z*Error(s) in loading state_dict for {}:
	{}z
	)r@   )rA   rB   listkeysrangelenr   rD   copydeepcopyappendzippopr(   r   hasattranyr   r   r   r:   r   RuntimeErrorr   )modelpretrained_model_pathrF   	cache_dirinit_backboneold_keysnew_keysstate_dict_keysikeynew_keyold_keystart_prefixr   r   r   init_pretrained_weightn  sf   


	r   )NNr   )-r   loggingmathr   resysjsonrA   torch.distributeddistributeddisttorch.nnr,   torchvision.opsr   modelscope.metainfor   modelscope.modelsr   modelscope.models.builderr   2modelscope.models.multi_modal.vldoc.conv_fpn_transr   r   r   r   5modelscope.models.multi_modal.vldoc.transformer_localr	   r
   modelscope.utils.constantr   r   r   modelscope.utils.loggerr   r   __all__objectr   r   register_moduledocument_vl_embeddingvldocr   r   r   r   r   r   <module>   s<    Jq