o
    ॵiw                     @   st  d dl mZ d dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
l m!Z! d dl"m#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8T ddl9m:Z: ddl;m<Z< g dZ=e7j>e/j?e!j@dG dd de5ZAe7j>e/j?e!jBdG dd de5ZCdd ZDe7j>e/j?e!jEdG dd de5ZFe7j>e/j?e!jGdG d d! d!e5ZHe7j>e/j?e!jIdG d"d# d#e5ZJe7j>e/j?e!jKdG d$d% d%e5ZLe7j>e/j?e!jMdG d&d' d'e5ZNe7j>e/j?e!jOdG d(d) d)e5ZPdS )*    N)BytesIO)AnyDictListTupleUnion)Image)create_transform
transforms)ImageFolder)Compose	NormalizeResizeToTensor)snapshot_download)Preprocessors)Input)VCenterCropVCompose
VNormalizeVRescale	VToTensor)
load_image)Config)FieldsInvokeModeKeys	ModelFileTasks   )Preprocessor)PREPROCESSORS)*)
collate_fn)OFA_TASK_KEY_MAPPING)$DiffusionImageGenerationPreprocessorOfaPreprocessorMPlugPreprocessorHiTeAPreprocessorMplugOwlPreprocessor)module_namec                       s6   e Zd ZdZ fddZdeeef fddZ  Z	S )r&   a    Preprocessor the data with the combination of image and text.
        Args:
            data: process the value as an image for keys ending with 'FILE'
                or existing in preprocessor_image_keys and pass-through the values of other keys.

    c              	      s   t  j|i | |dd| _|ddg| _|ddg| _t|dg | _|dd| _t	
t	j| jt	jjd	| jrDt	| jnt	| jt	 t	| j| jg| _d S )
N
resolutioni   meang      ?std
image_keyscenter_cropTinterpolation)super__init__poppreprocessor_resolutionpreprocessor_meanpreprocessor_stdsetpreprocessor_image_keysr0   r   r   r   InterpolationModeBILINEAR
CenterCrop
RandomCropr   r   transform_input)selfargskwargs	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/multi_modal.pyr4   2   s*   z-DiffusionImageGenerationPreprocessor.__init__returnc                 C   sj   i }|  D ],\}}|ds|| jv r(t|}| |}|||dd < q|r,|nd|| < q|S )Nz:FILE )itemsendswithr:   r   r?   replacelower)r@   dataresultskeyvalueimageimgrE   rE   rF   __call__F   s   
z-DiffusionImageGenerationPreprocessor.__call__)
__name__
__module____qualname____doc__r4   r   strr   rS   __classcell__rE   rE   rC   rF   r&   '   s    r&   c                       s   e Zd Zejfdef fddZdeee	e f de
eef fddZdd	 Zdeeee
eef f de
eef fd
dZ  ZS )r'   	model_dirc                    s   t  j|i | tjttjttjttj	t
tjttjttjttjttjttjttjttjti}t|r6|n	t|tjtj id}t!"t#|t$j%| _&|| j&j' | j&||d| _(t)| j&j' | _*| j(j+| _+|,ddrod| _-dS d| _-dS )preprocess the data

        Args:
            model_dir (str): model path
            mode: preprocessor mode (model mode)
        
user_agent)cfgrZ   mode
no_collateNTF).r3   r4   r   ocr_recognitionOfaOcrRecognitionPreprocessorimage_captioningOfaImageCaptioningPreprocessorvisual_groundingOfaVisualGroundingPreprocessorvisual_question_answering&OfaVisualQuestionAnsweringPreprocessorvisual_entailmentOfaVisualEntailmentPreprocessorimage_classification"OfaImageClassificationPreprocessortext_classification!OfaTextClassificationPreprocessortext_summarizationOfaSummarizationPreprocessortext_to_image_synthesis#OfaTextToImageSynthesisPreprocessorauto_speech_recognitionOfaASRPreprocessorsudokuOfaSudokuPreprocessortext2sqlOfaTextToSqlPreprocessorospexistsr   r   KEYPREPROCESSORr   	from_filejoinr   CONFIGURATIONr^   task
preprocessr%   keys	tokenizergetr`   )r@   rZ   r_   rA   rB   preprocess_mappingrC   rE   rF   r4   V   s:   



zOfaPreprocessor.__init__inputrG   c                 C   sB   t  }t|tst|ts|f}t| j|D ]\}}|||< q|S N)dict
isinstancetuplelistzipr   )r@   r   rM   rO   itemrE   rE   rF   _build_dict   s   
zOfaPreprocessor._build_dictc                 C   sz   d|v r;| j jdd dkr;t|d trt|d }n|d }|jdkr*|d}t }|j	|dd t
||d< |S )NrQ   typeofaRGBJPEG)format)r^   modelr   r   rX   r   r_   convertr   saver   open)r@   rM   rQ   
img_bufferrE   rE   rF   #_ofa_input_compatibility_conversion   s   

z3OfaPreprocessor._ofa_input_compatibility_conversionc           	      O   sr   t |tr|}n| |}| |}t }| D ]
\}}t|||< q||d< | jr-|S t|g| jj	| jj
dS )Nsample)pad_idxeos_idx)r   r   r   r   rI   rX   r`   r$   r   pad_token_ideos_token_id)	r@   r   rA   rB   rM   r   str_datakvrE   rE   rF   rS      s   


zOfaPreprocessor.__call__)rT   rU   rV   r   	INFERENCErX   r4   r   r   r   r   r   r   r   r   rS   rY   rE   rE   rC   rF   r'   R   s    &)
r'   c                 C   s
   |  dS )Nr   )r   )rQ   rE   rE   rF   _convert_to_rgb   s   
r   c                       s   e Zd Zejfdef fddZdd Z	ddeee	e f de
d	ejfd
dZdefddZdefddZdeeeeeef f d	eeef fddZ  ZS )CLIPPreprocessorrZ   c                    s   t  j|i | t|r|n	t|tjtjid}|| _ddl	m
} d|v r4t|d |r4|d | _n| dtj }||d| _d|v rSt|d trS|d | _nttd|d	d
d | _|  | _ddd| _dS )r[   r\   r   )FullTokenizerr   /)
vocab_filer,   z{}/vision_model_config.jsonutf-8encodingimage_resolutionrR   text)rR   r   N)r3   r4   ry   rz   r   r   r{   r|   r_   1modelscope.models.multi_modal.clip.bert_tokenizerr   r   r   r   
VOCAB_FILEintr   jsonloadr   r   _build_image_transformimg_preprocess
input_keys)r@   rZ   r_   rA   rB   r   r   rC   rE   rF   r4      s0   
zCLIPPreprocessor.__init__c              
   C   s~   | j tjkr(t| jddd ddddd}t|jd d tg |jdd   }|S tt| j| jft	j
d	tt tddg}|S )
N)g?g      ?Toriginalbicubicg3<4'?gwgM?gy{ ?gB91?gwt.?g	U?)
input_sizescaleis_trainingcolor_jitterauto_augmentr2   r-   r.   r1   )r_   r   TRAINr	   r   r   r   r   r   r   BICUBICr   r   )r@   	transformrE   rE   rF   r      s4   
z'CLIPPreprocessor._build_image_transform4   textscontext_lengthrG   c              	   C   s   t |tr|g}g }|D ]#}|| jjd g| j| j|d|d   | jjd g  qtjt	||tj
d}t|D ]\}}t	||ksKJ t|||dt	|f< q?|S )a  
        Returns the tokenized representation of given input string(s)
        Parameters
        ----------
        texts : Union[str, List[str]]
            An input string or a list of input strings to tokenize
        context_length : int
            The context length to use; all baseline models use 24 as the context length
        Returns
        -------
        A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
        z[CLS]N   z[SEP])dtype)r   rX   appendr   vocabconvert_tokens_to_idstokenizetorchzeroslenlong	enumeratetensor)r@   r   r   
all_tokensr   resultitokensrE   rE   rF   r      s(   


zCLIPPreprocessor.tokenizenew_keyc                 C      || j d< d S )NrR   r   r@   r   rE   rE   rF   set_input_img_key     z"CLIPPreprocessor.set_input_img_keyc                 C   r   )Nr   r   r   rE   rE   rF   set_input_text_key  r   z#CLIPPreprocessor.set_input_text_keyr   c                    s^  i } j d }||v r^|| d ur^|| }t|tjr$ |d}n6t|trQtdd |D rAtj fdd|D dd}ndd |D d }t	d| t	d	t
| ||d<  j d
 }	|	|v r||	 d ur||	 }
t|
tr| |
}n-t|
trtdd |
D r |
}ndd |
D d }t	d| t	dt
|
 ||d
< |S )NrR   r   c                 S   s   g | ]}t |tjqS rE   )r   r   .0elemrE   rE   rF   
<listcomp>"  s    z-CLIPPreprocessor.__call__.<locals>.<listcomp>c                    s   g | ]}  |qS rE   )r   r   r@   rE   rF   r   %  s    dimc                 S   s    g | ]}t |tjst|qS rE   )r   r   r   r   rE   rE   rF   r   )  s    
zfimg should be PIL.Image or List[PIL.Image],                             but got a List containing one z4img should be PIL.Image or List[PIL.Image], but got r   c                 S   s   g | ]}t |tqS rE   )r   rX   r   rE   rE   rF   r   B  s    c                 S   s   g | ]}t |tst|qS rE   )r   rX   r   r   rE   rE   rF   r   E  s    z?text should be str or List[str], but got a List containing one z)text should be str or List[str], but got )r   r   r   r   	unsqueezer   allr   stack	TypeErrorr   rX   r   )r@   r   rA   rB   outputinput_img_keyimage_inputimage_tensorunsupported_elem_typeinput_text_key
text_inputtext_tensorrE   r   rF   rS     sf   





zCLIPPreprocessor.__call__)r   )rT   rU   rV   r   r   rX   r4   r   r   r   r   r   
LongTensorr   r   r   r   r   r   rS   rY   rE   rE   rC   rF   r      s&    $
"
r   c                       s   e Zd Zejdfdededef fddZedd Z	ed	d
 Z
dedeejef fddZdeejeeeef f deeef fddZ  ZS )r(      rZ   r_   tokenizer_max_lengthc                    s:   t  j|i | || _|| _|| _d | _d | _i | _d S r   )r3   r4   rZ   r_   r   
_tokenizer_patch_resize_transform
_image_mapr@   rZ   r_   r   rA   rB   rC   rE   rF   r4   Z  s   
zMPlugPreprocessor.__init__c                 C   *   ddl m} | jd u r|| j| _| jS Nr   )BertTokenizertransformersr   r   from_pretrainedrZ   r@   r   rE   rE   rF   r   i     
zMPlugPreprocessor.tokenizerc                 C   |   | j d u r;ddlm} ddlm}m} |t| j	|}d}d}|
|j|j|jftjd| |j||dg| _ | j S )Nr   r
   )CONFIG_NAMEMPlugConfigr   r   r1   r-   r.   )r   torchvisionr   #modelscope.models.multi_modal.mplugr   r   from_yaml_filery   r~   rZ   r   r   	image_resr   r   r   r   )r@   r   r   r   configr-   r.   rE   rE   rF   patch_resize_transformq      
z(MPlugPreprocessor.patch_resize_transformpathrG   c                 C   0   || j vrt| j }t||f| j |< | j | S r   r   r   r   r@   r	  indexrE   rE   rF   
image_open     


zMPlugPreprocessor.image_openrM   c                 C   sN  t t| jtj| _t|t	j	t
fr|}nt|tr!|d }n|d }d}t|t
r3| |\}}|d}| |}| jjtjkrFdn|t|trNdnd|v rTdnd }| j| dd	| jd
d}| jtjkrwtj|gdd}||dS |d }| j|dd	| jd
d}||j |j |j |j d}| jjtjkr||d< |S )Nr   rQ   r   rH   r    r   question
max_lengthTptpadding
truncationr  return_tensorsr   )rQ   r  answer)rQ   question_input_idsquestion_attention_maskanswer_input_idsanswer_attention_maskr  )r   r}   ry   r~   rZ   r   r   r^   r   r   rX   r   r  r   r  r   r   rc   r   rL   r   r_   r   r   r   r   	input_idssqueezeattention_maskimage_text_retrieval)r@   rM   rQ   r  r  r  r   rE   rE   rF   rS     sZ   





zMPlugPreprocessor.__call__)rT   rU   rV   r   r   rX   r   r4   propertyr   r  r   r   r  r   r   r   r   rS   rY   rE   rE   rC   rF   r(   V  s,    



r(   c                       sN   e Zd Zejfdedef fddZdeeef deeef fddZ	  Z
S )	VLDocPreprocessorrZ   r_   c                    s   t  j|i | || _|| _t|d}t|ddd}t|}W d   n1 s,w   Y  ddl	m
} t|tj}	||	| _ddlm}
m} |d	d	|d
 d |d
 d dd	dd| _|
|d |d | j| j|d
 d |d
 d d| _dS )zPreprocess data for the model `VLDocForDocVLEmbedding`.

        Args:
            model_dir (str): model path in model hub.
            mode (str): model mode, in ('train', 'eval', 'inference').
        zconfig.jsonrr   r   Nr   )VLDocXLMTokenizer)	ProcessorImageProcessorT
image_sizer    )heightwidthF)do_preprocess	do_resizer&  do_normalize	apply_ocrmax_seq_lengthmax_block_num)r-  r.  img_processorr   r(  r'  )r3   r4   rZ   r_   ry   r~   r   r   r   0modelscope.models.multi_modal.vldoc.tokenizationr#  r   TOKENIZER_FOLDERr   r   .modelscope.models.multi_modal.vldoc.processingr$  r%  img_procproc)r@   rZ   r_   rA   rB   model_cfg_pathf	model_cfgr#  tokenizer_pathr$  r%  rC   rE   rF   r4     s8   

	

zVLDocPreprocessor.__init__r   rG   c           
   	   O   sz   g }|d D ]%}t |d}t|}|d }|| W d   n1 s&w   Y  q|d |d}| jdi |}	|	S )z
        Args:
            input: {
                'images': ['img_path1', 'img_path2', ...],
                'ocr_info_paths': ['json_path1', 'json_path2', ...]
            }
        Return:
            encodings: Dict[str, Tensor]
        ocr_info_pathsr"  formNimages)r;  	ocr_infosrE   )r   r   r   r   r4  )
r@   r   rA   rB   r<  one_ocr_info_pathr6  ocr_info
proc_input	encodingsrE   rE   rF   rS     s   
zVLDocPreprocessor.__call__)rT   rU   rV   r   r   rX   r4   r   r   rS   rY   rE   rE   rC   rF   r!    s    +
r!  c                       s   e Zd Zejdfdededef fddZedd Z	ed	d
 Z
edd Zdedeejef fddZdededee fddZdeejeeeef f deeef fddZ  ZS )r)   r   rZ   r_   r   c                    s@   t  j|i | || _|| _|| _d | _d | _d | _i | _d S r   )	r3   r4   rZ   r_   r   r   r   _num_frames
_video_mapr   rC   rE   rF   r4   	  s   
zHiTeAPreprocessor.__init__c                 C   r   r   r   r   rE   rE   rF   r     r   zHiTeAPreprocessor.tokenizerc                 C   r   )Nr   r
   r   HiTeAConfigr   r   r1   r  )r   r  r   r  r   rD  r  ry   r~   rZ   r   r   r  r   r   r   r   )r@   r   r   rD  r  r-   r.   rE   rE   rF   r  !  r  z(HiTeAPreprocessor.patch_resize_transformc                 C   sH   | j d u r!ddlm} ddlm}m} |t| j	|}|j
| _ | j S )Nr   r
   rC  )rA  r  r   r  r   rD  r  ry   r~   rZ   
num_frames)r@   r   r   rD  r  rE   rE   rF   rE  5  s   
zHiTeAPreprocessor.num_framesr	  rG   c                 C   s@   || j vrt| j }tj|tdd}||f| j |< | j | S )Nr   )ctx)rB  r   decordVideoReadercpu)r@   r	  r  vrrE   rE   rF   
video_openA  s
   


zHiTeAPreprocessor.video_openrE  vlenc           
      C   s   t ||}tjd||d dt}g }t|d d D ]\}}||||d  d f qdd |D }t||k rL|d g| }	||	d t|< |	}|S )Nr   r    )startstopnumc                 S   s    g | ]}|d  |d  d qS )r   r    r   rE   r   xrE   rE   rF   r   Q  s     z3HiTeAPreprocessor.sample_frames.<locals>.<listcomp>)minnplinspaceastyper   r   r   r   )
r@   rE  rL  acc_samples	intervalsrangesidxintervframe_indicespadded_frame_indicesrE   rE   rF   sample_framesH  s   

zHiTeAPreprocessor.sample_framesrM   c                    sx  t t jtj _t|t	j
tfr|}nt|tr!|d }n|d }d}t|tr3 |\}}  jt|}|d t|| } fdd| D }tj|dd} jjtjkrfdn|t|trndnd|v rtdnd	 } j| d
d jdd} jtj krtj|gdd}||dS |d } j|d
d jdd}||j!" |j#" |j!" |j#" d}|S )Nr   videoc                    s   g | ]
}  t|qS rE   )r  r   	fromarray)r   r6  r   rE   rF   r   k  s    z.HiTeAPreprocessor.__call__.<locals>.<listcomp>r   rH   r    r   r  r  Tr  r  )r_  r  r  )r_  r  r  r  r  )$r   r}   ry   r~   rZ   r   r   r^   r   rG  rH  rX   r   rK  r^  rE  r   seekr   
from_numpy	get_batchasnumpynumpyr   r   r   video_captioningr   rL   r   r_   r   r   r  r  r  )r@   rM   r_  r  r\  r  r  r   rE   r   rF   rS   Y  s`   





zHiTeAPreprocessor.__call__)rT   rU   rV   r   r   rX   r   r4   r   r   r  rE  r   rG  rH  rK  r   r^  r   r   r   r   rS   rY   rE   rE   rC   rF   r)     s2    




r)   c                       s   e Zd Zejfdedef fddZedd Zedd Z	d	ed
e
ejef fddZded
ee fddZdeeee f d
efddZdeeef d
eeef fddZ  ZS )r*   rZ   r_   c                    s>   t  j|i | || _|| _d | _d | _ddi| _i | _d S )N	<|image|>A   )r3   r4   rZ   r_   r   r   media_tokenr   )r@   rZ   r_   rA   rB   rC   rE   rF   r4     s   

zMplugOwlPreprocessor.__init__c                 C   r   )Nr   )LlamaTokenizer)modelscope.models.nlp.llamarj  r   r   rZ   )r@   rj  rE   rE   rF   r     r   zMplugOwlPreprocessor.tokenizerc                 C   sP   | j d u r%ddlm} d}d}||jdtjd| |j||dg| _ | j S )Nr   r
   r   r   )   rl  r1   r  )	r   r  r   r   r   r   r   r   r   )r@   r   r-   r.   rE   rE   rF   r    s   
z+MplugOwlPreprocessor.patch_resize_transformr	  rG   c                 C   r
  r   r  r  rE   rE   rF   r    r  zMplugOwlPreprocessor.image_openr   c                    s   dd t | j D }| j }| jjg} fdd| D }t|r2|| j ddd  }|S |}dtt	j
t| }t	d	| d
 }dd |D }t |D ]!\}	}
|
|v rj|||
 g||
  7 }qV| j|
ddd }||7 }qV|S )Nc                 S   s    i | ]\}}|t |d   qS )r    )r   )r   r   r   rE   rE   rF   
<dictcomp>  s    z6MplugOwlPreprocessor.tokenize_text.<locals>.<dictcomp>c                    s   g | ]}| vqS rE   rE   )r   ri  r   rE   rF   r     s    z6MplugOwlPreprocessor.tokenize_text.<locals>.<listcomp>F)add_special_tokensr  |()c                 S   s   g | ]
}t |d kr|qS )r   )r   rQ  rE   rE   rF   r     s    )r   ri  r   copyr   bos_token_idr   r~   mapreescaper   split)r@   r   media_tokensmedia_lengthsprompt_chunk	condition	enc_chunkpattern
chunk_strsrZ  	chunk_str	tmp_chunkrE   rn  rF   tokenize_text  s:   




z"MplugOwlPreprocessor.tokenize_textmessagesc                 C   s   g }g }|d }|D ]O}|d dkrd}n|d dkrd}nd}t |d tr5| |d  }|| q
|d D ]}t |trG| | }n| d	}||d
  || q9q
d|}|d7 }||fS )Nr  rolesystemrH   userzHuman: zAI: contentrg  rQ   
z
AI: )r   rX   r   r~   )r@   r  r   rQ   turnr  r   trE   rE   rF   r     s,   


zMplugOwlPreprocessor.convertc           	      K   s   i }|  |\}}t|dkr+g }|D ]}|| | |d  tj|dd}qnd}| |}t|g}||d|}|S )a  
        Args:
            messages: {[
                {'role': 'system', 'content': 'message1'},
                {'role': 'user', 'content': 'message2'},
                {'role': 'user', 'content': ['message2', {"image": 'image_path'}, 'message3', ...]},
            ]}
            The 'role' should be choose from ['system', 'user', 'assistant'].
            The 'content' can be either str or List[Union[str, Dict]]
        Return:
            output: Dict[str, Tensor]
        r   r   N)pixel_valuesr  )	r   r   r   r  r  r   r   r  r   )	r@   r  forward_paramsr   r;  r   r  rQ   r  rE   rE   rF   rS     s&   
zMplugOwlPreprocessor.__call__)rT   rU   rV   r   r   rX   r4   r   r   r  r   r   r   r  r   r  r   r   r   rS   rY   rE   rE   rC   rF   r*     s"    

!
r*   c                       s2   e Zd Z fddZdeeef fddZ  ZS )+ImageCaptioningClipInterrogatorPreprocessorc                    s   t  jdi | d S )NrE   )r3   r4   )r@   rB   rC   rE   rF   r4   $  s   z4ImageCaptioningClipInterrogatorPreprocessor.__init__rG   c                 C   s    t |}t|ddd}|S )Nr   r   r    )r   rT  array	transpose)r@   rM   rQ   rE   rE   rF   rS   '  s   z4ImageCaptioningClipInterrogatorPreprocessor.__call__)	rT   rU   rV   r4   r   rX   r   rS   rY   rE   rE   rC   rF   r    s    r  )Qos.pathr	  ry   rv  ior   typingr   r   r   r   r   rG  r   re  rT  r   PILr   	timm.datar	   r  r   torchvision.datasetsr   torchvision.transformsr   r   r   r    modelscope.hub.snapshot_downloadr   modelscope.metainfor   modelscope.pipelines.baser   7modelscope.pipelines.cv.cmdssl_video_embedding_pipeliner   r   r   r   r   modelscope.preprocessorsr   modelscope.utils.configr   modelscope.utils.constantr   r   r   r   r   baser!   builderr"   r   ofa.utils.collater$   ofa.utils.constantr%   __all__register_modulemulti_modal'diffusion_image_generation_preprocessorr&   ofa_tasks_preprocessorr'   r   clip_preprocessorr   mplug_tasks_preprocessorr(   vldoc_preprocessorr!  hitea_tasks_preprocessorr)   mplug_owl_preprocessorr*   /image_captioning_clip_interrogator_preprocessorr  rE   rE   rE   rF   <module>   s   (S *eF 	 