o
    ॵi$!                     @   s|   d dl mZmZ d dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ ddlmZ ddlm	Z G d	d
 d
eZdS )    )AnyDictN)Image)
transforms)
load_image)ModeKeys   )OfaBasePreprocessorc                       s   e Zd ZdZejf fdd	Zdeee	f deee	f fddZ
deee	f deee	f fdd	Zdeee	f deee	f fd
dZ  ZS )OfaVisualGroundingPreprocessorz6
    OFA preprocessor for visual grounding tasks.
    c              	      s   t t| j|||g|R i | | jjdd| _| jtj	kr>t
t
j| jg| jdt
 t
j| j| j| jdg| _d	S tdd tj| j| jftjjdt tj| j| jdg| _d	S )
zpreprocess the data

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path,
            mode: preprocessor mode (model mode)
        num_binsi  )max_size)meanstdmax_image_sizec                 S   s
   |  dS )NRGB)convert)image r   a/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/ofa/visual_grounding.py<lambda>3   s   
 z9OfaVisualGroundingPreprocessor.__init__.<locals>.<lambda>)interpolation)r   r   N)superr
   __init__cfgmodelgetr   moder   TRAINTComposeRandomResizepatch_image_sizeToTensor	Normalizer   r   r   positioning_transformr   ResizeInterpolationModeBICUBICpatch_resize_transform)selfr   	model_dirr   argskwargs	__class__r   r   r      s>   

z'OfaVisualGroundingPreprocessor.__init__datareturnc                 C   s    | j tjkr| |S | |S )N)r   r   r   _build_train_sample_build_infer_sample)r)   r/   r   r   r   __call__;   s   

z'OfaVisualGroundingPreprocessor.__call__c              	   C   sF  |  || jd  }|j\}}g g g t||gd}|| jd   d\}}}}	tt|t|t|t|	g}
tt|t|t|t|	gg|d< t	dg|d< t|t| t|	t|  g}t||d< | 
||\}}|d	 d |d	 d
 }}dt|d d d | jd
   }dt|d d d
 | jd
   }dt|d d d | jd
   }dt|d d d | jd
   }d||||}| || jd  | j}| jjdd}||}| |}| j|dd}t| j|dd g}||tdg|||| || |
d}|S )aI  
        Building training samples.

        step 1. Preprocessing the image input for model's image input.
            - get the pillow image.
            - calculate the target boxes using for getting the exact area
            in the pillow image for input text by input `region_coord`. in
            training setting, `region_coord` will be a label data.
            - getting the target image as patch images and do some transforms
            such as resize, normalize etc.
        step 2. Preprocessing the text input for model's source text input.
            - do the str preprocessing to text input by function `pre_caption`.
            - build the instruction. the default instruction is
            ` which region does the text " {} " describe?`, `{}` refer to the
            text input.
            - tokenize the instruction as source text input.
        step 3. Preprocessing the patch image boxes for model's target text input.
            - quantize the coordinate of selected patch images
            - concatenate the quantization results by blank
            - tokenize the result above as target text input.
        step 4. Get the previous output tokens using target item without eos token.

        Args:
            data (`Dict[str, Any]`): Input data, should contains the key of `image`
                `text` and `region_coord`.
        Return:
            A dict object, contains source text input, patch images, patch masks
            with `Tensor([True])` value, target, previous output tokens,
            width scale ratio, height scale ratio and region coordinate.
        r   )boxeslabelsareasizeregion_coord,r4   r   r5   r6   r7   r   z<bin_{}>      z{} {} {} {}textprompt, which region does the text " {} " describe?F)add_bosNT)sourcepatch_image
patch_masktargetprev_output_tokensw_resize_ratioh_resize_ratior8   )get_img_pil
column_mapr7   torchtensorstripsplitfloatnparrayr$   formatintr   roundpre_captionmax_src_lengthr   r   r   tokenize_textcatbos_item)r)   r/   r   whboxes_targetx0y0x1y1regionr6   rB   patch_boxesresize_hresize_wquant_x0quant_y0quant_x1quant_y1r8   src_captionr=   r<   src_itemtarget_itemprev_output_itemsampler   r   r   r1   A   sx   
""    




z2OfaVisualGroundingPreprocessor._build_train_samplec                 C   s   |  || jd  }|j\}}| |}t| j| }t| j| }| || jd  | j}| j	j
dd}	|	|}
| |
}||tdg||d}d| jv rz| jd |v rz|| jd   d\}}}}t|t|t|t|g|d	< |S )
a  
        Building inference samples.

        step 1. Preprocessing image input for model's image input.
            - get pillow image from data.
            - do some transforms to the pillow image, such as resize, normalize etc.
        step 2. Preprocessing the text input for model's text input.
            - do the str preprocessing to text input by function `pre_caption`.
            - build the instruction. the default instruction is
            ` which region does the text " {} " describe?`, `{}` refer to the
            text input.
            - tokenize the instruction as source text input.
        step 3. Whether or not to add label data which refer to a region coordinate
            in this task.

        Args:
            data (`Dict[str, Any]`): Input data, should contains the key of `image`
                `text`.
        Return:
            A dict object, contains source text input, patch images, patch masks
            with `Tensor([True])` value, width scale ratio, height scale ratio
            and label.
        r   r<   r=   r>   T)rA   rB   rC   rF   rG   r8   r9   label)rH   rI   r7   r(   rJ   rK   r!   rT   rU   r   r   r   rQ   rV   rL   rM   rN   )r)   r/   r   rY   rZ   rB   rF   rG   rh   r=   r<   ri   rl   r\   r]   r^   r_   r   r   r   r2      s>   






 z2OfaVisualGroundingPreprocessor._build_infer_sample)__name__
__module____qualname____doc__r   	INFERENCEr   r   strr   r3   r1   r2   __classcell__r   r   r-   r   r
      s    "'"*Tr
   )typingr   r   numpyrO   rJ   PILr   torchvisionr   modelscope.preprocessors.imager   modelscope.utils.constantr   baser	   utilsr   r
   r   r   r   r   <module>   s   