from typing import Any, Union, overload

from ..generation import GenerationConfig
from ..utils import (
    add_end_docstrings,
    is_tf_available,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)
from .base import Pipeline, build_pipeline_init_args


if is_vision_available():
    from PIL import Image

    from ..image_utils import load_image

if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

logger = logging.get_logger(__name__)


@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
class ImageToTextPipeline(Pipeline):
    """
    Image To Text pipeline using an `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
    >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'generated_text': 'two birds are standing next to each other '}]
    ```
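
    Generation can also be tuned per call. A minimal sketch (same public checkpoint as above; the
    exact caption depends on the model and the chosen settings):

    ```python
    >>> from transformers import pipeline

    >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
    >>> outputs = captioner(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    ...     max_new_tokens=20,  # overrides the pipeline default of 256
    ...     generate_kwargs={"num_beams": 4},  # forwarded verbatim to `generate`
    ... )
    ```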

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image-to-text pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"image-to-text"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
    """

    _pipeline_calls_generate = True
    _load_processor = False
    _load_image_processor = True
    _load_feature_extractor = False
    _load_tokenizer = True

    # Keep the class docstring above in sync if this default ever changes.
    _default_generation_config = GenerationConfig(
        max_new_tokens=256,
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        requires_backends(self, "vision")
        self.check_model_type(
            TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
        )

    def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
        forward_params = {}
        preprocess_params = {}

        if prompt is not None:
            preprocess_params["prompt"] = prompt
        if timeout is not None:
            preprocess_params["timeout"] = timeout

        if max_new_tokens is not None:
            forward_params["max_new_tokens"] = max_new_tokens
        if generate_kwargs is not None:
            if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
                raise ValueError(
                    "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use"
                    " only 1 version"
                )
            forward_params.update(generate_kwargs)

        # Assisted (speculative) decoding: forward the assistant model and its tokenizer to `generate`.
        if self.assistant_model is not None:
            forward_params["assistant_model"] = self.assistant_model
        if self.assistant_tokenizer is not None:
            forward_params["tokenizer"] = self.tokenizer
            forward_params["assistant_tokenizer"] = self.assistant_tokenizer

        return preprocess_params, forward_params, {}

    @overload
    def __call__(self, inputs: Union[str, "Image.Image"], **kwargs: Any) -> list[dict[str, str]]: ...

    @overload
    def __call__(self, inputs: Union[list[str], list["Image.Image"]], **kwargs: Any) -> list[list[dict[str, str]]]: ...

    def __call__(self, inputs: Union[str, list[str], "Image.Image", list["Image.Image"]] = None, **kwargs):
        """
        Generate a caption (or captions) for the image(s) passed as inputs.

        Args:
            inputs (`str`, `list[str]`, `PIL.Image` or `list[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images.

            max_new_tokens (`int`, *optional*):
                The maximum number of tokens to generate. If unset, the pipeline's default generation
                config applies (see the class docstring above).

            generate_kwargs (`Dict`, *optional*):
                A dictionary of keyword arguments forwarded directly to `generate`, allowing full control
                over the generation call.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A list or a list of lists of `dict`: Each result comes as a dictionary with the following key:

            - **generated_text** (`str`) -- The generated text.
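
        Example (a sketch; it reuses the public demo image from the class docstring, and the actual
        captions depend on the checkpoint):

        ```python
        >>> from transformers import pipeline

        >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
        >>> url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"
        >>> results = captioner([url, url], timeout=10.0)  # one list of results per input image
        ```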
        """
        # `images` is the deprecated keyword for `inputs`; accept it for backward compatibility.
        if "images" in kwargs:
            inputs = kwargs.pop("images")
        if inputs is None:
            raise ValueError("Cannot call the image-to-text pipeline without an inputs argument!")
        return super().__call__(inputs, **kwargs)

    def preprocess(self, image, prompt=None, timeout=None):
        image = load_image(image, timeout=timeout)

        if prompt is not None:
            logger.warning_once(
                "Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.48"
                " of 🤗 Transformers. Use the `image-text-to-text` pipeline instead",
            )
            if not isinstance(prompt, str):
                raise ValueError(
                    f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
                    "Note also that one single text can be provided for conditional image to text generation."
                )

            model_type = self.model.config.model_type

            if model_type == "git":
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.dtype)
                input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids
                # GIT expects the prompt to start with the tokenizer's CLS token.
                input_ids = [self.tokenizer.cls_token_id] + input_ids
                input_ids = torch.tensor(input_ids).unsqueeze(0)
                model_inputs.update({"input_ids": input_ids})
            elif model_type == "pix2struct":
                model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.dtype)
            elif model_type != "vision-encoder-decoder":
                # vision-encoder-decoder does not support conditional generation
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.dtype)
                text_inputs = self.tokenizer(prompt, return_tensors=self.framework)
                model_inputs.update(text_inputs)
            else:
                raise ValueError(f"Model type {model_type} does not support conditional text generation")
        else:
            model_inputs = self.image_processor(images=image, return_tensors=self.framework)
            if self.framework == "pt":
                model_inputs = model_inputs.to(self.dtype)

        if self.model.config.model_type == "git" and prompt is None:
            model_inputs["input_ids"] = None

        return model_inputs

    def _forward(self, model_inputs, **generate_kwargs):
        # GIT sets `model_inputs["input_ids"] = None` in `preprocess` when there is no prompt. In batched mode the
        # pipeline collates these into a list of `None`, which would break `generate`; collapse it back to `None`.
        if (
            "input_ids" in model_inputs
            and isinstance(model_inputs["input_ids"], list)
            and all(x is None for x in model_inputs["input_ids"])
        ):
            model_inputs["input_ids"] = None

        # A user-defined `generation_config` passed to the pipeline call takes precedence over the default one.
        if "generation_config" not in generate_kwargs:
            generate_kwargs["generation_config"] = self.generation_config

        inputs = model_inputs.pop(self.model.main_input_name)
        model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs)
        return model_outputs

    def postprocess(self, model_outputs):
        records = []
        for output_ids in model_outputs:
            record = {"generated_text": self.tokenizer.decode(output_ids, skip_special_tokens=True)}
            records.append(record)
        return records