o
    ei:                     @   s  d Z ddlZddlmZ ddlmZmZ ddlZddl	m
Z
 ddlmZmZmZ ddlmZmZmZmZ dd	lmZmZmZ dd
lmZmZ erNddlmZ eeZdefddZ dd Z!dd Z"dd Z#dd Z$G dd deddZ%eG dd deZ&dgZ'dS )z
Processor class for Idefics3.
    N)
accumulate)TYPE_CHECKINGUnion   )BatchFeature)
ImageInputis_valid_image
load_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)
AddedTokenBatchEncoding	TextInput)auto_docstringlogging)PreTokenizedInputreturnc                 C   s   t | to	| dS )Nhttp)
isinstancestr
startswith)val r   n/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/idefics3/processing_idefics3.pyis_url%   s   r   c                 C   s   t | pt| S N)r   r   )elemr   r   r   is_image_or_image_url)   s   r   c           	   	   C   s~   d}t |D ]$}t |D ]}|| d|d  d|d  d | |   7 }q|d7 }q|d| |  | |   |  7 }|S )zKPrompt with expanded image tokens for when the image is split into patches. <row_   _col_>
)range)	image_seq_len
image_rows
image_colsfake_token_around_imageimage_tokenglobal_img_tokentext_split_imagesn_hn_wr   r   r   _prompt_split_image-   s"   (
r0   c                 C   s   | |  | |   |  S )z5Prompt with expanded image tokens for a single image.r   )r'   r*   r+   r,   r   r   r   _prompt_single_image@   s   r1   c                 C   s2   | dkr|dkrt ||||dS t|| ||||S )Nr   )r*   r+   r,   )r1   r0   )r(   r)   r'   r*   r+   r,   r   r   r   get_image_prompt_stringJ   s   r2   c                   @   s$   e Zd ZdddddddidZdS )Idefics3ProcessorKwargsTF)add_special_tokenspaddingis_split_into_wordsreturn_mm_token_type_idsreturn_row_col_info)text_kwargsimages_kwargsN)__name__
__module____qualname__	_defaultsr   r   r   r   r3   Y   s    
r3   F)totalc                       s   e Zd Z	ddededB f fddZdd Ze			dd	ee	e B e	e	e  B d
e
ede	e e	d f dedB dee def
ddZdddZ  ZS )Idefics3ProcessorN   r'   chat_templatec                    s   t ddddj| _t ddddj| _t ddddj| _d| _|| _ | j| _ | j| _	 | j| _
 fdd	td
D | _td| _d| j| j| jgi} |  | j| _t j| fd|i| dS )a  
        image_seq_len (`int`, *optional*, defaults to 169):
            The length of the image sequence i.e. the number of <image> tokens per image in the input.
            This parameter is used to build the string from the input prompt and image tokens and should match the
            value the model used. It is computed as: image_seq_len = int(((image_size // patch_size) ** 2) / (scale_factor**2))
        z<fake_token_around_image>FT)
normalizedspecialz<image>z<end_of_utterance>z<global-img>c              
      s:   g | ]}t d D ]} d|d  d|d  dqqS )   r!   r"   r#   r$   )r&   convert_tokens_to_ids).0ij	tokenizerr   r   
<listcomp>z   s
     z.Idefics3Processor.__init__.<locals>.<listcomp>rE   z*(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+additional_special_tokensrB   N)r   contentfake_image_tokenr+   end_of_utterance_tokenglobal_image_tagr'   rF   image_token_idfake_image_token_idglobal_image_token_idr&   row_col_idsrecompile%_regex_to_remove_extra_special_tokensr4   super__init__)selfimage_processorrK   r'   rB   kwargstokens_to_add	__class__rJ   r   rZ   i   s*   	

zIdefics3Processor.__init__c                 C   sT   g }|D ]#}g }|D ]}t |r|| q
t|r!|t| q
|| q|S r   )r   appendr   r	   )r[   promptsprompt_imagespromptimagesr   r   r   r   _extract_images_from_prompts   s   z.Idefics3Processor._extract_images_from_promptsre   textr   r]   r   c           '   
      sn  |du rdu rt djtfdjji|}|dur|nj}|d dd}|d dd}g }g }	i }
|dur^t|trE|g}nt|t	sUt|d tsUt d	fd
d|D }durt
rlggnhtt	tfrt
d r|durt|tkrt dj dt| dj dt d	dgt	t|   fddtt|D n gntt	tfstd t	tfst
d d st ddd D }	dd D jfi |d }|
| |dur|	|krt d| d|	 d|
ddd |D }|
ddd |D }j}j}j}g }g }t|||D ]k\}}}g }g }t||D ])\}}t||||||d}jd | d }|jd  ||   || q>|| ||}t|dkr}t d!|d }t|D ]\}}||||d   7 }q|| q0j|fi |d }j||d"gd# |
| n)|durt|rt d$t| dj d%jd*d&|i|d }|
| |r1t|
d' }t|} t|D ]8\}}!t|| j kd }"d}#|!D ]"}$|#t|"kr n|"|# }%|%|$ }&d| ||%|&f< t!|"|&}#qq| " |
d(< t#|
|d)S )+a  
        image_seq_len (`int`, *optional*):
            The length of the image sequence. If not provided, the default value of self.image_seq_len is used.
            image_seq_len should be equal to int(((image_size // patch_size) ** 2) / (scale_factor**2))
        Nz+You must provide either `text` or `images`.tokenizer_init_kwargsr9   r7   Freturn_tensorsr   zAInvalid input text. Please provide a string, or a list of stringsc                    s   g | ]}|  jqS r   )countr+   rG   sample)r[   r   r   rL      s    z.Idefics3Processor.__call__.<locals>.<listcomp>zThe total number of zP tokens in the prompts should be the same as the number of images passed. Found  z tokens and z images.c                    s$   g | ]} |  |d    qS )r"   r   )rG   rH   )cumsum_images_in_textre   r   r   rL      s    zdInvalid input images. Please provide a single image or a list of images or a list of list of images.c                 S   s   g | ]}t |qS r   )lenrk   r   r   r   rL      s    c                 S   s   g | ]	}d d |D qS )c                 S   s    g | ]}t |rt|n|qS r   )r   r	   )rG   imr   r   r   rL      s     z9Idefics3Processor.__call__.<locals>.<listcomp>.<listcomp>r   rk   r   r   r   rL      s    r:   z!The number of images in the text z and images z should be the same.rowsc                 S      g | ]}d g| qS r   r   rG   n_imagesr   r   r   rL          colsc                 S   rr   rs   r   rt   r   r   r   rL      rv   )r+   r*   r,      r"   r   z.The image token should be present in the text.image)
modalitieszFound z. tokens in the text but no images were passed.rg   	input_idsmm_token_type_ids)datatensor_typer   )$
ValueError_merge_kwargsr3   rK   init_kwargsr'   popr   r   listr   tuplesumro   r+   r   r&   r\   updaterO   rQ   zipr2   ra   split	enumerate_check_special_mm_tokensanynparray
zeros_likewhererS   searchsortedtolistr   )'r[   re   rg   r'   r]   output_kwargsr7   ri   n_images_in_textn_images_in_imagesinputsimage_inputsr(   r)   rO   r+   r,   prompt_stringsbatch_image_seq_lengthsrl   sample_rowssample_colsimage_prompt_stringsimage_seq_lengthsn_rowsn_colsimage_prompt_string
row_lengthsplit_samplerH   text_inputs	array_idsr|   seq_lengthsimage_start_positionsrI   seq_lenstartendr   )rn   re   r[   r   __call__   s   







	






zIdefics3Processor.__call__c                    s   i }|durNt jdi   |  fdd|D }jd }jd }g }g }|D ]\}	}
}|| d }||||
   ||	 q,|||d td	i |S )
a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        Nr:   c                    s"   g | ]}j jg | R  qS r   )r\   get_number_of_image_patches)rG   
image_sizer:   r[   r   r   rL   =  s    z@Idefics3Processor._get_num_multimodal_tokens.<locals>.<listcomp>r   rx   r"   )num_image_tokensnum_image_patchesr   )r3   r>   getr   r'   ra   r
   )r[   image_sizesr]   vision_datanum_image_row_colsbase_image_length
col_lengthr   r   num_patchesnum_rowsnum_colsr   r   r   r   _get_num_multimodal_tokens+  s"   


z,Idefics3Processor._get_num_multimodal_tokens)NrA   N)NNNr   )r;   r<   r=   intr   rZ   rf   r   r   r   r   r   r   r3   r   r   r   __classcell__r   r   r_   r   r@   g   s2    % r@   )(__doc__rV   	itertoolsr   typingr   r   numpyr   feature_extraction_utilsr   image_utilsr   r   r	   processing_utilsr
   r   r   r   tokenization_utils_baser   r   r   utilsr   r   r   
get_loggerr;   loggerboolr   r   r0   r1   r2   r3   r@   __all__r   r   r   r   <module>   s.   

 
j