o
    iQ                     @   s   d dl Z d dlZd dlZddlmZ ddlmZmZm	Z	m
Z
 ddlmZmZmZmZmZ e	 r;d dlZddlmZmZ e rJd dlZddlmZmZ G d	d
 d
eZeeddG dd deZdS )    N   )GenerationConfig)add_end_docstringsis_tf_availableis_torch_availablerequires_backends   )ArgumentHandlerDatasetPipelinePipelineExceptionbuild_pipeline_init_args),MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES0MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES)/TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES3TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMESc                   @   s   e Zd ZdZdddZdS )%TableQuestionAnsweringArgumentHandlerzB
    Handles arguments for the TableQuestionAnsweringPipeline
    Nc                 K   s`  t | d dd l}|d u rtd|d u rt|tr,|dd ur,|dd ur,|g}nbt|trmt|dkrmtdd |D sLtdd	d |D  |d dd ura|d dd ura|}n-td
|d 	  dt
d urvt|t
s|t|tjr~|S tdt| d||dg}|D ]}t|d |js|d d u rtd||d |d< q|S )Npandasr   z(Keyword argument `table` cannot be None.querytablec                 s   s    | ]}t |tV  qd S N)
isinstancedict.0d r   c/home/ubuntu/.local/lib/python3.10/site-packages/transformers/pipelines/table_question_answering.py	<genexpr>6   s    zATableQuestionAnsweringArgumentHandler.__call__.<locals>.<genexpr>z:Keyword argument `table` should be a list of dict, but is c                 s   s    | ]}t |V  qd S r   )typer   r   r   r   r   8   s    zIf keyword argument `table` is a list of dictionaries, each dictionary should have a `table` and `query` key, but only dictionary has keys z `table` and `query` keys.zZInvalid input. Keyword argument `table` should be either of type `dict` or `list`, but is ))r   r   zTable cannot be None.)r   r   
ValueErrorr   r   getlistlenallkeysr
   typesGeneratorTyper   	DataFrame)selfr   r   kwargspdtqa_pipeline_inputstqa_pipeline_inputr   r   r   __call__&   sD   
&$
z.TableQuestionAnsweringArgumentHandler.__call__)NN)__name__
__module____qualname____doc__r/   r   r   r   r   r   !   s    r   T)has_tokenizerc                       s   e Zd ZdZdZdZdZdZdZdZ	e
ddZe f fdd	Zd	d
 Zdd Z fddZdddZdddZdddZdd Z  ZS )TableQuestionAnsweringPipelinea  
    Table Question Answering pipeline using a `ModelForTableQuestionAnswering`. This pipeline is only available in
    PyTorch.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> oracle = pipeline(model="google/tapas-base-finetuned-wtq")
    >>> table = {
    ...     "Repository": ["Transformers", "Datasets", "Tokenizers"],
    ...     "Stars": ["36542", "4512", "3934"],
    ...     "Contributors": ["651", "77", "34"],
    ...     "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
    ... }
    >>> oracle(query="How many stars does the transformers repository have?", table=table)
    {'answer': 'AVERAGE > 36542', 'coordinates': [(0, 1)], 'cells': ['36542'], 'aggregator': 'AVERAGE'}
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This tabular question answering pipeline can currently be loaded from [`pipeline`] using the following task
    identifier: `"table-question-answering"`.

    The models that this pipeline can use are models that have been fine-tuned on a tabular question answering task.
    See the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=table-question-answering).
    ztable,queryTF   )max_new_tokensc                    s   t  jdi | || _| jdkrt }|t n	t }|t	 | 
| t| jjdd o8t| jjdd | _t| jjdrFd| _d S d | _d S )Ntfaggregation_labelsnum_aggregation_labelstapasr   )super__init___args_parser	frameworkr   copyupdater   r   r   check_model_typegetattrmodelconfig	aggregatehasattrr   )r*   args_parserr+   mapping	__class__r   r   r=      s   



"z'TableQuestionAnsweringPipeline.__init__c                 K   s   | j di |S )Nr   )rD   )r*   inputsr   r   r   batch_inference   s   z.TableQuestionAnsweringPipeline.batch_inferencec                    sd  | j dkr`g }g }d}|d jd }|d | j}|d | j}|d | j}d}	t|D ]}
|dur|	dddf }t|  }||
 }	t|jd D ]@}|	dddf 	 | }|	dddf 	 | d }|	ddd	f 	 | d }|dkr|dkr|dkrt
|||f ||< qTt|tj| j|	dddf< ||
 }||
 }||
 }	| j|d|d|	dd
}|j}| jr||j || tjj|d}|j|tj|jj }tt t| 	 D ]D\}}|	dddf 	 | }|	dddf 	 | d }|	ddd	f 	 | d }|dkr<|dkr<|dkr< ||f | q fdd D }q1tt|d}| jsV|fS |tt|dfS g }g }d}|d jd }|d }|d }|d  }d}	t|D ]}
|dur|	dddf }tj|tj d}||
 }	t|jd D ]D}|	dddf 	 | }|	dddf 	 | d }|	ddd	f 	 | d }|dkr|dkr|dkrt
|||f ||< q||	dddf< ||
 }||
 }||
 }	| jtj!|ddtj!|ddtj!|	ddd
}|j}| jr!||j || t"j#$t"%|t"jt"%|t"j }tt tt"| 	 D ]E\}}|	dddf 	 | }|	dddf 	 | d }|	ddd	f 	 | d }|dkr|dkr|dkr ||f | qH fdd D }qt"&t|d}| js|fS |t"&t|dfS )z
        Inference used for models that need to process sequences in a sequential fashion, like the SQA models which
        handle conversational query related to a table.
        ptN	input_idsr   attention_masktoken_type_ids   r   r   )rO   rP   rQ   )logitsc                    $   i | ]}|t  |  d kqS g      ?nparraymeanr   keycoords_to_probsr   r   
<dictcomp>      $ zGTableQuestionAnsweringPipeline.sequential_inference.<locals>.<dictcomp>)dtype)axisc                    rT   rU   rV   rZ   r\   r   r   r^     r_   )'r?   shapetodevicerangerW   
zeros_likecpunumpytolistinttorch
from_numpyr   longrD   	unsqueezerS   rF   appendlogits_aggregationdistributions	Bernoulliprobsfloat32collectionsdefaultdictr#   	enumeratesqueezecattupleint32expand_dimsr8   mathsigmoidcastconcat)r*   rL   
all_logitsall_aggregationsprev_answers
batch_sizerO   rP   rQ   token_type_ids_exampleindexprev_labels_examplemodel_labelsi
segment_idcol_idrow_idinput_ids_exampleattention_mask_exampleoutputsrS   dist_per_tokenprobabilitiespcolrowlogits_batchr   r\   r   sequential_inference   s   &

"


"z3TableQuestionAnsweringPipeline.sequential_inferencec                    s<   | j |i |}t j|fi |}t|dkr|d S |S )a  
        Answers queries according to a table. The pipeline accepts several types of inputs which are detailed below:

        - `pipeline(table, query)`
        - `pipeline(table, [query])`
        - `pipeline(table=table, query=query)`
        - `pipeline(table=table, query=[query])`
        - `pipeline({"table": table, "query": query})`
        - `pipeline({"table": table, "query": [query]})`
        - `pipeline([{"table": table, "query": query}, {"table": table, "query": query}])`

        The `table` argument should be a dict or a DataFrame built from that dict, containing the whole table:

        Example:

        ```python
        data = {
            "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
            "age": ["56", "45", "59"],
            "number of movies": ["87", "53", "69"],
            "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
        }
        ```

        This dictionary can be passed in as such, or can be converted to a pandas DataFrame:

        Example:

        ```python
        import pandas as pd

        table = pd.DataFrame.from_dict(data)
        ```

        Args:
            table (`pd.DataFrame` or `Dict`):
                Pandas DataFrame or dictionary that will be converted to a DataFrame containing all the table values.
                See above for an example of dictionary.
            query (`str` or `list[str]`):
                Query or list of queries that will be sent to the model alongside the table.
            sequential (`bool`, *optional*, defaults to `False`):
                Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the
                inference to be done sequentially to extract relations within sequences, given their conversational
                nature.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Activates and controls padding. Accepts the following values:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).

            truncation (`bool`, `str` or [`TapasTruncationStrategy`], *optional*, defaults to `False`):
                Activates and controls truncation. Accepts the following values:

                - `True` or `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length`
                  or to the maximum acceptable input length for the model if that argument is not provided. This will
                  truncate row by row, removing rows from the table.
                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                  greater than the model maximum admissible input size).


        Return:
            A dictionary or a list of dictionaries containing results: Each result is a dictionary with the following
            keys:

            - **answer** (`str`) -- The answer of the query given the table. If there is an aggregator, the answer will
              be preceded by `AGGREGATOR >`.
            - **coordinates** (`list[tuple[int, int]]`) -- Coordinates of the cells of the answers.
            - **cells** (`list[str]`) -- List of strings made up of the answer cell values.
            - **aggregator** (`str`) -- If the model has an aggregator, this returns the aggregator.
        r   r   )r>   r<   r/   r$   )r*   argsr+   pipeline_inputsresultsrJ   r   r   r/     s
   Kz'TableQuestionAnsweringPipeline.__call__Nc                 K   s   i }|d ur
||d< |d ur||d< i }|d ur||d< t | dd d ur)| j|d< t | dd d ur;| j|d< | j|d< ||i fS )Npadding
truncation
sequentialassistant_modelassistant_tokenizer	tokenizer)rC   r   r   r   )r*   r   r   r   r+   preprocess_paramsforward_paramsr   r   r   _sanitize_parametersm  s   



z3TableQuestionAnsweringPipeline._sanitize_parametersc                 C   sv   |d u r| j dkrd}nd}|d |d }}|jrtd|d u s&|dkr*td| j||| j||d	}||d< |S )
Nr;   drop_rows_to_fitdo_not_truncater   r   ztable is empty zquery is empty)return_tensorsr   r   )r   emptyr!   r   r?   )r*   pipeline_inputr   r   r   r   rL   r   r   r   
preprocess  s   
z)TableQuestionAnsweringPipeline.preprocessc                 K   st   | d}| jdkr|r| jdi |}n| jdi |}nd|vr'| j|d< | jjdi ||}|||d}|S )Nr   r;   generation_config)model_inputsr   r   r   )popr   r   rM   r   rD   generate)r*   r   r   generate_kwargsr   r   model_outputsr   r   r   _forward  s   


z'TableQuestionAnsweringPipeline._forwardc                    sz  |d }|d |d }j dkrjrE|d d \}}j|||}|\}}fddt|D  jjj fddt|D }	n|d	 }j||}|d	 }i  i }	g }
t|D ]6\}}fd
d|D } |d}|	|d}|d	| |fdd|D d}|r||d< |

| q^t|d	krtdjjdndd jj|ddD }
t|
dkr|
S |
d	 S )Nr   r   r   r;   r   c                    s    i | ]\}}| j jj| qS r   )rD   rE   r9   r   r   pred)r*   r   r   r^     s     z>TableQuestionAnsweringPipeline.postprocess.<locals>.<dictcomp>c                    s&   i | ]\}}|kr| | d  qS )z > r   r   )aggregatorsno_agg_label_indexr   r   r^     s     r   c                       g | ]} j | qS r   iatr   
coordinater   r   r   
<listcomp>      z>TableQuestionAnsweringPipeline.postprocess.<locals>.<listcomp>r   z, c                    r   r   r   r   r   r   r   r     r   )answercoordinatescells
aggregatorzTable question answeringzEmpty answerc                 S   s   g | ]}d |iqS )r   r   )r   r   r   r   r   r     s    T)skip_special_tokensr   )r   rF   r   convert_logits_to_predictionsrw   rD   rE   no_aggregation_label_indexr"   joinro   r$   r   name_or_pathbatch_decode)r*   r   rL   r   rS   
logits_aggpredictionsanswer_coordinates_batchagg_predictionsaggregators_prefixanswersr   r   r   r   aggregator_prefixr   r   )r   r   r*   r   r   postprocess  sF   

z*TableQuestionAnsweringPipeline.postprocess)NNN)TN)F)r0   r1   r2   r3   default_input_names_pipeline_calls_generate_load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   _default_generation_configr   r=   rM   r   r/   r   r   r   r   __classcell__r   r   rJ   r   r5   V   s(    " 
R

r5   )ru   r'   rh   rW   
generationr   utilsr   r   r   r   baser	   r
   r   r   r   rk   models.auto.modeling_autor   r   
tensorflowr8   models.auto.modeling_tf_autor   r   r   r5   r   r   r   r   <module>   s    5