o
    eiC                     @   s   d dl Z d dlZd dlZddlmZ ddlmZmZm	Z	 ddl
mZmZmZmZmZ e r9d dlZddlmZmZ G dd	 d	eZeed
dG dd deZdS )    N   )GenerationConfig)add_end_docstringsis_torch_availablerequires_backends   )ArgumentHandlerDatasetPipelinePipelineExceptionbuild_pipeline_init_args),MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES0MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMESc                   @   s   e Zd ZdZdddZdS )%TableQuestionAnsweringArgumentHandlerzB
    Handles arguments for the TableQuestionAnsweringPipeline
    Nc                 K   s`  t | d dd l}|d u rtd|d u rt|tr,|dd ur,|dd ur,|g}nbt|trmt|dkrmtdd |D sLtdd	d |D  |d dd ura|d dd ura|}n-td
|d 	  dt
d urvt|t
s|t|tjr~|S tdt| d||dg}|D ]}t|d |js|d d u rtd||d |d< q|S )Npandasr   z(Keyword argument `table` cannot be None.querytablec                 s   s    | ]}t |tV  qd S N)
isinstancedict.0d r   m/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/pipelines/table_question_answering.py	<genexpr>-   s    zATableQuestionAnsweringArgumentHandler.__call__.<locals>.<genexpr>z:Keyword argument `table` should be a list of dict, but is c                 s   s    | ]}t |V  qd S r   )typer   r   r   r   r   /   s    zIf keyword argument `table` is a list of dictionaries, each dictionary should have a `table` and `query` key, but only dictionary has keys z `table` and `query` keys.zZInvalid input. Keyword argument `table` should be either of type `dict` or `list`, but is ))r   r   zTable cannot be None.)r   r   
ValueErrorr   r   getlistlenallkeysr	   typesGeneratorTyper   	DataFrame)selfr   r   kwargspdtqa_pipeline_inputstqa_pipeline_inputr   r   r   __call__   sD   
&$
z.TableQuestionAnsweringArgumentHandler.__call__)NN)__name__
__module____qualname____doc__r,   r   r   r   r   r      s    r   T)has_tokenizerc                       s   e Zd ZdZdZdZdZdZdZdZ	e
ddZe f fdd	Zd	d
 Zdd Z fddZdddZdddZdddZdd Z  ZS )TableQuestionAnsweringPipelinea  
    Table Question Answering pipeline using a `ModelForTableQuestionAnswering`. This pipeline is only available in
    PyTorch.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> oracle = pipeline(model="google/tapas-base-finetuned-wtq")
    >>> table = {
    ...     "Repository": ["Transformers", "Datasets", "Tokenizers"],
    ...     "Stars": ["36542", "4512", "3934"],
    ...     "Contributors": ["651", "77", "34"],
    ...     "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
    ... }
    >>> oracle(query="How many stars does the transformers repository have?", table=table)
    {'answer': 'AVERAGE > 36542', 'coordinates': [(0, 1)], 'cells': ['36542'], 'aggregator': 'AVERAGE'}
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This tabular question answering pipeline can currently be loaded from [`pipeline`] using the following task
    identifier: `"table-question-answering"`.

    The models that this pipeline can use are models that have been fine-tuned on a tabular question answering task.
    See the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=table-question-answering).
    ztable,queryTF   )max_new_tokensc                    sx   t  jdi | || _t }|t | | t| j	j
dd o)t| j	j
dd | _t| j	j
dr7d| _d S d | _d S )Naggregation_labelsnum_aggregation_labelstapasr   )super__init___args_parserr   copyupdater   check_model_typegetattrmodelconfig	aggregatehasattrr   )r'   args_parserr(   mapping	__class__r   r   r9   }   s   


"z'TableQuestionAnsweringPipeline.__init__c                 K   s   | j di |S )Nr   )r?   )r'   inputsr   r   r   batch_inference   s   z.TableQuestionAnsweringPipeline.batch_inferencec                    s  g }g }d}|d j d }|d | j}|d | j}|d | j}d}	t|D ]}
|dur|	dddf }t|  }||
 }	t|j d D ]@}|	dddf  | }|	dddf  | d }|	dddf  | d }|dkr|dkr|dkrt	|||f ||< qNt
|t
j| j|	dddf< ||
 }||
 }||
 }	| j|d|d|	dd	}|j}| jr||j || t
jj|d
}|j|t
j|jj }tt t|  D ]D\}}|	dddf  | }|	dddf  | d }|	dddf  | d }|dkr6|dkr6|dkr6 ||f | q fdd D }q+t
t|d}| jsP|fS |t
t|dfS )z
        Inference used for models that need to process sequences in a sequential fashion, like the SQA models which
        handle conversational query related to a table.
        N	input_idsr   attention_masktoken_type_ids   r   r   )rI   rJ   rK   )logitsc                    s$   i | ]}|t  |  d kqS )g      ?)nparraymean)r   keycoords_to_probsr   r   
<dictcomp>   s   $ zGTableQuestionAnsweringPipeline.sequential_inference.<locals>.<dictcomp>)shapetodevicerangerN   
zeros_likecpunumpytolistinttorch
from_numpyr   longr?   	unsqueezerM   rA   appendlogits_aggregationdistributions	Bernoulliprobsfloat32collectionsdefaultdictr    	enumeratesqueezecattuple)r'   rG   
all_logitsall_aggregationsprev_answers
batch_sizerI   rJ   rK   token_type_ids_exampleindexprev_labels_examplemodel_labelsi
segment_idcol_idrow_idinput_ids_exampleattention_mask_exampleoutputsrM   dist_per_tokenprobabilitiespcolrowlogits_batchr   rR   r   sequential_inference   s`   &

"z3TableQuestionAnsweringPipeline.sequential_inferencec                    s<   | j |i |}t j|fi |}t|dkr|d S |S )a  
        Answers queries according to a table. The pipeline accepts several types of inputs which are detailed below:

        - `pipeline(table, query)`
        - `pipeline(table, [query])`
        - `pipeline(table=table, query=query)`
        - `pipeline(table=table, query=[query])`
        - `pipeline({"table": table, "query": query})`
        - `pipeline({"table": table, "query": [query]})`
        - `pipeline([{"table": table, "query": query}, {"table": table, "query": query}])`

        The `table` argument should be a dict or a DataFrame built from that dict, containing the whole table:

        Example:

        ```python
        data = {
            "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
            "age": ["56", "45", "59"],
            "number of movies": ["87", "53", "69"],
            "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
        }
        ```

        This dictionary can be passed in as such, or can be converted to a pandas DataFrame:

        Example:

        ```python
        import pandas as pd

        table = pd.DataFrame.from_dict(data)
        ```

        Args:
            table (`pd.DataFrame` or `Dict`):
                Pandas DataFrame or dictionary that will be converted to a DataFrame containing all the table values.
                See above for an example of dictionary.
            query (`str` or `list[str]`):
                Query or list of queries that will be sent to the model alongside the table.
            sequential (`bool`, *optional*, defaults to `False`):
                Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the
                inference to be done sequentially to extract relations within sequences, given their conversational
                nature.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Activates and controls padding. Accepts the following values:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).

            truncation (`bool`, `str` or [`TapasTruncationStrategy`], *optional*, defaults to `False`):
                Activates and controls truncation. Accepts the following values:

                - `True` or `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length`
                  or to the maximum acceptable input length for the model if that argument is not provided. This will
                  truncate row by row, removing rows from the table.
                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                  greater than the model maximum admissible input size).


        Return:
            A dictionary or a list of dictionaries containing results: Each result is a dictionary with the following
            keys:

            - **answer** (`str`) -- The answer of the query given the table. If there is an aggregator, the answer will
              be preceded by `AGGREGATOR >`.
            - **coordinates** (`list[tuple[int, int]]`) -- Coordinates of the cells of the answers.
            - **cells** (`list[str]`) -- List of strings made up of the answer cell values.
            - **aggregator** (`str`) -- If the model has an aggregator, this returns the aggregator.
        r   r   )r:   r8   r,   r!   )r'   argsr(   pipeline_inputsresultsrE   r   r   r,      s
   Kz'TableQuestionAnsweringPipeline.__call__Nc                 K   s   i }|d ur
||d< |d ur||d< i }|d ur||d< t | dd d ur)| j|d< t | dd d ur;| j|d< | j|d< ||i fS )Npadding
truncation
sequentialassistant_modelassistant_tokenizer	tokenizer)r>   r   r   r   )r'   r   r   r   r(   preprocess_paramsforward_paramsr   r   r   _sanitize_parameters"  s   



z3TableQuestionAnsweringPipeline._sanitize_parametersc                 C   st   |d u r| j dkrd}nd}|d |d }}|jrtd|d u s&|dkr*td| j||d	||d
}||d< |S )Nr7   drop_rows_to_fitdo_not_truncater   r   ztable is empty zquery is emptypt)return_tensorsr   r   )r   emptyr   r   )r'   pipeline_inputr   r   r   r   rG   r   r   r   
preprocess5  s   
z)TableQuestionAnsweringPipeline.preprocessc                 K   st   | d}| jdkr|r| jdi |}n| jdi |}nd|vr'| j|d< | jjdi ||}|||d}|S )Nr   r7   generation_config)model_inputsr   r|   r   )popr   r   rH   r   r?   generate)r'   r   r   generate_kwargsr   r|   model_outputsr   r   r   _forwardE  s   


z'TableQuestionAnsweringPipeline._forwardc                    sz  |d }|d |d }j dkrjrE|d d \}}j|||}|\}}fddt|D  jjj fddt|D }	n|d	 }j||}|d	 }i  i }	g }
t|D ]6\}}fd
d|D } |d}|	|d}|d	| |fdd|D d}|r||d< |

| q^t|d	krtdjjdndd jj|ddD }
t|
dkr|
S |
d	 S )Nr   r   r|   r7   r   c                    s    i | ]\}}| j jj| qS r   )r?   r@   r5   r   rv   pred)r'   r   r   rT   _  s     z>TableQuestionAnsweringPipeline.postprocess.<locals>.<dictcomp>c                    s&   i | ]\}}|kr| | d  qS )z > r   r   )aggregatorsno_agg_label_indexr   r   rT   b  s     r   c                       g | ]} j | qS r   iatr   
coordinater   r   r   
<listcomp>m      z>TableQuestionAnsweringPipeline.postprocess.<locals>.<listcomp>r   z, c                    r   r   r   r   r   r   r   r   s  r   )answercoordinatescells
aggregatorzTable question answeringzEmpty answerc                 S   s   g | ]}d |iqS )r   r   )r   r   r   r   r   r   |  s    T)skip_special_tokensr   )r   rA   r   convert_logits_to_predictionsrj   r?   r@   no_aggregation_label_indexr   joinrb   r!   r   name_or_pathbatch_decode)r'   r   rG   r|   rM   
logits_aggpredictionsanswer_coordinates_batchagg_predictionsaggregators_prefixanswersrs   r   r   r   aggregator_prefixr   r   )r   r   r'   r   r   postprocessV  sF   

z*TableQuestionAnsweringPipeline.postprocess)NNN)TN)F)r-   r.   r/   r0   default_input_names_pipeline_calls_generate_load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   _default_generation_configr   r9   rH   r   r,   r   r   r   r   __classcell__r   r   rE   r   r2   M   s&    "C
R

r2   )rh   r$   r[   rN   
generationr   utilsr   r   r   baser   r	   r
   r   r   r^   models.auto.modeling_autor   r   r   r2   r   r   r   r   <module>   s    5