o
    ॵiA@                     @   sf  d dl Z d dlZd dlZd dlmZmZmZ d dlZd dlm	Z	 ddl
mZ ddlmZ ddlmZ G dd	 d	eZ	
d,ddZdefddZdededededef
ddZdedefddZdedefddZ	
	d-dedededeeef d ee d!ededefd"d#Z	$	
		
	d.dedededeeef d ee d%ed&ed'ed!ededefd(d)Zd*d+ ZdS )/    N)AnyDictList)ModeKeys   )OfaBasePreprocessor)get_database_matches)dump_db_json_schemac                       s   e Zd ZdZejf fdd	Zdeee	f deee	f fddZ
deee	f deee	f fdd	Zdeee	f deee	f fd
dZ  ZS )OfaTextToSqlPreprocessorz0
    OFA preprocessor for text to sql tasks
    c                    sl   t t| j|||g|R i | | jjdd| _| jdd| _d| _i | _	t
jt
j|d| _dS )zpreprocess the data

        Args:
            cfg(modelscope.utils.config.ConfigDict) : model config
            model_dir (str): model path,
            mode: preprocessor mode (model mode)
        promptz . generating sql code.max_struct_length   	databaseN)superr
   __init__cfgmodelgetinstruction_textr   	separatordb_schema_cacheospathjoinabspathdatabase_path)selfr   	model_dirmodeargskwargs	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/ofa/text2sql.pyr      s   

z!OfaTextToSqlPreprocessor.__init__datareturnc                 C   s    | j tjkr| |S | |S N)r   r   TRAIN_build_train_sample_build_infer_sample)r   r&   r$   r$   r%   __call__-   s   

z!OfaTextToSqlPreprocessor.__call__c                 C   sl  d| j v r	d|v sJ d|| j d  }|| j}t|dks$J d|\}}}|| jvrAt| jd | d | d || j|< d|  d| j	 }t
|||| j| j| | jjd	}|d
 }|d }|d }	|d }
d||| j }| || j }|d| j	| j d  }| jd|	dddd| j }t|| jg}t| j|g}d||||
d}|S )a  
        build sample for training tasks.

        step 1. Get the input question and database id from text input
        step 2. Get the database structure input
        step 3. Add a pseudo ids for every input.
        step 4. Calculate the target and previous output items.
        text;there must be `text` column in task key map and source data   z=invalid input, should contain query, question and database id/.sqlite NT	struct_intext_inseq_out	db_struct{} ; structured knowledge: {}    {}F)add_bosadd_eos        )idsourcetargetprev_output_tokensr6   )
column_mapsplitr   lenr   r	   r   r   stripmax_src_lengthseq2seq_inputr   r   formatr   tokenize_textr   max_tgt_lengthtorchcateos_itembos_item)r   r&   r-   textsqueryquestiondb_id
seq_inputsr3   r5   r6   src_itemtgt_itemtarget_itemprev_output_itemsampler$   r$   r%   r*   3   sf   	



z,OfaTextToSqlPreprocessor._build_train_samplec           	      C   s6  d| j v r	d|v sJ d|| j d  }|| j d d}| }|| jvr9t| jd | d | d || j|< d|  d| j }t	d||| j| j| | j
j}|d	 }|d
 }d||| j }| || j }|d| j| j d  }d||d}d| j v r| j d |v rd|| j d  |d< |S )z
        build sample for inference tasks.

        step 1. Get the input question and database id from text input
        step 2. Get the database structure input
        step 3. Add a pseudo ids for every input.
        r-   r.   r   culture_companyr0   r1   r2   Nr3   r6   r7   r8   r<   )r=   r>   r6   solutionr9   label)rA   r   rD   r   r	   r   r   rB   rE   rF   r   r   rG   r   rH   r   )	r   r&   r-   rQ   rR   r3   r6   rS   rW   r$   r$   r%   r+   k   sD   

z,OfaTextToSqlPreprocessor._build_infer_sample)__name__
__module____qualname____doc__r   	INFERENCEr   r   strr   r,   r*   r+   __classcell__r$   r$   r"   r%   r
      s    ""*8r
   Fc           
      C   sN   t | ||||}t||d  }|s|||dS t||\}}	|||	|dS )Nserialized_schema)r3   r4   r6   )r3   r4   r5   r6   )form_input_for_constructionspider_add_serialized_schemarD   spider_pre_process_one_function)
rO   rP   rQ   db_pathschemar    is_trainexrb   r5   r$   r$   r%   rF      s&   rF   itemc                 C   s2   d}t | d | d d|jd}|| d   |fS )N rO   rQ   T)rO   rQ   normalize_querytarget_with_db_idrP   )spider_get_targetrm   rD   )rj   r    prefixr5   r$   r$   r%   re      s   re   rO   rQ   rl   rm   r'   c                 C   s.   |rt ndd }|r| d||  S || S )Nc                 S   s   | S r(   r$   )xr$   r$   r%   <lambda>   s    z#spider_get_target.<locals>.<lambda>z | )	normalize)rO   rQ   rl   rm   
_normalizer$   r$   r%   rn      s   rn   c                 C   s(   dd }dd }dd }|||| S )Nc                 S   s   |  ddS )N , , )replacesr$   r$   r%   	comma_fix   s   znormalize.<locals>.comma_fixc                 S   s   d |  S )Nr2   )r   rB   rw   r$   r$   r%   white_space_fix   s   z"normalize.<locals>.white_space_fixc                 S   s   t ddd | S )Nz\b(?<!['\"])(\w+)(?!['\"])\bc                 S   s   |  d S )Nr   )grouplower)matchr$   r$   r%   rq      s    z*normalize.<locals>.lower.<locals>.<lambda>)resubrw   r$   r$   r%   r|      s   znormalize.<locals>.lowerr$   )rO   ry   rz   r|   r$   r$   r%   rr      s   rr   ri   c                 C   s   t |dr%t| d | d | d | d | d | d | d |jd	d
	}d|iS t| d | d | d | d | d ddd	|jd	d
}d|iS )Nschema_serialization_with_nlrP   rf   rQ   db_column_namesdb_table_namesdb_primary_keysdb_foreign_keysT)	rP   rf   rQ   r   r   r   r   $schema_serialization_with_db_contentrl   peteshawF)
rP   rf   rQ   r   r   schema_serialization_typeschema_serialization_randomizedschema_serialization_with_db_idr   rl   rb   )getattr!serialize_schema_natural_languager   serialize_schema)ri   r    rb   r$   r$   r%   rd      s8   
rd   TrP   rf   r   r   r   c	           %         s  | dd  fdd|D  d}	dd }
dd	 }d
d }dd }|d }tt|d |d }|	g}g }g }d}t|D ]\}} rH| n|}|| g }g }g }tt|d |d D ]J\}\}}|dkrkq` rq| n|}|| ||kr|| ||v r|| |rt| |||d | d | d d}|r||| |f q`|||}|| |
d |}|| t|dkr||}|| q>|D ](\}}||d |  } || }!||d |  }"|| }#|| |!|"|#}$||$ qd |S )Nz contains tables such as ru   c                    s   g | ]
} r
|  n|qS r$   )r|   ).0namerl   r$   r%   
<listcomp>      z5serialize_schema_natural_language.<locals>.<listcomp>.c                 S   s
   |  dS )Nz is the primary key.r$   )primary_keyr$   r$   r%   &table_description_primary_key_template  s   
zQserialize_schema_natural_language.<locals>.table_description_primary_key_templatec                 S   s   d|  dd | dS )NzTable z has columns such as ru   r   r   )r   column_namesr$   r$   r%   table_description  s   z<serialize_schema_natural_language.<locals>.table_descriptionc                 S   s   d dd | D  S )Nrk   c                 S   s   g | ]
\}}d  ||qS )z"The {} contains values such as {}.)rG   )r   columnvaluer$   r$   r%   r   
  r   zPserialize_schema_natural_language.<locals>.value_description.<locals>.<listcomp>r   )cv_pairsr$   r$   r%   value_description	  s   z<serialize_schema_natural_language.<locals>.value_descriptionc              	   S   s   d| d|  d| d| d	S )NzThe z of z is the foreign key of r   r$   )table_1column_1table_2column_2r$   r$   r%   foreign_key_description  s   zBserialize_schema_natural_language.<locals>.foreign_key_description	column_idother_column_idtable_idcolumn_namer   r0   r1   rP   
table_namer   rf   r2   )r   listzip	enumerater|   appendr   rC   )%rP   rf   rQ   r   r   r   r   r   rl   overall_descriptionr   r   r   r   descriptionsdb_table_name_strsdb_column_name_strs	value_sepr   r   table_name_strcolumnscolumn_value_pairsprimary_keysr   rp   y
column_strmatchestable_description_columns_str!table_description_primary_key_strvalue_description_strx_table_namex_column_namey_table_namey_column_nameforeign_key_description_strr$   r   r%   r      s   







r   r   r   r   r   c
              	      s   |dkrd}
d}d
d dddn|dkr&d	}
d
}d
d dddnt dtdtdtf	fdd 
fddt|D }|rRt| |ra|
jd|| }|S ||}|S )NverbosezDatabase: {db_id}. z. z"Table: {table}. Columns: {columns}ru   z{column} ({values})z{column}r   z
 | {db_id}rk   z | {table} : {columns}rt   z{column} ( {values} )r   r   r'   c                    sh   r|  n|}r.t| |d  d  d d}|r( j||dS j|dS j|dS )Nr0   r1   r   )r   values)r   )r|   r   rG   r   )r   r   column_name_strr   )column_str_with_valuescolumn_str_without_valuesrQ   rf   rl   rP   r   r   r$   r%   get_column_strk  s"   

z(serialize_schema.<locals>.get_column_strc                    s\   g | ]*\ j r ntfd dt fddtd d dqS )c                    s    | d dS )Nr   )r   r   r$   r   )r   r   r$   r%   rq     s    z-serialize_schema.<locals>.<listcomp>.<lambda>c                    s   | d  kS )Nr   r$   r   )r   r$   r%   rq     s    r   r   )tabler   )rG   r|   r   mapfilterr   )r   )
column_sepr   r   rl   	table_str)r   r   r%   r   ~  s$    
z$serialize_schema.<locals>.<listcomp>)rQ   )NotImplementedErrorr`   r   randomshufflerG   r   )rP   rf   rQ   r   r   r   r   r   r   rl   	db_id_str	table_septablesrb   r$   )r   r   r   r   rQ   rf   r   rl   rP   r   r   r   r%   r   K  s@   (

r   c                 C   sr   | ||||d dd |d D dd |d D d|d dd |d	 D d
d |d D dd |d D dd	S )Ntable_names_originalc                 S      g | ]\}}|qS r$   r$   r   r   r   r$   r$   r%   r         z/form_input_for_construction.<locals>.<listcomp>column_names_originalc                 S      g | ]\}}|qS r$   r$   r   r$   r$   r%   r     r   )r   r   column_typesc                 S   s   g | ]}d |iqS )r   r$   )r   r   r$   r$   r%   r     s    r   c                 S   r   r$   r$   r   r   r   r$   r$   r%   r     r   foreign_keysc                 S   r   r$   r$   r   r$   r$   r%   r     r   )r   r   )	rO   rP   rQ   rf   r   r   db_column_typesr   r   r$   )rO   rP   rQ   rf   rg   r$   r$   r%   rc     s0   rc   )F)FT)r   FTFT)r   r   r~   typingr   r   r   rJ   modelscope.utils.constantr   baser   utils.bridge_content_encoderr   utils.get_tablesr	   r
   rF   dictre   r`   boolrn   rr   rd   r   r   rc   r$   r$   r$   r%   <module>   s    


'
	

\
	

N