forked from xlang-ai/Binder
Commit bcea7be (1 parent: 67065b6)
Showing 22 changed files with 32,821 additions and 0 deletions.
@@ -0,0 +1,192 @@

# coding=utf-8
# Copyright 2021 The HuggingFace Datasets Authors, The Google AI Language Team Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Squall: On the Potential of Lexico-logical Alignments for Semantic Parsing to SQL Queries"""

import json
import os
import shutil

import datasets

from utils.wtq.utils import (
    WTQDBEngine,
    _load_table_w_page as _load_table,
    process_table_structure,
    retrieve_wtq_query_answer,
)

# Citation for the Squall paper (Findings of EMNLP 2020).
_CITATION = """\
@inproceedings{Shi:Zhao:Boyd-Graber:Daume-III:Lee-2020,
    Title = {On the Potential of Lexico-logical Alignments for Semantic Parsing to {SQL} Queries},
    Author = {Tianze Shi and Chen Zhao and Jordan Boyd-Graber and Hal {Daum\'{e} III} and Lillian Lee},
    Booktitle = {Findings of EMNLP},
    Year = {2020},
}
"""

_DESCRIPTION = """\
Squall pairs WikiTableQuestions questions with manually annotated SQL queries and lexical alignments.
"""

_HOMEPAGE = "https://github.com/tzshi/squall"

_LICENSE = "CC-BY-SA-4.0 License"

# The raw WikiTableQuestions release and the Squall annotations built on top of it.
_URL = "https://github.com/ppasupat/WikiTableQuestions/archive/refs/heads/master.zip"
_SQUALL_URL = "https://github.com/tzshi/squall/archive/refs/heads/main.zip"


class WikiTableQuestion(datasets.GeneratorBasedBuilder):
    """The Squall dataset."""

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "table_id": datasets.Value("string"),
                    "table": {
                        "page_title": datasets.Value("string"),
                        "header": datasets.features.Sequence(datasets.Value("string")),
                        "rows": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("string"))),
                    },
                    "sql": datasets.Value("string"),
                    "answer_text": datasets.features.Sequence(datasets.Value("string")),
                }
            ),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = os.path.join(dl_manager.download_and_extract(_URL), "WikiTableQuestions-master")
        squall_dir = os.path.join(dl_manager.download_and_extract(_SQUALL_URL), "squall-main")

        # Paths to the Squall annotations, shared by all three splits.
        squall_kwargs = {
            "data_dir": data_dir,
            "squall_path": os.path.join(squall_dir, "data/squall.json"),
            "squall_tables_path": os.path.join(squall_dir, "tables/json"),
            "squall_db_path": os.path.join(squall_dir, "tables/db"),
            "squall_tmp_db_path": os.path.join(squall_dir, "tables/tmp_db"),
        }

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": os.path.join(data_dir, "data/random-split-1-train.tsv"), **squall_kwargs},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": os.path.join(data_dir, "data/random-split-1-dev.tsv"), **squall_kwargs},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": os.path.join(data_dir, "data/pristine-unseen-tables.tsv"), **squall_kwargs},
            ),
        ]

    def _generate_examples(self, filepath, data_dir, squall_path, squall_tables_path, squall_db_path,
                           squall_tmp_db_path):
        """Yields examples."""
        if not os.path.exists(squall_tmp_db_path):
            os.makedirs(squall_tmp_db_path)

        # The source table content must not be truncated; the target table content may be.
        src_table_content_map = {}
        tgt_table_content_map = {}
        # Rows to drop per table; never populated in this script, so the deletion branch below is a no-op here.
        table_drop_rows_map = {}
        db_engine_map = {}

        for table_json_file in os.listdir(squall_tables_path):
            table_id = table_json_file[:-5]  # strip the ".json" suffix
            with open(os.path.join(squall_tables_path, table_json_file), "r", encoding="utf8") as check_table_file:
                src_table_content = json.load(check_table_file)
            src_table_content = process_table_structure(src_table_content)
            # Round-trip through JSON to keep a deep copy of the source content.
            src_table_content_map[table_id] = json.loads(json.dumps(src_table_content))
            tgt_table_content_map[table_id] = src_table_content

        for table_db_file in os.listdir(squall_db_path):
            table_id = table_db_file[:-3]  # strip the ".db" suffix
            # Copy the table db file into a temp file since we may delete some rows;
            # all later operations run on the temp db to avoid affecting the original database.
            database_path = os.path.join(squall_db_path, table_db_file)
            temp_database_path = os.path.join(squall_tmp_db_path, table_db_file)
            if os.path.exists(temp_database_path):
                os.remove(temp_database_path)
            shutil.copy(database_path, temp_database_path)
            db_engine_map[table_id] = WTQDBEngine(temp_database_path)
            if table_id in table_drop_rows_map and len(table_drop_rows_map[table_id]) != 0:
                table_drop_rows = table_drop_rows_map[table_id]
                db_engine_map[table_id].delete_rows(table_drop_rows)

        squall_id_map = {}
        with open(squall_path) as f:
            squall_data = json.load(f)
            for squall_item in squall_data:
                squall_id_map[squall_item["nt"]] = squall_item

        # Each line of the TSV: data_id, question, table_id, gold_result_str.
        with open(filepath, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                # Skip the header line.
                if idx == 0:
                    continue
                data_id, question, table_id, gold_result_str = line.strip("\n").split("\t")

                if data_id in squall_id_map.keys():
                    # Data annotation from the WikiTableQuestions dataset.
                    table = _load_table(os.path.join(data_dir, table_id.replace(".csv", ".tsv")))
                    gold_result = gold_result_str.split("|")

                    # Data annotation from the Squall dataset.
                    squall_data_item = squall_id_map[data_id]
                    squall_table_id = squall_data_item["tbl"]
                    sql_struct = squall_data_item["sql"]
                    engine, src_table_content = db_engine_map[squall_table_id], src_table_content_map[squall_table_id]
                    try:
                        encode_sql_str, _, exec_sql_str = retrieve_wtq_query_answer(engine, table, sql_struct)
                    except IndexError:
                        # Fall back to the untruncated source content in case the header was modified.
                        encode_sql_str, _, exec_sql_str = retrieve_wtq_query_answer(engine, src_table_content, sql_struct)

                    yield idx, {
                        "id": data_id,
                        "question": question,
                        "table_id": table_id,
                        "table": table,
                        "sql": encode_sql_str,
                        "answer_text": gold_result,
                    }
                else:
                    continue
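
For reference, a minimal loading sketch for the script above. The file name squall.py is hypothetical (the diff does not show file names), and this assumes a datasets version that still accepts local loading scripts (newer releases may additionally require trust_remote_code=True):

from datasets import load_dataset

# "squall.py" is a hypothetical local path to the dataset script above.
squall = load_dataset("squall.py")
example = squall["train"][0]
print(example["question"])     # natural-language question
print(example["sql"])          # SQL string encoded by retrieve_wtq_query_answer
print(example["answer_text"])  # gold denotation parsed from the WTQ TSV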
@@ -0,0 +1,133 @@

# coding=utf-8
# Copyright 2021 The HuggingFace Datasets Authors, The Google AI Language Team Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The WikiTableQuestions dataset is for the task of question answering on semi-structured HTML tables."""

import json
import os

import datasets

from utils.wtq.utils import _load_table_w_page as _load_table

# Citation for the WikiTableQuestions paper (ACL 2015).
_CITATION = """\
@inproceedings{pasupat-liang-2015-compositional,
    title = "Compositional Semantic Parsing on Semi-Structured Tables",
    author = "Pasupat, Panupong and Liang, Percy",
    booktitle = "Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
    month = jul,
    year = "2015",
    address = "Beijing, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/P15-1142",
    doi = "10.3115/v1/P15-1142",
    pages = "1470--1480",
}
"""

_DESCRIPTION = """\
Two important aspects of semantic parsing for question answering are the breadth of the knowledge source and the depth of
logical compositionality. While existing work trades off one aspect for another, this paper simultaneously makes progress
on both fronts through a new task: answering complex questions on semi-structured tables using question-answer pairs as
supervision. The central challenge arises from two compounding factors: the broader domain results in an open-ended set
of relations, and the deeper compositionality results in a combinatorial explosion in the space of logical forms. We
propose a logical-form driven parsing algorithm guided by strong typing constraints and show that it obtains significant
improvements over natural baselines. For evaluation, we created a new dataset of 22,033 complex questions on Wikipedia
tables, which is made publicly available.
"""

_HOMEPAGE = "https://ppasupat.github.io/WikiTableQuestions/"

_LICENSE = "CC-BY-SA-4.0 License"

_URL = "https://github.com/ppasupat/WikiTableQuestions/archive/refs/heads/master.zip"
_SQUALL_URL = "https://github.com/tzshi/squall/archive/refs/heads/main.zip"


class WikiTableQuestion(datasets.GeneratorBasedBuilder):
    """The WikiTableQuestions dataset."""

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "table_id": datasets.Value("string"),
                    "table": {
                        "page_title": datasets.Value("string"),
                        "header": datasets.features.Sequence(datasets.Value("string")),
                        "rows": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("string"))),
                    },
                    "answer_text": datasets.features.Sequence(datasets.Value("string")),
                }
            ),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = os.path.join(dl_manager.download_and_extract(_URL), "WikiTableQuestions-master")
        squall_dir = os.path.join(dl_manager.download_and_extract(_SQUALL_URL), "squall-main")

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": os.path.join(data_dir, "data/random-split-1-train.tsv"),
                            "data_dir": data_dir,
                            "squall_path": os.path.join(squall_dir, "data/squall.json")},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": os.path.join(data_dir, "data/random-split-1-dev.tsv"),
                            "data_dir": data_dir,
                            "squall_path": os.path.join(squall_dir, "data/squall.json")},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": os.path.join(data_dir, "data/pristine-unseen-tables.tsv"),
                            "data_dir": data_dir,
                            "squall_path": os.path.join(squall_dir, "data/squall.json")},
            ),
        ]

    def _generate_examples(self, filepath, data_dir, squall_path):
        """Yields examples."""
        # Collect the ids of examples covered by Squall; this script keeps only the complement.
        squall_id_set = set()
        with open(squall_path) as f:
            squall_data = json.load(f)
            for squall_item in squall_data:
                squall_id_set.add(squall_item["nt"])
        # Each line of the TSV: data_id, question, table_id, gold_result_str.
        with open(filepath, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                # Skip the header line.
                if idx == 0:
                    continue
                data_id, question, table_id, gold_result_str = line.strip("\n").split("\t")
                if data_id not in squall_id_set:
                    gold_result = gold_result_str.split("|")
                    # Convert the ".csv" suffix to ".tsv" for easier reading.
                    yield idx, {
                        "id": data_id,
                        "question": question,
                        "table_id": table_id,
                        "table": _load_table(os.path.join(data_dir, table_id.replace(".csv", ".tsv"))),
                        "answer_text": gold_result,
                    }
                else:
                    continue
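
Taken together, the two scripts partition each WikiTableQuestions split: the first yields only examples that carry Squall SQL annotations, this one only the remainder. A small sketch of that invariant, again with hypothetical local file names (squall.py, wikitq.py) and the same datasets-version caveat as above:

from datasets import load_dataset

# Hypothetical local paths to the two dataset scripts above.
with_sql = load_dataset("squall.py", split="validation")
without_sql = load_dataset("wikitq.py", split="validation")

# Every example id lands in exactly one of the two datasets.
assert set(with_sql["id"]).isdisjoint(set(without_sql["id"]))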