Skip to content

Commit

Permalink
Init commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Timothyxxx committed Oct 8, 2022
1 parent 67065b6 commit bcea7be
Show file tree
Hide file tree
Showing 22 changed files with 32,821 additions and 0 deletions.
192 changes: 192 additions & 0 deletions datasets/has_squall.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Datasets Authors, The Google AI Language Team Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Squall: On the Potential of Lexico-logical Alignments for Semantic Parsing to SQL Queries"""


import json
import os
import datasets
import shutil
from utils.wtq.utils import _load_table_w_page as _load_table

# Find for instance the citation on arxiv or on the dataset repo/website
_CITATION = """\
@inproceedings{Shi:Zhao:Boyd-Graber:Daume-III:Lee-2020,
Title = {On the Potential of Lexico-logical Alignments for Semantic Parsing to {SQL} Queries},
Author = {Tianze Shi and Chen Zhao and Jordan Boyd-Graber and Hal {Daum\'{e} III} and Lillian Lee},
Booktitle = {Findings of EMNLP},
Year = {2020},
}
"""

_DESCRIPTION = """\
"""

_HOMEPAGE = "https://github.com/tzshi/squall"

_LICENSE = "CC-BY-SA-4.0 License"

_URL = "https://github.com/ppasupat/WikiTableQuestions/archive/refs/heads/master.zip"
_SQUALL_URL = "https://github.com/tzshi/squall/archive/refs/heads/main.zip"

from utils.wtq.utils import WTQDBEngine, process_table_structure, retrieve_wtq_query_answer

class WikiTableQuestion(datasets.GeneratorBasedBuilder):
"""The Squall dataset"""

def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features(
{
"id": datasets.Value("string"),
"question": datasets.Value("string"),
"table_id": datasets.Value("string"),
"table": {
"page_title": datasets.Value("string"),
"header": datasets.features.Sequence(datasets.Value("string")),
"rows": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("string")))
},
"sql": datasets.Value("string"),
"answer_text": datasets.features.Sequence(datasets.Value("string")),
}
),
supervised_keys=None,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
data_dir = os.path.join(dl_manager.download_and_extract(_URL), 'WikiTableQuestions-master')
squall_dir = os.path.join(dl_manager.download_and_extract(_SQUALL_URL), 'squall-main')

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": os.path.join(data_dir, "data/random-split-1-train.tsv"),
"data_dir": data_dir,
"squall_path": os.path.join(squall_dir, "data/squall.json"),
"squall_tables_path": os.path.join(squall_dir, "tables/json"),
"squall_db_path": os.path.join(squall_dir, "tables/db"),
"squall_tmp_db_path": os.path.join(squall_dir, "tables/tmp_db"),

},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={
"filepath": os.path.join(data_dir, "data/random-split-1-dev.tsv"),
"data_dir": data_dir,
"squall_path": os.path.join(squall_dir, "data/squall.json"),
"squall_tables_path": os.path.join(squall_dir, "tables/json"),
"squall_db_path": os.path.join(squall_dir, "tables/db"),
"squall_tmp_db_path": os.path.join(squall_dir, "tables/tmp_db"),

},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"filepath": os.path.join(data_dir, "data/pristine-unseen-tables.tsv"),
"data_dir": data_dir,
"squall_path": os.path.join(squall_dir, "data/squall.json"),
"squall_tables_path": os.path.join(squall_dir, "tables/json"),
"squall_db_path": os.path.join(squall_dir, "tables/db"),
"squall_tmp_db_path": os.path.join(squall_dir, "tables/tmp_db"),

},
),

]

def _generate_examples(self, filepath, data_dir, squall_path, squall_tables_path, squall_db_path,
squall_tmp_db_path):
if not os.path.exists(squall_tmp_db_path):
os.makedirs(squall_tmp_db_path)

# source table should not be truncated!
src_table_content_map = {}
# tgt table should be truncated!
tgt_table_content_map = {}
table_drop_rows_map = {}
db_engine_map = {}

for table_json_file in os.listdir(squall_tables_path):
table_id = table_json_file[:-5]
check_table_file = open(os.path.join(squall_tables_path, table_json_file), "r", encoding="utf8")
src_table_content = json.load(check_table_file)
src_table_content = process_table_structure(src_table_content)
src_table_content_map[table_id] = json.loads(json.dumps(src_table_content))
tgt_table_content_map[table_id] = src_table_content

for table_db_file in os.listdir(squall_db_path):
table_id = table_db_file[:-3]
# copy table db file into a temp file since we may delete some rows
database_path = os.path.join(squall_db_path, table_db_file)
temp_database_path = os.path.join(squall_tmp_db_path, table_db_file)
if os.path.exists(temp_database_path):
os.remove(temp_database_path)
# future operations on the temp db to avoid effecting the original database
shutil.copy(database_path, temp_database_path)
db_engine_map[table_id] = WTQDBEngine(temp_database_path)
if table_id in table_drop_rows_map and len(table_drop_rows_map[table_id]) != 0:
table_drop_rows = table_drop_rows_map[table_id]
db_engine_map[table_id].delete_rows(table_drop_rows)

"""Yields examples."""
squall_id_map = {}
with open(squall_path) as f:
squall_data = json.load(f)
for squall_item in squall_data:
squall_id_map[squall_item["nt"]] = squall_item

# data_id, question, table_id, gold_result_str
with open(filepath, encoding="utf-8") as f:
for idx, line in enumerate(f):
# skip the header
if idx == 0:
continue
data_id, question, table_id, gold_result_str = line.strip("\n").split("\t")

if data_id in squall_id_map.keys():
# Data annotation from WikiTableQuestion dataset
table = _load_table(os.path.join(data_dir, table_id.replace('.csv', '.tsv')))
gold_result = gold_result_str.split('|')

# Data annotation from Squall dataset.
squall_data_item = squall_id_map[data_id]
squall_table_id = squall_data_item["tbl"]
sql_struct = squall_data_item["sql"]
engine, src_table_content = db_engine_map[squall_table_id], src_table_content_map[squall_table_id]
try:
encode_sql_str, _, exec_sql_str = retrieve_wtq_query_answer(engine, table, sql_struct)
except IndexError as e:
# In case header is modified.
encode_sql_str, _, exec_sql_str = retrieve_wtq_query_answer(engine, src_table_content, sql_struct)

yield idx, {
"id": data_id,
"question": question,
"table_id": table_id,
"table": table,
"sql": encode_sql_str,
"answer_text": gold_result,
}
else:
continue
133 changes: 133 additions & 0 deletions datasets/missing_squall.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Datasets Authors, The Google AI Language Team Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The WikiTableQuestions dataset is for the task of question answering on semi-structured HTML tables"""

import json
import os
import datasets
from utils.wtq.utils import _load_table_w_page as _load_table

# Find for instance the citation on arxiv or on the dataset repo/website
_CITATION = """\
@inproceedings{pasupat-liang-2015-compositional,
title = "Compositional Semantic Parsing on Semi-Structured Tables",
author = "Pasupat, Panupong and
Liang, Percy",
booktitle = "Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
month = jul,
year = "2015",
address = "Beijing, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P15-1142",
doi = "10.3115/v1/P15-1142",
pages = "1470--1480",
}
"""

_DESCRIPTION = """\
Two important aspects of semantic parsing for question answering are the breadth of the knowledge source and the depth of
logical compositionality. While existing work trades off one aspect for another, this paper simultaneously makes progress
on both fronts through a new task: answering complex questions on semi-structured tables using question-answer pairs as
supervision. The central challenge arises from two compounding factors: the broader domain results in an open-ended set
of relations, and the deeper compositionality results in a combinatorial explosion in the space of logical forms. We
propose a logical-form driven parsing algorithm guided by strong typing constraints and show that it obtains significant
improvements over natural baselines. For evaluation, we created a new dataset of 22,033 complex questions on Wikipedia
tables, which is made publicly available.
"""

_HOMEPAGE = "https://ppasupat.github.io/WikiTableQuestions/"

_LICENSE = "CC-BY-SA-4.0 License"

_URL = "https://github.com/ppasupat/WikiTableQuestions/archive/refs/heads/master.zip"
_SQUALL_URL = "https://github.com/tzshi/squall/archive/refs/heads/main.zip"


class WikiTableQuestion(datasets.GeneratorBasedBuilder):
"""The WikiTableQuestions dataset"""

def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features(
{
"id": datasets.Value("string"),
"question": datasets.Value("string"),
"table_id": datasets.Value("string"),
"table": {"page_title": datasets.Value("string"),
"header": datasets.features.Sequence(datasets.Value("string")),
"rows": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("string")))},
"answer_text": datasets.features.Sequence(datasets.Value("string")),
}
),
supervised_keys=None,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager):
"""Returns SplitGenerators."""
data_dir = os.path.join(dl_manager.download_and_extract(_URL), 'WikiTableQuestions-master')
squall_dir = os.path.join(dl_manager.download_and_extract(_SQUALL_URL), 'squall-main')

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={"filepath": os.path.join(data_dir, "data/random-split-1-train.tsv"),
"data_dir": data_dir,
"squall_path": os.path.join(squall_dir, "data/squall.json")},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={"filepath": os.path.join(data_dir, "data/random-split-1-dev.tsv"),
"data_dir": data_dir,
"squall_path": os.path.join(squall_dir, "data/squall.json")},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={"filepath": os.path.join(data_dir, "data/pristine-unseen-tables.tsv"),
"data_dir": data_dir,
"squall_path": os.path.join(squall_dir, "data/squall.json")},
),

]

def _generate_examples(self, filepath, data_dir, squall_path):
"""Yields examples."""
squall_id_list = []
with open(squall_path) as f:
squall_data = json.load(f)
for squall_item in squall_data:
squall_id_list.append(squall_item["nt"])
# data_id, question, table_id, gold_result_str
with open(filepath, encoding="utf-8") as f:
for idx, line in enumerate(f):
# skip the header
if idx == 0:
continue
data_id, question, table_id, gold_result_str = line.strip("\n").split("\t")
if data_id not in squall_id_list:
gold_result = gold_result_str.split('|')
yield idx, {
"id": data_id,
"question": question,
"table_id": table_id,
"table": _load_table(os.path.join(data_dir, table_id.replace('.csv', '.tsv'))),
# convert the .csv postfix to .tsv, for easier read-in
"answer_text": gold_result,
}
else:
continue
Loading

0 comments on commit bcea7be

Please sign in to comment.