From fa66c94d5a4d6e2a7dcf2c64d6377d3d1162be55 Mon Sep 17 00:00:00 2001 From: WilliamGazeley Date: Wed, 14 Feb 2024 13:55:17 +0000 Subject: [PATCH 1/4] Implement relative importing of data --- .gitignore | 3 + .../FinGPT_Benchmark/benchmarks/convfinqa.py | 9 ++- fingpt/FinGPT_Benchmark/benchmarks/fineval.py | 5 +- fingpt/FinGPT_Benchmark/benchmarks/finred.py | 5 +- fingpt/FinGPT_Benchmark/benchmarks/fiqa.py | 7 ++- fingpt/FinGPT_Benchmark/benchmarks/fpb.py | 9 +-- .../FinGPT_Benchmark/benchmarks/headline.py | 8 +-- fingpt/FinGPT_Benchmark/benchmarks/ner.py | 6 +- fingpt/FinGPT_Benchmark/benchmarks/nwgi.py | 4 +- fingpt/FinGPT_Benchmark/benchmarks/tfns.py | 3 +- fingpt/FinGPT_Benchmark/data/download.py | 57 +++++++++++++++++++ 11 files changed, 90 insertions(+), 26 deletions(-) create mode 100644 fingpt/FinGPT_Benchmark/data/download.py diff --git a/.gitignore b/.gitignore index 9961844..c80c860 100644 --- a/.gitignore +++ b/.gitignore @@ -136,3 +136,6 @@ fingpt/FinGPT_sentiment/instruct-FinGPT/run.sh fingpt/FinGPT_sentiment/instruct-FinGPT/checkpoints fingpt/FinGPT_sentiment/instruct-FinGPT/ds_results_all_10_v2_1.* FinGPT_Training_LoRA_with_Chatglm2_6b_for_beginners.ipynb + +# Benchmark data +fingpt/FinGPT_Benchmark/data/*/** diff --git a/fingpt/FinGPT_Benchmark/benchmarks/convfinqa.py b/fingpt/FinGPT_Benchmark/benchmarks/convfinqa.py index 070df61..0bf9a64 100644 --- a/fingpt/FinGPT_Benchmark/benchmarks/convfinqa.py +++ b/fingpt/FinGPT_Benchmark/benchmarks/convfinqa.py @@ -8,9 +8,9 @@ import re import sys import numpy as np +from fingpt.FinGPT_Benchmark.utils import * +from pathlib import Path sys.path.append('../') -from utils import * - def cvt_text_to_pred(text): if not text: @@ -32,9 +32,9 @@ def map_output(feature): return {'label': label, 'pred': pred} def test_convfinqa(args, model, tokenizer): - dataset = load_from_disk('../data/fingpt-convfinqa')['test']#.select(range(30)) + dataset = load_from_disk(Path(__file__).parent.parent / 
'data/fingpt-convfinqa')['test'] dataset = dataset.map(partial(test_mapping, args), load_from_cache_file=False) def collate_fn(batch): diff --git a/fingpt/FinGPT_Benchmark/benchmarks/fineval.py b/fingpt/FinGPT_Benchmark/benchmarks/fineval.py index bd19542..9611038 100644 --- a/fingpt/FinGPT_Benchmark/benchmarks/fineval.py +++ b/fingpt/FinGPT_Benchmark/benchmarks/fineval.py @@ -8,8 +8,9 @@ import re import sys import numpy as np +from fingpt.FinGPT_Benchmark.utils import * +from pathlib import Path sys.path.append('../') -from utils import * def cvt_text_to_pred(text): @@ -33,7 +34,7 @@ def map_output(feature): def test_fineval(args, model, tokenizer): - dataset = load_from_disk('../data/fingpt-fineval')['test']#.select(range(30)) + dataset = load_from_disk(Path(__file__).parent.parent / 'data/fingpt-fineval')['test'] dataset = dataset.map(partial(test_mapping, args), load_from_cache_file=False) def collate_fn(batch): diff --git a/fingpt/FinGPT_Benchmark/benchmarks/finred.py b/fingpt/FinGPT_Benchmark/benchmarks/finred.py index 2e801c5..b197a4f 100644 --- a/fingpt/FinGPT_Benchmark/benchmarks/finred.py +++ b/fingpt/FinGPT_Benchmark/benchmarks/finred.py @@ -8,8 +8,9 @@ import re import sys import numpy as np +from fingpt.FinGPT_Benchmark.utils import * +from pathlib import Path sys.path.append('../') -from utils import * relations = [ @@ -102,7 +103,7 @@ def calc_metric(gt_list, pred_list): def test_re(args, model, tokenizer): - dataset = load_from_disk('../data/fingpt-finred-re')['test']#.select(range(50)) + dataset = load_from_disk(Path(__file__).parent.parent / 'data/fingpt-finred-re')['test'] dataset = dataset.train_test_split(0.2, seed=42)['test'] dataset = dataset.map(partial(test_mapping, args), load_from_cache_file=False) diff --git a/fingpt/FinGPT_Benchmark/benchmarks/fiqa.py b/fingpt/FinGPT_Benchmark/benchmarks/fiqa.py index 94e405e..1967b82 100644 --- a/fingpt/FinGPT_Benchmark/benchmarks/fiqa.py +++ b/fingpt/FinGPT_Benchmark/benchmarks/fiqa.py @@ -9,9 +9,10 
@@ from torch.utils.data import DataLoader from functools import partial +from pathlib import Path -with open('sentiment_templates.txt') as f: +with open(Path(__file__).parent / 'sentiment_templates.txt') as f: templates = [l.strip() for l in f.readlines()] @@ -58,7 +59,7 @@ def vote_output(x): def test_fiqa(args, model, tokenizer, prompt_fun=add_instructions): batch_size = args.batch_size # dataset = load_dataset('pauri32/fiqa-2018') - dataset = load_from_disk('../data/fiqa-2018/') + dataset = load_from_disk(Path(__file__).parent.parent / 'data/fiqa-2018/') dataset = datasets.concatenate_datasets([dataset["train"], dataset["validation"] ,dataset["test"] ]) dataset = dataset.train_test_split(0.226, seed = 42)['test'] dataset = dataset.to_pandas() @@ -112,7 +113,7 @@ def test_fiqa(args, model, tokenizer, prompt_fun=add_instructions): def test_fiqa_mlt(args, model, tokenizer): batch_size = args.batch_size # dataset = load_dataset('pauri32/fiqa-2018') - dataset = load_from_disk('../data/fiqa-2018/') + dataset = load_from_disk(Path(__file__).parent.parent / 'data/fiqa-2018/') dataset = datasets.concatenate_datasets([dataset["train"], dataset["validation"] ,dataset["test"] ]) dataset = dataset.train_test_split(0.226, seed=42)['test'] dataset = dataset.to_pandas() diff --git a/fingpt/FinGPT_Benchmark/benchmarks/fpb.py b/fingpt/FinGPT_Benchmark/benchmarks/fpb.py index 0aa6788..1a460a3 100644 --- a/fingpt/FinGPT_Benchmark/benchmarks/fpb.py +++ b/fingpt/FinGPT_Benchmark/benchmarks/fpb.py @@ -9,6 +9,7 @@ from torch.utils.data import DataLoader from functools import partial +from pathlib import Path dic = { 0:"negative", @@ -16,7 +17,7 @@ 2:'positive', } -with open('sentiment_templates.txt') as f: +with open(Path(__file__).parent / 'sentiment_templates.txt') as f: templates = [l.strip() for l in f.readlines()] @@ -52,7 +53,7 @@ def vote_output(x): def test_fpb(args, model, tokenizer, prompt_fun=None): batch_size = args.batch_size # instructions = 
load_dataset("financial_phrasebank", "sentences_50agree") - instructions = load_from_disk("../data/financial_phrasebank-sentences_50agree/") + instructions = load_from_disk(Path(__file__).parent.parent / "data/financial_phrasebank-sentences_50agree/") instructions = instructions["train"] instructions = instructions.train_test_split(seed = 42)['test'] instructions = instructions.to_pandas() @@ -105,8 +106,8 @@ def test_fpb(args, model, tokenizer, prompt_fun=None): def test_fpb_mlt(args, model, tokenizer): batch_size = args.batch_size - # instructions = load_dataset("financial_phrasebank", "sentences_50agree") - dataset = load_from_disk('../data/financial_phrasebank-sentences_50agree/') + # dataset = load_dataset("financial_phrasebank", "sentences_50agree") + dataset = load_from_disk(Path(__file__).parent.parent / 'data/financial_phrasebank-sentences_50agree/') dataset = dataset["train"]#.select(range(300)) dataset = dataset.train_test_split(seed=42)['test'] dataset = dataset.to_pandas() diff --git a/fingpt/FinGPT_Benchmark/benchmarks/headline.py b/fingpt/FinGPT_Benchmark/benchmarks/headline.py index 3ce3dbd..59c1087 100644 --- a/fingpt/FinGPT_Benchmark/benchmarks/headline.py +++ b/fingpt/FinGPT_Benchmark/benchmarks/headline.py @@ -5,11 +5,11 @@ import torch from torch.utils.data import DataLoader from functools import partial - +from pathlib import Path +from fingpt.FinGPT_Benchmark.utils import * import sys sys.path.append('../') -from utils import * @@ -34,8 +34,8 @@ def map_output(feature): def test_headline(args, model, tokenizer): - # dataset = load_from_disk('../data/fingpt-headline')['test']#.select(range(300)) - dataset = load_from_disk('../data/fingpt-headline-instruct')['test']#.select(range(300)) + # dataset = load_from_disk('../data/fingpt-headline')['test'] + dataset = load_from_disk(Path(__file__).parent.parent / 'data/fingpt-headline-instruct')['test'] dataset = dataset.map(partial(test_mapping, args), load_from_cache_file=False) def 
collate_fn(batch): diff --git a/fingpt/FinGPT_Benchmark/benchmarks/ner.py b/fingpt/FinGPT_Benchmark/benchmarks/ner.py index 83ea5fe..b8cdb96 100644 --- a/fingpt/FinGPT_Benchmark/benchmarks/ner.py +++ b/fingpt/FinGPT_Benchmark/benchmarks/ner.py @@ -8,9 +8,9 @@ import re import sys import numpy as np +from fingpt.FinGPT_Benchmark.utils import * +from pathlib import Path sys.path.append('../') -from utils import * - ent_dict = { 'PER': 'person', @@ -53,7 +53,7 @@ def map_output(feature): def test_ner(args, model, tokenizer): - dataset = load_from_disk('../data/fingpt-ner')['test']#.select(range(30)) + dataset = load_from_disk(Path(__file__).parent.parent / 'data/fingpt-ner')['test'] dataset = dataset.map(partial(test_mapping, args), load_from_cache_file=False) def collate_fn(batch): diff --git a/fingpt/FinGPT_Benchmark/benchmarks/nwgi.py b/fingpt/FinGPT_Benchmark/benchmarks/nwgi.py index efa952b..ced2b5b 100644 --- a/fingpt/FinGPT_Benchmark/benchmarks/nwgi.py +++ b/fingpt/FinGPT_Benchmark/benchmarks/nwgi.py @@ -6,6 +6,7 @@ from tqdm import tqdm import datasets import torch +from pathlib import Path dic = { 'strong negative':"negative", @@ -36,8 +37,8 @@ def change_target(x): def test_nwgi(args, model, tokenizer, prompt_fun=None): batch_size = args.batch_size # dataset = load_dataset('oliverwang15/news_with_gpt_instructions') - dataset = load_from_disk('../data/news_with_gpt_instructions/') + dataset = load_from_disk(Path(__file__).parent.parent / 'data/news_with_gpt_instructions/') dataset = dataset['test'].to_pandas() dataset['output'] = dataset['label'].apply(lambda x:dic[x]) if prompt_fun is None: diff --git a/fingpt/FinGPT_Benchmark/benchmarks/tfns.py b/fingpt/FinGPT_Benchmark/benchmarks/tfns.py index ee3f283..8c7ae84 100644 --- a/fingpt/FinGPT_Benchmark/benchmarks/tfns.py +++ b/fingpt/FinGPT_Benchmark/benchmarks/tfns.py @@ -6,6 +6,7 @@ from tqdm import tqdm import datasets import torch +from pathlib import Path dic = { 0:"negative", @@ -32,7 +33,7 @@ def 
change_target(x): def test_tfns(args, model, tokenizer, prompt_fun=None): batch_size = args.batch_size # dataset = load_dataset('zeroshot/twitter-financial-news-sentiment') - dataset = load_from_disk('../data/twitter-financial-news-sentiment') + dataset = load_from_disk(Path(__file__).parent.parent / 'data/twitter-financial-news-sentiment') dataset = dataset['validation'] dataset = dataset.to_pandas() dataset['label'] = dataset['label'].apply(lambda x:dic[x]) diff --git a/fingpt/FinGPT_Benchmark/data/download.py b/fingpt/FinGPT_Benchmark/data/download.py new file mode 100644 index 0000000..48517a3 --- /dev/null +++ b/fingpt/FinGPT_Benchmark/data/download.py @@ -0,0 +1,57 @@ +import datasets +from pathlib import Path + +def download(): + """Downloads all datasets to where the FinGPT library is located.""" + data_dir = Path(__file__).parent + + dataset = datasets.load_dataset('pauri32/fiqa-2018') + dataset.save_to_disk(data_dir / 'fiqa-2018') + + dataset = datasets.load_dataset('FinGPT/fingpt-finred') + dataset.save_to_disk(data_dir / 'fingpt-finred') + + dataset = datasets.load_dataset('zeroshot/twitter-financial-news-sentiment') + dataset.save_to_disk(data_dir / 'twitter-financial-news-sentiment') + + dataset = datasets.load_dataset('oliverwang15/news_with_gpt_instructions') + dataset.save_to_disk(data_dir / 'news_with_gpt_instructions') + + dataset = datasets.load_dataset("financial_phrasebank", "sentences_50agree") + dataset.save_to_disk(data_dir / 'financial_phrasebank-sentences_50agree') + + dataset = datasets.load_dataset('FinGPT/fingpt-fiqa_qa') + dataset.save_to_disk(data_dir / 'fingpt-fiqa_qa') + + dataset = datasets.load_dataset('FinGPT/fingpt-headline-cls') + dataset.save_to_disk(data_dir / 'fingpt-headline-cls') + + dataset = datasets.load_dataset('FinGPT/fingpt-finred') + dataset.save_to_disk(data_dir / 'fingpt-finred') + + dataset = datasets.load_dataset('FinGPT/fingpt-convfinqa') + dataset.save_to_disk(data_dir / 'fingpt-convfinqa') + + dataset = 
datasets.load_dataset('FinGPT/fingpt-finred-cls') + dataset.save_to_disk(data_dir / 'fingpt-finred-cls') + + dataset = datasets.load_dataset('FinGPT/fingpt-ner') + dataset.save_to_disk(data_dir / 'fingpt-ner') + + dataset = datasets.load_dataset('FinGPT/fingpt-headline') + dataset.save_to_disk(data_dir / 'fingpt-headline-instruct') + + dataset = datasets.load_dataset('FinGPT/fingpt-finred-re') + dataset.save_to_disk(data_dir / 'fingpt-finred-re') + + dataset = datasets.load_dataset('FinGPT/fingpt-ner-cls') + dataset.save_to_disk(data_dir / 'fingpt-ner-cls') + + dataset = datasets.load_dataset('FinGPT/fingpt-fineval') + dataset.save_to_disk(data_dir / 'fingpt-fineval') + + dataset = datasets.load_dataset('FinGPT/fingpt-sentiment-cls') + dataset.save_to_disk(data_dir / 'fingpt-sentiment-cls') + +if __name__ == "__main__": + download() From 4cc826d53450a736cd14762c696b6dcf4139832b Mon Sep 17 00:00:00 2001 From: WilliamGazeley Date: Mon, 19 Feb 2024 09:26:48 +0000 Subject: [PATCH 2/4] Added init file to benchmarks dir --- fingpt/FinGPT_Benchmark/benchmarks/__init__.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 fingpt/FinGPT_Benchmark/benchmarks/__init__.py diff --git a/fingpt/FinGPT_Benchmark/benchmarks/__init__.py b/fingpt/FinGPT_Benchmark/benchmarks/__init__.py new file mode 100644 index 0000000..8fdfe5c --- /dev/null +++ b/fingpt/FinGPT_Benchmark/benchmarks/__init__.py @@ -0,0 +1,3 @@ +from . 
import fpb, fiqa, finred, fineval, convfinqa, headline, ner, nwgi, tfns + +__all__ = ["fpb", "fiqa", "finred", "fineval", "convfinqa", "headline", "ner", "nwgi", "tfns"] From ccfd2f9c96f100b70f9cfbabed793a46c27f5958 Mon Sep 17 00:00:00 2001 From: WilliamGazeley Date: Mon, 19 Feb 2024 10:08:42 +0000 Subject: [PATCH 3/4] Include sentiment_templates.txt in package --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) create mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..5b7895a --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include fingpt/FinGPT_Benchmark/benchmarks/sentiment_templates.txt From d5134817bf9adef6e68c4933111e26f5899add09 Mon Sep 17 00:00:00 2001 From: WilliamGazeley Date: Mon, 19 Feb 2024 10:09:12 +0000 Subject: [PATCH 4/4] Cleaner dataset downloads --- fingpt/FinGPT_Benchmark/__init__.py | 2 + fingpt/FinGPT_Benchmark/data/__init__.py | 0 fingpt/FinGPT_Benchmark/data/download.py | 84 ++++++++++-------------- 3 files changed, 36 insertions(+), 50 deletions(-) create mode 100644 fingpt/FinGPT_Benchmark/data/__init__.py diff --git a/fingpt/FinGPT_Benchmark/__init__.py b/fingpt/FinGPT_Benchmark/__init__.py index e69de29..1aa7252 100644 --- a/fingpt/FinGPT_Benchmark/__init__.py +++ b/fingpt/FinGPT_Benchmark/__init__.py @@ -0,0 +1,2 @@ +from .data.download import download as download_datasets +from . 
import benchmarks diff --git a/fingpt/FinGPT_Benchmark/data/__init__.py b/fingpt/FinGPT_Benchmark/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/fingpt/FinGPT_Benchmark/data/download.py b/fingpt/FinGPT_Benchmark/data/download.py index 48517a3..d18aee0 100644 --- a/fingpt/FinGPT_Benchmark/data/download.py +++ b/fingpt/FinGPT_Benchmark/data/download.py @@ -1,57 +1,41 @@ import datasets from pathlib import Path - -def download(): +import argparse + +DATASETS = [ + # source, destination + (('pauri32/fiqa-2018', None), 'fiqa-2018'), + (('FinGPT/fingpt-finred', None), 'fingpt-finred'), + (('zeroshot/twitter-financial-news-sentiment', None), 'twitter-financial-news-sentiment'), + (('oliverwang15/news_with_gpt_instructions', None), 'news_with_gpt_instructions'), + (('financial_phrasebank', 'sentences_50agree'), 'financial_phrasebank-sentences_50agree'), + (('FinGPT/fingpt-fiqa_qa', None), 'fingpt-fiqa_qa'), + (('FinGPT/fingpt-headline-cls', None), 'fingpt-headline-cls'), + (('FinGPT/fingpt-finred', None), 'fingpt-finred'), + (('FinGPT/fingpt-convfinqa', None), 'fingpt-convfinqa'), + (('FinGPT/fingpt-finred-cls', None), 'fingpt-finred-cls'), + (('FinGPT/fingpt-ner', None), 'fingpt-ner'), + (('FinGPT/fingpt-headline', None), 'fingpt-headline-instruct'), + (('FinGPT/fingpt-finred-re', None), 'fingpt-finred-re'), + (('FinGPT/fingpt-ner-cls', None), 'fingpt-ner-cls'), + (('FinGPT/fingpt-fineval', None), 'fingpt-fineval'), + (('FinGPT/fingpt-sentiment-cls', None), 'fingpt-sentiment-cls'), +] + +def download(no_cache: bool = False): """Downloads all datasets to where the FinGPT library is located.""" data_dir = Path(__file__).parent - dataset = datasets.load_dataset('pauri32/fiqa-2018') - dataset.save_to_disk(data_dir / 'fiqa-2018') - - dataset = datasets.load_dataset('FinGPT/fingpt-finred') - dataset.save_to_disk(data_dir / 'fingpt-finred') - - dataset = datasets.load_dataset('zeroshot/twitter-financial-news-sentiment') - dataset.save_to_disk(data_dir 
/ 'twitter-financial-news-sentiment') - - dataset = datasets.load_dataset('oliverwang15/news_with_gpt_instructions') - dataset.save_to_disk(data_dir / 'news_with_gpt_instructions') - - dataset = datasets.load_dataset("financial_phrasebank", "sentences_50agree") - dataset.save_to_disk(data_dir / 'financial_phrasebank-sentences_50agree') - - dataset = datasets.load_dataset('FinGPT/fingpt-fiqa_qa') - dataset.save_to_disk(data_dir / 'fingpt-fiqa_qa') - - dataset = datasets.load_dataset('FinGPT/fingpt-headline-cls') - dataset.save_to_disk(data_dir / 'fingpt-headline-cls') - - dataset = datasets.load_dataset('FinGPT/fingpt-finred') - dataset.save_to_disk(data_dir / 'fingpt-finred') - - dataset = datasets.load_dataset('FinGPT/fingpt-convfinqa') - dataset.save_to_disk(data_dir / 'fingpt-convfinqa') - - dataset = datasets.load_dataset('FinGPT/fingpt-finred-cls') - dataset.save_to_disk(data_dir / 'fingpt-finred-cls') - - dataset = datasets.load_dataset('FinGPT/fingpt-ner') - dataset.save_to_disk(data_dir / 'fingpt-ner') - - dataset = datasets.load_dataset('FinGPT/fingpt-headline') - dataset.save_to_disk(data_dir / 'fingpt-headline-instruct') - - dataset = datasets.load_dataset('FinGPT/fingpt-finred-re') - dataset.save_to_disk(data_dir / 'fingpt-finred-re') - - dataset = datasets.load_dataset('FinGPT/fingpt-ner-cls') - dataset.save_to_disk(data_dir / 'fingpt-ner-cls') - - dataset = datasets.load_dataset('FinGPT/fingpt-fineval') - dataset.save_to_disk(data_dir / 'fingpt-fineval') - - dataset = datasets.load_dataset('FinGPT/fingpt-sentiment-cls') - dataset.save_to_disk(data_dir / 'fingpt-sentiment-cls') + for src, dest in DATASETS: + if Path(data_dir / dest).is_dir() and not no_cache: + print(f"Dataset found at {data_dir / dest}, skipping") + continue + dataset = datasets.load_dataset(*src) + dataset.save_to_disk(data_dir / dest) if __name__ == "__main__": - download() + parser = argparse.ArgumentParser() + parser.add_argument("--no_cache", default=False, required=False, 
action="store_true", help="Redownload all datasets even if they already exist on disk") + + args = parser.parse_args() + download(no_cache=args.no_cache)