
Commit

Merge pull request AI4Finance-Foundation#162 from InvestmentResearchAI/improve-benchmark-data-relative-imports

Implement relative importing of data
llk010502 authored Mar 16, 2024
2 parents 1dcab62 + d513481 commit 7d77243
Showing 15 changed files with 80 additions and 26 deletions.
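
The change replaces paths resolved against the current working directory (e.g. load_from_disk('../data/fingpt-convfinqa')) with paths resolved from each module's own location, so the benchmarks run regardless of where Python is launched. A minimal sketch of the pattern applied throughout the diff below (the dataset name is illustrative):

# Sketch of the module-relative path pattern used in the benchmark scripts.
# Assumes the layout FinGPT_Benchmark/benchmarks/<module>.py alongside FinGPT_Benchmark/data/.
from pathlib import Path
from datasets import load_from_disk

DATA_DIR = Path(__file__).parent.parent / 'data'                 # resolves next to the package, not the CWD
dataset = load_from_disk(DATA_DIR / 'fingpt-convfinqa')['test']  # same call works from any working directory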
3 changes: 3 additions & 0 deletions .gitignore
@@ -136,3 +136,6 @@ fingpt/FinGPT_sentiment/instruct-FinGPT/run.sh
fingpt/FinGPT_sentiment/instruct-FinGPT/checkpoints
fingpt/FinGPT_sentiment/instruct-FinGPT/ds_results_all_10_v2_1.*
FinGPT_Training_LoRA_with_Chatglm2_6b_for_beginners.ipynb

# Benchmark data
fingpt/FinGPT_Benchmark/data/*/**
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
include fingpt/FinGPT_Benchmark/benchmarks/sentiment_templates.txt
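
MANIFEST.in only controls which extra files enter a source distribution; for sentiment_templates.txt to also ship inside built wheels and installs, the packaging setup would typically need package data enabled. A hedged sketch, not part of this commit, of what that setup.py excerpt could look like:

# Hypothetical setup.py excerpt (assumption, not in this diff): tells setuptools
# to bundle in-package files declared in MANIFEST.in when building the project.
from setuptools import setup, find_packages

setup(
    name="fingpt",                 # assumed distribution name
    packages=find_packages(),
    include_package_data=True,     # honor MANIFEST.in entries inside packages
)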
2 changes: 2 additions & 0 deletions fingpt/FinGPT_Benchmark/__init__.py
@@ -0,0 +1,2 @@
from .data.download import download as download_datasets
from . import benchmarks
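
With these exports the package exposes a small top-level API. A usage sketch, assuming the repository is installed (or on the import path) so that fingpt.FinGPT_Benchmark resolves:

# Hypothetical usage of the new package-level exports.
from fingpt.FinGPT_Benchmark import download_datasets, benchmarks

download_datasets()        # fetch any missing benchmark datasets next to the library
print(benchmarks.fpb)      # individual benchmark modules are reachable as attributes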
3 changes: 3 additions & 0 deletions fingpt/FinGPT_Benchmark/benchmarks/__init__.py
@@ -0,0 +1,3 @@
from . import fpb, fiqa, finred, fineval, convfinqa, headline, ner, nwgi, tfns

__all__ = ["fpb", "fiqa", "finred", "fineval", "convfinqa", "headline", "ner", "nwgi", "tfns"]
9 changes: 4 additions & 5 deletions fingpt/FinGPT_Benchmark/benchmarks/convfinqa.py
@@ -8,9 +8,9 @@
import re
import sys
import numpy as np
from fingpt.FinGPT_Benchmark.utils import *
from pathlib import Path
sys.path.append('../')
from utils import *


def cvt_text_to_pred(text):
if not text:
@@ -32,9 +32,8 @@ def map_output(feature):
return {'label': label, 'pred': pred}


def test_convfinqa(args, model, tokenizer):

dataset = load_from_disk('../data/fingpt-convfinqa')['test']#.select(range(30))

dataset = load_from_disk(Path(__file__).parent.parent / 'data/fingpt-convfinqa')['test']
dataset = dataset.map(partial(test_mapping, args), load_from_cache_file=False)

def collate_fn(batch):
5 changes: 3 additions & 2 deletions fingpt/FinGPT_Benchmark/benchmarks/fineval.py
@@ -8,8 +8,9 @@
import re
import sys
import numpy as np
from fingpt.FinGPT_Benchmark.utils import *
from pathlib import Path
sys.path.append('../')
from utils import *


def cvt_text_to_pred(text):
@@ -33,7 +34,7 @@ def map_output(feature):

def test_fineval(args, model, tokenizer):

dataset = load_from_disk('../data/fingpt-fineval')['test']#.select(range(30))
dataset = load_from_disk(Path(__file__).parent.parent / 'data/fingpt-fineval')['test']
dataset = dataset.map(partial(test_mapping, args), load_from_cache_file=False)

def collate_fn(batch):
5 changes: 3 additions & 2 deletions fingpt/FinGPT_Benchmark/benchmarks/finred.py
@@ -8,8 +8,9 @@
import re
import sys
import numpy as np
from fingpt.FinGPT_Benchmark.utils import *
from pathlib import Path
sys.path.append('../')
from utils import *


relations = [
@@ -102,7 +103,7 @@ def calc_metric(gt_list, pred_list):

def test_re(args, model, tokenizer):

dataset = load_from_disk('../data/fingpt-finred-re')['test']#.select(range(50))
dataset = load_from_disk(Path(__file__).parent.parent / 'data/fingpt-finred-re')['test']
dataset = dataset.train_test_split(0.2, seed=42)['test']
dataset = dataset.map(partial(test_mapping, args), load_from_cache_file=False)

7 changes: 4 additions & 3 deletions fingpt/FinGPT_Benchmark/benchmarks/fiqa.py
@@ -9,9 +9,10 @@

from torch.utils.data import DataLoader
from functools import partial
from pathlib import Path


with open('sentiment_templates.txt') as f:
with open(Path(__file__).parent / 'sentiment_templates.txt') as f:
templates = [l.strip() for l in f.readlines()]


@@ -58,7 +59,7 @@ def vote_output(x):
def test_fiqa(args, model, tokenizer, prompt_fun=add_instructions):
batch_size = args.batch_size
# dataset = load_dataset('pauri32/fiqa-2018')
dataset = load_from_disk('../data/fiqa-2018/')
dataset = load_from_disk(Path(__file__).parent.parent / 'data/fiqa-2018/')
dataset = datasets.concatenate_datasets([dataset["train"], dataset["validation"] ,dataset["test"] ])
dataset = dataset.train_test_split(0.226, seed = 42)['test']
dataset = dataset.to_pandas()
@@ -112,7 +113,7 @@ def test_fiqa(args, model, tokenizer, prompt_fun=add_instructions):
def test_fiqa_mlt(args, model, tokenizer):
batch_size = args.batch_size
# dataset = load_dataset('pauri32/fiqa-2018')
dataset = load_from_disk('../data/fiqa-2018/')
dataset = load_from_disk(Path(__file__).parent.parent / 'data/fiqa-2018/')
dataset = datasets.concatenate_datasets([dataset["train"], dataset["validation"] ,dataset["test"] ])
dataset = dataset.train_test_split(0.226, seed=42)['test']
dataset = dataset.to_pandas()
9 changes: 5 additions & 4 deletions fingpt/FinGPT_Benchmark/benchmarks/fpb.py
@@ -9,14 +9,15 @@

from torch.utils.data import DataLoader
from functools import partial
from pathlib import Path

dic = {
0:"negative",
1:'neutral',
2:'positive',
}

with open('sentiment_templates.txt') as f:
with open(Path(__file__).parent / 'sentiment_templates.txt') as f:
templates = [l.strip() for l in f.readlines()]


@@ -52,7 +53,7 @@ def vote_output(x):
def test_fpb(args, model, tokenizer, prompt_fun=None):
batch_size = args.batch_size
# instructions = load_dataset("financial_phrasebank", "sentences_50agree")
instructions = load_from_disk("../data/financial_phrasebank-sentences_50agree/")
instructions = load_from_disk(Path(__file__).parent.parent / "data/financial_phrasebank-sentences_50agree/")
instructions = instructions["train"]
instructions = instructions.train_test_split(seed = 42)['test']
instructions = instructions.to_pandas()
@@ -105,8 +106,8 @@ def test_fpb(args, model, tokenizer, prompt_fun=None):

def test_fpb_mlt(args, model, tokenizer):
batch_size = args.batch_size
# instructions = load_dataset("financial_phrasebank", "sentences_50agree")
dataset = load_from_disk('../data/financial_phrasebank-sentences_50agree/')
# dataset = load_dataset("financial_phrasebank", "sentences_50agree")
dataset = load_from_disk(Path(__file__).parent.parent / 'data/financial_phrasebank-sentences_50agree/')
dataset = dataset["train"]#.select(range(300))
dataset = dataset.train_test_split(seed=42)['test']
dataset = dataset.to_pandas()
8 changes: 4 additions & 4 deletions fingpt/FinGPT_Benchmark/benchmarks/headline.py
@@ -5,11 +5,11 @@
import torch
from torch.utils.data import DataLoader
from functools import partial

from pathlib import Path
from fingpt.FinGPT_Benchmark.utils import *

import sys
sys.path.append('../')
from utils import *



@@ -34,8 +34,8 @@ def map_output(feature):

def test_headline(args, model, tokenizer):

# dataset = load_from_disk('../data/fingpt-headline')['test']#.select(range(300))
dataset = load_from_disk('../data/fingpt-headline-instruct')['test']#.select(range(300))
# dataset = load_from_disk('../data/fingpt-headline')['test']
dataset = load_from_disk(Path(__file__).parent.parent / 'data/fingpt-headline-instruct')['test']
dataset = dataset.map(partial(test_mapping, args), load_from_cache_file=False)

def collate_fn(batch):
6 changes: 3 additions & 3 deletions fingpt/FinGPT_Benchmark/benchmarks/ner.py
@@ -8,9 +8,9 @@
import re
import sys
import numpy as np
from fingpt.FinGPT_Benchmark.utils import *
from pathlib import Path
sys.path.append('../')
from utils import *


ent_dict = {
'PER': 'person',
@@ -53,7 +53,7 @@ def map_output(feature):

def test_ner(args, model, tokenizer):

dataset = load_from_disk('../data/fingpt-ner')['test']#.select(range(30))
dataset = load_from_disk(Path(__file__).parent.parent / 'data/fingpt-ner')['test']
dataset = dataset.map(partial(test_mapping, args), load_from_cache_file=False)

def collate_fn(batch):
4 changes: 2 additions & 2 deletions fingpt/FinGPT_Benchmark/benchmarks/nwgi.py
@@ -6,6 +6,7 @@
from tqdm import tqdm
import datasets
import torch
from pathlib import Path

dic = {
'strong negative':"negative",
@@ -36,8 +37,7 @@ def change_target(x):
def test_nwgi(args, model, tokenizer, prompt_fun=None):
batch_size = args.batch_size
# dataset = load_dataset('oliverwang15/news_with_gpt_instructions')
dataset = load_from_disk('../data/news_with_gpt_instructions/')
dataset = dataset['test'].to_pandas()
dataset = load_from_disk(Path(__file__).parent.parent / 'data/news_with_gpt_instructions/')['test'].to_pandas()
dataset['output'] = dataset['label'].apply(lambda x:dic[x])

if prompt_fun is None:
3 changes: 2 additions & 1 deletion fingpt/FinGPT_Benchmark/benchmarks/tfns.py
@@ -6,6 +6,7 @@
from tqdm import tqdm
import datasets
import torch
from pathlib import Path

dic = {
0:"negative",
@@ -32,7 +33,7 @@ def change_target(x):
def test_tfns(args, model, tokenizer, prompt_fun=None):
batch_size = args.batch_size
# dataset = load_dataset('zeroshot/twitter-financial-news-sentiment')
dataset = load_from_disk('../data/twitter-financial-news-sentiment')
dataset = load_from_disk(Path(__file__).parent.parent / 'data/twitter-financial-news-sentiment')
dataset = dataset['validation']
dataset = dataset.to_pandas()
dataset['label'] = dataset['label'].apply(lambda x:dic[x])
Empty file.
41 changes: 41 additions & 0 deletions fingpt/FinGPT_Benchmark/data/download.py
@@ -0,0 +1,41 @@
import datasets
from pathlib import Path
import argparse

DATASETS = [
# source, destination
(('pauri32/fiqa-2018', None), 'fiqa-2018'),
(('FinGPT/fingpt-finred', None), 'fingpt-finred'),
(('zeroshot/twitter-financial-news-sentiment', None), 'twitter-financial-news-sentiment'),
(('oliverwang15/news_with_gpt_instructions', None), 'news_with_gpt_instructions'),
(('financial_phrasebank', 'sentences_50agree'), 'financial_phrasebank-sentences_50agree'),
(('FinGPT/fingpt-fiqa_qa', None), 'fingpt-fiqa_qa'),
(('FinGPT/fingpt-headline-cls', None), 'fingpt-headline-cls'),
(('FinGPT/fingpt-finred', None), 'fingpt-finred'),
(('FinGPT/fingpt-convfinqa', None), 'fingpt-convfinqa'),
(('FinGPT/fingpt-finred-cls', None), 'fingpt-finred-cls'),
(('FinGPT/fingpt-ner', None), 'fingpt-ner'),
(('FinGPT/fingpt-headline', None), 'fingpt-headline-instruct'),
(('FinGPT/fingpt-finred-re', None), 'fingpt-finred-re'),
(('FinGPT/fingpt-ner-cls', None), 'fingpt-ner-cls'),
(('FinGPT/fingpt-fineval', None), 'fingpt-fineval'),
(('FinGPT/fingpt-sentiment-cls', None), 'fingpt-sentiment-cls'),
]

def download(no_cache: bool = False):
"""Downloads all datasets to where the FinGPT library is located."""
data_dir = Path(__file__).parent

for src, dest in DATASETS:
if Path(data_dir / dest).is_dir() and not no_cache:
print(f"Dataset found at {data_dir / dest}, skipping")
continue
dataset = datasets.load_dataset(*src)
dataset.save_to_disk(data_dir / dest)

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--no_cache", action="store_true", help="Re-download all datasets even if they are already present locally")

args = parser.parse_args()
download(no_cache=args.no_cache)
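
For reference, a hedged usage sketch of the new download module; the module-style invocation assumes the repository root is on the import path and that fingpt/FinGPT_Benchmark/data/ is a package:

# Hypothetical usage of fingpt/FinGPT_Benchmark/data/download.py.
#   python -m fingpt.FinGPT_Benchmark.data.download     # download only the datasets that are missing
from fingpt.FinGPT_Benchmark.data.download import download

download()                 # skips any dataset already saved under data/
download(no_cache=True)    # re-downloads everything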
