further changes in anli dataset
tlatkowski committed Nov 6, 2019
1 parent 71c5b78 commit 017a68b
Showing 4 changed files with 54 additions and 26 deletions.
40 changes: 29 additions & 11 deletions data/anli.py
@@ -1,16 +1,34 @@
import pandas as pd
import os

import jsonlines
import pandas as pd

from data import dataset


class ANLIDataset(dataset.DatasetExperiment):

def __init__(self, *args):
super().__init__(*args)
with jsonlines.open('./corpora/anli_v0.1/R3/test.jsonl') as jsonl_reader:
for o in jsonl_reader:
print(o)
self.hypothesis = []
self.reason = []
self.label = []
with jsonlines.open(os.path.join(self.data_dir, 'train.jsonl')) as jsonl_reader:
for instance in jsonl_reader:
self.hypothesis.append(instance['hypothesis'])
self.reason.append(instance['reason'])
self.label.append(instance['label'])

dataset = pd.DataFrame(
list(
zip(
self.hypothesis,
self.reason,
self.label,
)
),
columns=['hypothesis', 'reason', 'label']
)
num_instances = len(dataset)
self.num_train = num_instances * (1 - self.dev_ratio - self.test_ratio)
self.num_dev = num_instances * self.dev_ratio
@@ -24,28 +42,28 @@ def train_set(self):
return self.train

def train_set_pairs(self):
return self.train[['question1', 'question2']].as_matrix()
return self.train[['hypothesis', 'reason']].as_matrix()

def train_labels(self):
return self.train['is_duplicate'].as_matrix()
return pd.get_dummies(self.train['label']).as_matrix()

def dev_set(self):
return self.dev

def dev_set_pairs(self):
return self.dev[['question1', 'question2']].as_matrix()
return self.dev[['hypothesis', 'reason']].as_matrix()

def dev_labels(self):
return self.dev['is_duplicate'].as_matrix()
return self.dev['label'].as_matrix()

def test_set(self):
return self.test

def test_set_pairs(self):
return self.test[['question1', 'question2']].as_matrix()
return self.test[['hypothesis', 'reason']].as_matrix()

def test_labels(self):
return self.test['is_duplicate'].as_matrix()
return self.test['label'].as_matrix()

def _data_path(self):
return 'corpora/QQP/'
return 'corpora/ANLI/R3'
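
Note on this file: the rewritten ANLIDataset constructor reads train.jsonl from corpora/ANLI/R3, collects the hypothesis, reason, and label fields, and assembles them into a three-column DataFrame; the accessors switch from the QQP columns (question1/question2, is_duplicate) to hypothesis/reason, and train_labels() one-hot encodes the label column with pd.get_dummies. (The local name dataset for the DataFrame shadows the imported data.dataset module inside __init__, which works but is easy to misread.) Below is a minimal standalone sketch of the same loading flow, with the path assumed; since .as_matrix() was removed in pandas 1.0, the sketch calls .to_numpy() instead.

import jsonlines
import pandas as pd

# Sketch only: mirror the new constructor's JSONL -> DataFrame flow.
hypothesis, reason, label = [], [], []
with jsonlines.open('corpora/ANLI/R3/train.jsonl') as jsonl_reader:  # path assumed
    for instance in jsonl_reader:
        hypothesis.append(instance['hypothesis'])
        reason.append(instance['reason'])
        label.append(instance['label'])

df = pd.DataFrame(
    list(zip(hypothesis, reason, label)),
    columns=['hypothesis', 'reason', 'label'],
)

pairs = df[['hypothesis', 'reason']].to_numpy()   # modern equivalent of .as_matrix()
labels = pd.get_dummies(df['label']).to_numpy()   # one-hot labels, as in train_labels()
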
20 changes: 11 additions & 9 deletions data/dataset.py
@@ -79,12 +79,14 @@ def pick_train_mini_batch(self):
train_idxs = np.arange(len(self._train_labels))
np.random.shuffle(train_idxs)
train_idxs = train_idxs[:self.num_dev_instances()]
return self.train_sen1[train_idxs], self.train_sen2[train_idxs], self._train_labels[
train_idxs]

def __str__(self):
return 'Dataset properties:\n ' \
'Number of training instances: {}\n ' \
'Number of dev instances: {}\n ' \
'Number of test instances: {}\n' \
.format(len(self._train_labels), len(self._dev_labels), len(self._test_labels))
mini_train1 = self.train_sen1[train_idxs]
mini_train2 = self.train_sen2[train_idxs]
mini_labels = self._train_labels[train_idxs]
return mini_train1, mini_train2, mini_labels

def __str__(self):
return 'Dataset properties:\n ' \
'Number of training instances: {}\n ' \
'Number of dev instances: {}\n ' \
'Number of test instances: {}\n' \
.format(len(self._train_labels), len(self._dev_labels), len(self._test_labels))
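
Note on this file: the change to pick_train_mini_batch is a pure refactor, assigning the three index-selected slices to named variables before returning them; the __str__ method appears twice only because the diff shows both sides of the hunk, and its body reads the same in both. A self-contained sketch of the same index-shuffling mini-batch scheme, with array and parameter names invented for illustration:

import numpy as np

def pick_mini_batch(sen1, sen2, labels, k):
    # Sketch: sample k examples by shuffling an index array, as pick_train_mini_batch does.
    idxs = np.arange(len(labels))
    np.random.shuffle(idxs)      # in-place shuffle of the indices
    idxs = idxs[:k]              # keep the first k shuffled indices
    return sen1[idxs], sen2[idxs], labels[idxs]

# Example: pick_mini_batch(np.arange(10), np.arange(10) * 2, np.zeros(10), k=3)
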
10 changes: 8 additions & 2 deletions run.py
@@ -15,6 +15,8 @@
from utils.model_saver import ModelSaver
from utils.other_utils import timer, set_visible_gpu, init_config

log = tf.logging.info


def train(main_config, model_config, model_name, experiment_name, dataset_name):
main_cfg = MainConfig(main_config)
@@ -38,8 +40,10 @@ def train(main_config, model_config, model_name, experiment_name, dataset_name):
num_batches = dataset_helper.num_batches
model = model(max_sentence_len, vocabulary_size, main_config, model_config)
model_saver = ModelSaver(main_cfg.model_dir, experiment_name, main_cfg.checkpoints_to_keep)
config = tf.ConfigProto(allow_soft_placement=True,
log_device_placement=main_cfg.log_device_placement)
config = tf.ConfigProto(
allow_soft_placement=True,
log_device_placement=main_cfg.log_device_placement,
)

with tf.Session(config=config) as session:
global_step = 0
@@ -50,6 +54,8 @@ def train(main_config, model_config, model_name, experiment_name, dataset_name):

metrics = {'acc': 0.0}
time_per_epoch = []

log('Training model for {} epochs'.format(main_cfg.num_epochs))
for epoch in tqdm(range(main_cfg.num_epochs), desc='Epochs'):
start_time = time.time()

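
Note on this file: run.py now aliases tf.logging.info as log, reflows the tf.ConfigProto call onto one argument per line, and logs how many epochs will be trained. A small sketch of that TF 1.x pattern with placeholder config values, not the repository's actual settings; note that INFO-level messages are typically hidden unless the verbosity is raised:

import tensorflow as tf

log = tf.logging.info
tf.logging.set_verbosity(tf.logging.INFO)   # otherwise info-level logs usually stay silent

config = tf.ConfigProto(
    allow_soft_placement=True,    # fall back to another device if an op has no kernel here
    log_device_placement=False,   # placeholder for main_cfg.log_device_placement
)

num_epochs = 5                    # placeholder for main_cfg.num_epochs
log('Training model for {} epochs'.format(num_epochs))

with tf.Session(config=config) as session:
    pass                          # graph construction and the epoch loop go here
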
10 changes: 6 additions & 4 deletions utils/model_evaluator.py
@@ -17,10 +17,12 @@ def _evaluate(self, x1, x2, labels, batch_size=100):
accuracy = 0.0
for batch in range(num_batches):
x1_batch, x2_batch, y_batch = batch_helper.next(batch)
feed_dict = {self._model.x1: x1_batch,
self._model.x2: x2_batch,
self._model.is_training: False,
self._model.labels: y_batch}
feed_dict = {
self._model.x1: x1_batch,
self._model.x2: x2_batch,
self._model.is_training: False,
self._model.labels: y_batch
}
accuracy += self._session.run(self._model.accuracy, feed_dict=feed_dict)
accuracy /= num_batches
return accuracy
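
Note on this file: the model_evaluator.py edit only reflows the feed_dict literal onto one key per line; behaviour is unchanged. One caveat about the surrounding loop worth recording: it returns the unweighted mean of per-batch accuracies, which equals the true accuracy only when every batch has the same size. A hedged sketch of a size-weighted variant, not part of this commit, with model, session, and batch_helper assumed to have the same interface as in the diff:

def evaluate_weighted(model, session, batch_helper, num_batches):
    # Sketch, not in the commit: weight each batch by its size so a smaller
    # final batch does not skew the average.
    correct, total = 0.0, 0
    for batch in range(num_batches):
        x1_batch, x2_batch, y_batch = batch_helper.next(batch)
        feed_dict = {
            model.x1: x1_batch,
            model.x2: x2_batch,
            model.is_training: False,
            model.labels: y_batch,
        }
        batch_acc = session.run(model.accuracy, feed_dict=feed_dict)
        correct += batch_acc * len(y_batch)   # convert accuracy back to a correct count
        total += len(y_batch)
    return correct / total
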
