Skip to content

Commit

Permalink
add char-level PennTreeBank dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
AKuzina committed Nov 22, 2021
1 parent f45fb8c commit 5c3e778
Show file tree
Hide file tree
Showing 12 changed files with 574 additions and 26 deletions.
Empty file added __init__.py
Empty file.
10 changes: 10 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def get_config():
# This parameter is automatically derived from the other parameters of the run. It specifies
# the path where the network parameters will be saved / loaded from.
report_auc=False,
report_ppl=False,
report_bpc=False,
max_epochs_no_improvement=100,
# --------------------------
# Parameters of TCNs / BFCNNs
Expand All @@ -81,6 +83,12 @@ def get_config():
# kernels. e.g., 32.
pool=False, # **Not used in our experiments -> Worse performance.**
# If True, it adds a max pool layer after each Residual Block.
emb_dropout=0.,
# Embedding dropout for the PennTreeBank dataset
emb_size=0,
# Size of the embedding
tied_weights=True,
# If True, use the same weight matrix for the encoder and the decoder
# --------------------------
# Parameters of SIREN
kernelnet_omega_0=0.0,
Expand Down Expand Up @@ -109,6 +117,8 @@ def get_config():
drop_rate=0,
# Specifies the rate at which data will be dropped from the original dataset. Used for experiments
# with missing data, e.g., 30, 50, 70.
# 6. PennTreeBank: valid sequence length. Seq_len = effective history + valid sequence length (only the second part is used for training)
valid_seq_len=0,
)
default_config = ml_collections.ConfigDict(default_config)
return default_config
64 changes: 46 additions & 18 deletions dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
SpeechCommands,
CharTrajectories,
PhysioNet,
PennTreeBankChar,
)

import ml_collections
Expand All @@ -30,15 +31,19 @@ def dataset_constructor(
"SpeechCommands": SpeechCommands,
"CharTrajectories": CharTrajectories,
"PhysioNet": PhysioNet,
'PennTreeBankChar': PennTreeBankChar,
}[config.dataset]

if config.dataset == 'PennTreeBankChar':
eval_batch_size = 10
training_set = dataset(
partition="train",
seq_length=config.seq_length,
memory_size=config.memory_size,
mfcc=config.mfcc,
sr=config.sr_train,
dropped_rate=config.drop_rate,
valid_seq_len=config.valid_seq_len,
batch_size=config.batch_size,
)
test_set = dataset(
partition="test",
Expand All @@ -49,15 +54,19 @@ def dataset_constructor(
if config.sr_test == 0
else config.sr_test, # Test set can be sample differently.
dropped_rate=config.drop_rate,
valid_seq_len=config.valid_seq_len,
batch_size=eval_batch_size,
)
if config.dataset in ["SpeechCommands", "CharTrajectories", "PhysioNet"]:
if config.dataset in ["SpeechCommands", "CharTrajectories", "PhysioNet", "PennTreeBankChar"]:
validation_set = dataset(
partition="val",
seq_length=config.seq_length,
memory_size=config.memory_size,
mfcc=config.mfcc,
sr=config.sr_train,
dropped_rate=config.drop_rate,
valid_seq_len=config.valid_seq_len,
batch_size=eval_batch_size,
)
else:
validation_set = None
Expand All @@ -74,29 +83,48 @@ def get_dataset(
:return: Tuple ( dict(train_loader, val_loader) , test_loader)
"""
training_set, validation_set, test_set = dataset_constructor(config)
if config.dataset in ["PennTreeBankChar"]:
with config.unlocked():
config.vocab_size = len(training_set.dictionary)
training_loader = torch.utils.data.DataLoader(
training_set,
batch_sampler=training_set.sampler,
num_workers=num_workers,
)
test_loader = torch.utils.data.DataLoader(
test_set,
batch_sampler=test_set.sampler,
num_workers=num_workers,
)

training_loader = torch.utils.data.DataLoader(
training_set,
batch_size=config.batch_size,
shuffle=True,
num_workers=num_workers,
)
test_loader = torch.utils.data.DataLoader(
test_set,
batch_size=config.batch_size,
shuffle=False,
num_workers=num_workers,
)

if validation_set is not None:
val_loader = torch.utils.data.DataLoader(
validation_set,
batch_sampler=validation_set.sampler,
num_workers=num_workers,
)
else:
training_loader = torch.utils.data.DataLoader(
training_set,
batch_size=config.batch_size,
shuffle=True,
num_workers=num_workers,
)
test_loader = torch.utils.data.DataLoader(
test_set,
batch_size=config.batch_size,
shuffle=False,
num_workers=num_workers,
)
else:
val_loader = test_loader

if validation_set is not None:
val_loader = torch.utils.data.DataLoader(
validation_set,
batch_size=config.batch_size,
shuffle=False,
num_workers=num_workers,
)
else:
val_loader = test_loader

dataloaders = {"train": training_loader, "validation": val_loader}

Expand Down
1 change: 1 addition & 0 deletions datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@
)
from .physionet import PhysioNet
from .char_trajectories import CharTrajectories
from .penn_tree_bank_char import PennTreeBankChar
142 changes: 142 additions & 0 deletions datasets/penn_tree_bank_char.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""
Adapted from https://github.com/locuslab/TCN/blob/master/TCN/
"""
import pickle
from collections import Counter
import os
import numpy as np
import torch
import pathlib
from .utils import load_data, save_data
import observations


class PennTreeBankChar(torch.utils.data.Dataset):
    """Character-level PennTreeBank language-modeling dataset.

    Adapted from https://github.com/locuslab/TCN/blob/master/TCN/.

    The corpus is downloaded via ``observations``, every character is mapped
    to an integer id, and the stream is folded into ``batch_size`` parallel
    sequences that are cut into overlapping (input, target) chunks. The
    encoded tensors and the character dictionary are cached on disk under
    ``./data/penn/preprocessed_data_char``.

    :param partition: one of ``"train"``, ``"val"``, ``"test"``.
    :param seq_length: total chunk length = effective history + valid part.
    :param valid_seq_len: step between chunks; the trailing part of each
        chunk that is actually used for the loss.
    :param batch_size: number of parallel character streams per batch.
    """

    def __init__(
        self,
        partition: str,  # original annotated `int`, but values are strings
        seq_length: int,
        valid_seq_len: int,
        batch_size: int,
        **kwargs,
    ):
        self.seq_len = seq_length
        self.valid_seq_len = valid_seq_len
        self.batch_size = batch_size
        self.root = pathlib.Path("./data")
        self.base_loc = self.root / "penn"
        data_loc = self.base_loc / "preprocessed_data_char"

        if os.path.exists(data_loc):
            # Cached preprocessing found on disk: only the dictionary must
            # be restored here; the tensors are read in load_data() below.
            with open(str(data_loc / 'dictionary_char'), 'rb') as f:
                self.dictionary = pickle.load(f)
        else:
            train, valid, test = self._process_data()
            # makedirs(exist_ok=True) also creates ./data/penn if missing
            # and is safe if another process created the folder meanwhile.
            os.makedirs(data_loc, exist_ok=True)
            with open(str(data_loc / 'dictionary_char'), 'wb') as f:
                pickle.dump(self.dictionary, f)
            save_data(
                data_loc,
                train=train,
                valid=valid,
                test=test,
            )

        self.X, self.y = self.load_data(data_loc, partition)
        # Shuffle chunk order only during training; evaluation is sequential.
        self.sampler = SequentialBatchSampler(self, shuffle=(partition == 'train'))
        super(PennTreeBankChar, self).__init__()

    def __getitem__(self, ind):
        # Flat index -> (chunk index b, row i within the batch dimension).
        b = ind // len(self.X[0])
        i = ind - b * len(self.X[0])
        return self.X[b][i], self.y[b][i]

    def __len__(self):
        # Total number of (input, target) rows across all chunks.
        return len(self.X[0]) * len(self.X)

    def create_seq(self, data, batch_size):
        """Fold `data` into `batch_size` parallel streams and cut them into
        overlapping (x, y) chunks of length `seq_len`, stepping by
        `valid_seq_len`. `y` is `x` shifted right by one character
        (next-character prediction targets).
        """
        nbatch = data.size(0) // batch_size
        data = data.narrow(0, 0, nbatch * batch_size).view(batch_size, -1)  ## crop tail
        x = []
        y = []
        L = data.shape[1]
        for i in range(0, L - 1, self.valid_seq_len):
            # Skip windows whose valid part would start past the last target.
            if i + self.seq_len - self.valid_seq_len >= L - 1:
                continue
            end = min(i + self.seq_len, L - 1)
            x.append(data[:, i:end].contiguous())
            y.append(data[:, i + 1:end + 1].contiguous())
        return x, y

    def _process_data(self):
        """Download the PTB corpus, build the char dictionary, encode it."""
        self.dictionary = Dictionary()
        train, test, valid = getattr(observations, 'ptb')(self.base_loc)
        # Join with spaces so corpus boundaries never fuse two characters
        # (the original used '' between test and valid by mistake).
        for c in train + ' ' + test + ' ' + valid:
            self.dictionary.add_word(c)
        self.dictionary.prep_dict()

        train = self._char_to_tensor(train)
        valid = self._char_to_tensor(valid)
        test = self._char_to_tensor(test)
        return train, valid, test

    def _char_to_tensor(self, string):
        """Encode a Python string as a LongTensor of character ids."""
        tensor = torch.zeros(len(string)).long()
        for i in range(len(string)):
            tensor[i] = self.dictionary.char2idx[string[i]]
        return tensor

    def load_data(self, data_loc, partition):
        """Load the cached tensors and chunk the requested partition."""
        tensors = load_data(data_loc)
        if partition == "train":
            data = tensors["train"]
        elif partition == "val":
            data = tensors["valid"]
        elif partition == "test":
            data = tensors["test"]
        else:
            # Bug fix: the original formatted the builtin `set` instead of
            # the offending partition name.
            raise NotImplementedError(
                "the set {} is not implemented.".format(partition)
            )
        X, y = self.create_seq(data, self.batch_size)
        return X, y


class Dictionary(object):
    """Character vocabulary: maps characters to contiguous integer ids."""

    def __init__(self):
        self.char2idx = {}
        self.idx2char = []
        self.counter = Counter()

    def add_word(self, word):
        # Only tally occurrences here; ids are assigned later by prep_dict().
        self.counter[word] += 1

    def prep_dict(self):
        """Assign ids in first-seen order to every counted character."""
        for char in self.counter:
            if char in self.char2idx:
                continue
            self.char2idx[char] = len(self.idx2char)
            self.idx2char.append(char)

    def __len__(self):
        return len(self.idx2char)


class SequentialBatchSampler(torch.utils.data.Sampler):
    """Yields index batches where each batch covers one pre-chunked sequence.

    Batch ``idx`` consists of the flat dataset indices of all rows of
    ``X[idx]``; with ``shuffle=True`` the chunk order is randomized each epoch.
    """

    def __init__(self, data_source, shuffle=True):
        super(SequentialBatchSampler, self).__init__(data_source)
        self.X = data_source.X
        # One batch per chunk; the chunk's first dim is the batch size.
        self.batch_size = self.X[0].shape[0]
        if shuffle:
            self.sampler = torch.utils.data.SubsetRandomSampler(np.arange(len(self.X)))
        else:
            self.sampler = np.arange(len(self.X))

    def __iter__(self):
        bs = self.batch_size
        for idx in self.sampler:
            first = idx * bs
            yield [first + j for j in range(bs)]

    def __len__(self):
        return len(self.X)
30 changes: 29 additions & 1 deletion model.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ def get_model(config):
in_channels = 1
elif config.dataset in ["PhysioNet"]:
in_channels = 75
elif config.dataset in ["PennTreeBankChar"]:
in_channels = config.emb_size
else:
raise NotImplementedError("Dataset {} not found.".format(config.dataset))

Expand Down Expand Up @@ -80,6 +82,15 @@ def get_model(config):
kernel_size=config.cnn_kernel_size,
dropout=config.dropout,
),
"PennTreeBankChar_TCN": lambda: models.PTB_TCN(
input_size=config.emb_size,
output_size=config.vocab_size,
num_channels=[config.no_hidden] * (config.no_blocks-1) + [config.emb_size],
kernel_size=config.cnn_kernel_size,
dropout=config.dropout,
emb_dropout=config.emb_dropout,
tied_weights=config.tied_weights,
),
"AddProblem_CKCNN": lambda: models.AddProblem_CKCNN(
in_channels=in_channels,
hidden_channels=config.no_hidden,
Expand Down Expand Up @@ -168,6 +179,23 @@ def get_model(config):
weight_dropout=config.weight_dropout,
pool=config.pool,
),
"PennTreeBankChar_CKCNN": lambda: models.seqText_CKCNN(
in_channels=in_channels,
out_channels=config.vocab_size,
hidden_channels=config.no_hidden,
num_blocks=config.no_blocks,
kernelnet_hidden_channels=config.kernelnet_no_hidden,
kernelnet_activation_function=config.kernelnet_activation_function,
kernelnet_norm_type=config.kernelnet_norm_type,
dim_linear=1,
bias=True,
omega_0=config.kernelnet_omega_0,
dropout=config.dropout,
weight_dropout=config.weight_dropout,
pool=config.pool,
emb_dropout=config.emb_dropout,
tied_weights=config.tied_weights
),
"CharTrajectories_CKCNN": lambda: models.seqImg_CKCNN(
in_channels=in_channels,
out_channels=20,
Expand All @@ -187,7 +215,7 @@ def get_model(config):

# print number parameters
print("Number of parameters:", ckconv.utils.num_params(model))
# wandb.run.summary["no_params"] = ckconv.utils.num_params(model)
wandb.run.summary["no_params"] = ckconv.utils.num_params(model)

# Check if multi-GPU available and if so, use the available GPU's
print("GPU's available:", torch.cuda.device_count())
Expand Down
4 changes: 2 additions & 2 deletions models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .tcn import AddProblem_TCN, CopyMemory_TCN, MNIST_TCN
from .ckcnn import CopyMemory_CKCNN, AddProblem_CKCNN, seqImg_CKCNN
from .tcn import AddProblem_TCN, CopyMemory_TCN, MNIST_TCN, PTB_TCN
from .ckcnn import CopyMemory_CKCNN, AddProblem_CKCNN, seqImg_CKCNN, seqText_CKCNN
from .bfcnn import seqImg_BFCNN
Loading

0 comments on commit 5c3e778

Please sign in to comment.