thread safe tokenizer; transpose_y for dataloader
[email protected] authored and [email protected] committed Mar 14, 2018
1 parent 98fad0e commit 3e4b5a9
Showing 3 changed files with 26 additions and 27 deletions.
9 changes: 5 additions & 4 deletions fastai/dataloader.py
@@ -41,10 +41,11 @@ def get_tensor(batch, pin):
 
 class DataLoader(object):
     def __init__(self, dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None, pad_idx=0,
-                 num_workers=None, collate_fn=np_collate, pin_memory=False, drop_last=False, transpose=False):
+                 num_workers=None, collate_fn=np_collate, pin_memory=False, drop_last=False,
+                 transpose=False, transpose_y=False):
         self.dataset,self.batch_size,self.num_workers = dataset,batch_size,num_workers
         self.collate_fn,self.pin_memory,self.drop_last = collate_fn,pin_memory,drop_last
-        self.transpose,self.pad_idx = transpose,pad_idx
+        self.transpose,self.transpose_y,self.pad_idx = transpose,transpose_y,pad_idx
 
         if batch_sampler is not None:
             if batch_size > 1 or shuffle or sampler is not None or drop_last:
@@ -66,8 +67,8 @@ def __len__(self): return len(self.batch_sampler)
 
     def get_batch(self, indices):
         res = self.collate_fn([self.dataset[i] for i in indices], self.pad_idx)
-        if not self.transpose: return res
-        res[0] = res[0].T
+        if self.transpose:   res[0] = res[0].T
+        if self.transpose_y: res[1] = res[1].T
         return res
 
     def __iter__(self):
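What the new flag buys: with transpose, the padded text batch res[0] is flipped from batch-major to sequence-major layout for PyTorch RNNs; transpose_y now lets padded sequence targets (e.g. for seq2seq models) get the same treatment. A minimal sketch of the effect, assuming np_collate pads samples into a (batch_size, seq_len) array (the sample numbers are ours, not from the commit):

import numpy as np

# two padded sequences, batch-major: shape (2, 4)
batch = np.array([[1, 2, 3, 0],
                  [4, 5, 0, 0]])
print(batch.shape)    # (2, 4)
print(batch.T.shape)  # (4, 2) -- sequence-major, what get_batch returns
                      # for res[0] when transpose=True, and now for
                      # res[1] when transpose_y=True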
2 changes: 1 addition & 1 deletion fastai/model.py
@@ -80,7 +80,7 @@ def fit(model, data, epochs, opt, crit, metrics=None, callbacks=None, stepper=St
     for cb in callbacks: cb.on_train_begin()
     names = ["epoch", "trn_loss", "val_loss"] + [f.__name__ for f in metrics]
     layout = "{!s:10} " * len(names)
 
     num_batch = len(data.trn_dl)
     if epochs<1:
         num_batch = int(num_batch*epochs)
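For context, fit() accepts fractional epochs, and the lines above scale the batch count accordingly. A toy rendering of that arithmetic (the numbers are made up):

num_batch = 500                   # stand-in for len(data.trn_dl)
epochs = 0.25
if epochs < 1:
    num_batch = int(num_batch * epochs)
print(num_batch)                  # 125 -> train on a quarter of an epoch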
42 changes: 20 additions & 22 deletions fastai/text.py
@@ -5,15 +5,6 @@
 import spacy
 from spacy.symbols import ORTH
 
-re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
-def sub_br(x): return re_br.sub("\n", x)
-
-my_tok = spacy.load('en')
-my_tok.tokenizer.add_special_case('<eos>', [{ORTH: '<eos>'}])
-my_tok.tokenizer.add_special_case('<bos>', [{ORTH: '<bos>'}])
-my_tok.tokenizer.add_special_case('<unk>', [{ORTH: '<unk>'}])
-def spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(sub_br(x))]
-
 re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
 def tokenize(s): return re_tok.sub(r' \1 ', s).split()
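With the module-level spaCy globals gone, the module keeps only this regex-based fallback tokenizer, which pads listed punctuation with spaces and then splits on whitespace. A quick check of its behaviour (the input string is ours):

import re, string

re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

print(tokenize('Hello, world!'))  # ['Hello', ',', 'world', '!']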

@@ -25,16 +16,18 @@ def texts_labels_from_folders(path, folders):
         labels.append(idx)
     return texts, np.array(labels).astype(np.int64)
 
-#def texts_from_files(src, names):
-    #texts,labels = [],[]
-    #for idx,name in enumerate(names):
-        #path = os.path.join(src, name)
-        #t = [o.strip() for o in open(path, encoding = "ISO-8859-1")]
-        #texts += t
-        #labels += ([idx] * len(t))
-    #return texts,np.array(labels)
-
 class Tokenizer():
+    def __init__(self, lang='en'):
+        self.re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
+        self.tok = spacy.load(lang)
+        for w in ('<eos>','<bos>','<unk>'):
+            self.tok.tokenizer.add_special_case(w, [{ORTH: w}])
+
+    def sub_br(self,x): return self.re_br.sub("\n", x)
+
+    def spacy_tok(self,x):
+        return [t.text for t in self.tok.tokenizer(self.sub_br(x))]
+
     re_rep = re.compile(r'(\S)(\1{3,})')
     re_word_rep = re.compile(r'(\b\w+\W+)(\1{3,})')
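The two repetition patterns stay as class attributes: one flags a character repeated four or more times, the other a word repeated four or more times (the replacement helpers that consume them are outside this hunk). A rough check of what they match, with sample strings of ours:

import re

re_rep      = re.compile(r'(\S)(\1{3,})')        # char + 3 or more repeats
re_word_rep = re.compile(r'(\b\w+\W+)(\1{3,})')  # word + 3 or more repeats

m = re_rep.search('soooooo good')
print(m.group(1), len(m.group(2)) + 1)                        # o 6
print(bool(re_word_rep.search('very very very very nice')))  # True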

@@ -70,13 +63,18 @@ def proc_text(self, s):
         s = Tokenizer.do_caps(s)
         s = re.sub(r'([/#])', r' \1 ', s)
         s = re.sub(' {2,}', ' ', s)
-        return spacy_tok(s)
+        return self.spacy_tok(s)
 
-    def proc_all(self, ss): return [self.proc_text(s) for s in ss]
+    @staticmethod
+    def proc_all(ss, lang):
+        tok = Tokenizer(lang)
+        return [tok.proc_text(s) for s in ss]
 
-    def proc_all_mp(self, ss):
+    @staticmethod
+    def proc_all_mp(ss, lang='en'):
         ncpus = num_cpus()//2
-        with ProcessPoolExecutor(ncpus) as e: return sum(e.map(self.proc_all, ss), [])
+        with ProcessPoolExecutor(ncpus) as e:
+            return sum(e.map(Tokenizer.proc_all, ss, [lang]*len(ss)), [])
 
 
 class TextDataset(Dataset):
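The point of the rework: the old module-level my_tok spaCy pipeline was shared global state, and the old bound-method proc_all dragged its instance across ProcessPoolExecutor boundaries; making proc_all a staticmethod that builds a fresh Tokenizer inside each worker keeps the spaCy pipeline process-local, so only plain strings travel between processes. A usage sketch under that reading (the import path and the chunking of texts are our assumptions):

from fastai.text import Tokenizer

texts = ['First document.', 'Second document!']

tok = Tokenizer(lang='en')        # single process: build once, reuse
print(tok.proc_text(texts[0]))

chunks = [texts[:1], texts[1:]]   # proc_all_mp takes chunks, one per worker
print(Tokenizer.proc_all_mp(chunks, lang='en'))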
