Skip to content

Commit

Permalink
stemming ready for testing
Browse files Browse the repository at this point in the history
  • Loading branch information
ben-tinc committed Jan 30, 2017
1 parent 05af876 commit 4f84251
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
edited/
plaintext/
reference/
stemmed/
nltk_data/
__pycache__/
15 changes: 13 additions & 2 deletions tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,20 @@ def write_stemmed_plain_text(tree, filename):
stemmer = SnowballStemmer("german")
plaintext = etree.tostring(
tree.getroot().find("text"),
method="text", enocding="utf-8"
method="text", encoding="utf-8"
)
raise NotImplementedError("")
tokenizer = nltk.data.load("tokenizers/punkt/german.pickle")
sentences = tokenizer.tokenize(plaintext.decode())
tokens = []
for s in sentences:
t = nltk.tokenize.word_tokenize(s)
for word in t:
if not (word.strip().isspace() or
word.strip() in [",", ".", ";", "!", "?", "-", '"', "'", "``",
"''", ":", "(", ")", "–"]):
tokens.append(stemmer.stem(word))
with open(filename, "w") as f:
f.write(" ".join(tokens))


def write_xml(ps, tree, filename):
Expand Down

0 comments on commit 4f84251

Please sign in to comment.