Skip to content

Commit

Permalink
minor code cleanup
Browse files: browse the repository at this point in the history
  • Loading branch information
barrust committed Feb 24, 2018
1 parent: d6987ab; commit: 09d850f
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 21 deletions.
1 change: 1 addition & 0 deletions spellchecker/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
''' SpellChecker Module '''
from . spellchecker import SpellChecker
from . info import (__author__, __maintainer__, __email__, __license__,
__version__, __credits__, __url__, __bugtrack_url__)
Expand Down
2 changes: 1 addition & 1 deletion spellchecker/info.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
''' Information '''
''' SpellChecker Information '''


__author__ = 'Tyler Barrus'
Expand Down
56 changes: 36 additions & 20 deletions spellchecker/spellchecker.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,69 @@
''' SpellChecker Module; simple, intuitive spell checker based on the post by
Peter Norvig. See: https://norvig.com/spell-correct.html '''
from __future__ import absolute_import

import os
import re
import string
from collections import Counter

import spellchecker.info as base # to get a relative file path!
import spellchecker.info as base


class SpellChecker(object):
''' The SpellChecker class encapsulates the basics needed to accomplish a
simple spell checking algorithm. It is based on the work by
Peter Norvig (https://norvig.com/spell-correct.html) '''

def __init__(self):
# Should allow passing in a different file
dirpath = os.path.dirname(base.__file__)
full_filename = os.path.join(dirpath, 'resources', 'old_books.txt')
self.dictionary = None
self.dictionary = Counter()
with open(full_filename) as fobj:
self.dictionary = Counter(self.words(fobj.read()))
self.dictionary.update(self.words(fobj.read()))
self.total_words = sum(self.dictionary.values())

def words(self, text):
@staticmethod
def words(text):
''' Parse the text into words; currently removes punctuation '''
return re.findall(r'\w+', text.lower())

def P(self, word, N=None):
"Probability of `word`."
if N is None:
N = sum(self.dictionary.values())
return self.dictionary[word] / N
def word_probability(self, word, total_words=None):
"Probability of `word` being the desired word"
if total_words is None:
total_words = self.total_words
return self.dictionary[word] / total_words

def correction(self, word):
"Most probable spelling correction for word."
return max(self.candidates(word), key=self.P)
return max(self.candidates(word), key=self.word_probability)

def candidates(self, word):
"Generate possible spelling corrections for word."
return (self.known([word]) or self.known(self.edit_distance_1(word)) or self.known(self.edit_distance_2(word)) or [word])
return (self.known([word]) or self.known(self.edit_distance_1(word)) or
self.known(self.edit_distance_2(word)) or [word])

def known(self, words):
"The subset of `words` that appear in the dictionary of WORDS."
return set(w for w in words if w in self.dictionary)

def edit_distance_1(self, word):
def unknown(self, words):
''' The subset of `words` that do not appear in the dictionary'''
return set(w for w in words if w not in self.dictionary)

@staticmethod
def edit_distance_1(word):
"All edits that are one edit away from `word`."
letters = 'abcdefghijklmnopqrstuvwxyz'
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
inserts = [L + c + R for L, R in splits for c in letters]
letters = string.ascii_lowercase
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)

def edit_distance_2(word):
def edit_distance_2(self, word):
"All edits that are two edits away from `word`."
return (e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1))
return (e2 for e1 in self.edit_distance_1(word)
for e2 in self.edit_distance_1(e1))

0 comments on commit 09d850f

Please sign in to comment.