Skip to content

Commit

Permalink
fix spellchecking punctuation and numbers
Browse files Browse the repository at this point in the history
  • Loading branch information
barrust committed Jul 9, 2018
1 parent 74fbc6b commit c12bd4d
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 4 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# pyspellchecker

## Version 0.1.3
* Better handle punctuation and numbers as the word to check

## Version 0.1.1
* Add support for language dictionaries
* English, Spanish, French, and German
Expand Down
2 changes: 1 addition & 1 deletion spellchecker/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
__maintainer__ = 'Tyler Barrus'
__email__ = '[email protected]'
__license__ = 'MIT'
__version__ = '0.1.2'
__version__ = '0.1.3'
__credits__ = ['Peter Norvig']
__url__ = 'https://github.com/barrust/pyspellchecker'
__bugtrack_url__ = '{0}/issues'.format(__url__)
22 changes: 19 additions & 3 deletions spellchecker/spellchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import re
import json
import gzip
import string
from collections import Counter


Expand Down Expand Up @@ -95,7 +96,7 @@ def candidates(self, word):
Returns:
set: The set of words that are possible candidates '''
return (self.known([word]) or self.known(self.edit_distance_1(word)) or
self.known(self.edit_distance_2(word)) or [word])
self.known(self.edit_distance_2(word)) or {word})

def known(self, words):
''' The subset of `words` that appear in the dictionary of words
Expand All @@ -106,7 +107,8 @@ def known(self, words):
Returns:
set: The set of those words from the input that are in the \
corpus '''
return set(w for w in words if w in self._word_frequency.dictionary)
return set(w for w in words if w in self._word_frequency.dictionary or
not self._check_if_should_check(w))

def unknown(self, words):
''' The subset of `words` that do not appear in the dictionary
Expand All @@ -117,7 +119,8 @@ def unknown(self, words):
Returns:
set: The set of those words from the input that are not in \
the corpus '''
return set(w for w in words if w not in self._word_frequency.dictionary)
tmp = [w for w in words if self._check_if_should_check(w)]
return set(w for w in tmp if w not in self._word_frequency.dictionary)

def edit_distance_1(self, word):
''' Compute all strings that are one edit away from `word` using only
Expand All @@ -128,6 +131,8 @@ def edit_distance_1(self, word):
Returns:
set: The set of strings that are edit distance one from the \
provided word '''
if self._check_if_should_check(word) is False:
return {word}
letters = self._word_frequency.letters
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
Expand All @@ -148,6 +153,17 @@ def edit_distance_2(self, word):
return (e2 for e1 in self.edit_distance_1(word)
for e2 in self.edit_distance_1(e1))

@staticmethod
def _check_if_should_check(word):
    ''' Decide whether `word` should be run through the spell checker

        Args:
            word (str): The word to test
        Returns:
            bool: False if `word` is a single punctuation character or \
            a numeric literal (int, float, etc); True otherwise '''
    if len(word) == 1 and word in string.punctuation:
        return False
    try:  # check if it is a number (int, float, etc)
        float(word)
    except ValueError:
        return True
    # float() also parses the alphabetic spellings "nan", "inf" and
    # "infinity" (any case, optional sign); those contain no digits and
    # must still be spell checked like any other word
    return not any(char.isdigit() for char in word)

class WordFrequency(object):
''' Store the `dictionary` as a word frequency list while allowing for
Expand Down
6 changes: 6 additions & 0 deletions tests/spellchecker_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ def test_correction(self):
self.assertEqual(spell.correction('ergo'), 'ergo')
self.assertEqual(spell.correction('alot'), 'a lot')
self.assertEqual(spell.correction('this'), 'this')
self.assertEqual(spell.correction('-'), '-')
self.assertEqual(spell.correction('1213'), '1213')
self.assertEqual(spell.correction('1213.9'), '1213.9')

def test_candidates(self):
''' test spell checker candidates '''
Expand All @@ -28,6 +31,7 @@ def test_candidates(self):
'whs', 'ghs', 'rhs', 'this'}
self.assertEqual(spell.candidates('ths'), cands)
self.assertEqual(spell.candidates('the'), {'the'})
self.assertEqual(spell.candidates('-'), {'-'})

def test_words(self):
''' test the parsing of words '''
Expand Down Expand Up @@ -56,6 +60,7 @@ def test_word_known(self):
self.assertEqual(spell.known(['sherlock']), {'sherlock'})
self.assertEqual(spell.known(['holmes']), {'holmes'})
self.assertEqual(spell.known(['known']), {'known'})
self.assertEqual(spell.known(['-']), {'-'})

self.assertEqual(spell.known(['foobar']), set())
self.assertEqual(spell.known(['ths']), set())
Expand All @@ -68,6 +73,7 @@ def test_unknown_words(self):
self.assertEqual(spell.unknown(['sherlock']), set())
self.assertEqual(spell.unknown(['holmes']), set())
self.assertEqual(spell.unknown(['known']), set())
self.assertEqual(spell.unknown(['-']), set())

self.assertEqual(spell.unknown(['foobar']), {'foobar'})
self.assertEqual(spell.unknown(['ths']), {'ths'})
Expand Down

0 comments on commit c12bd4d

Please sign in to comment.