Skip to content

Commit

Permalink
directly use Levenshtein.normalized_similarity
Browse files (browse the repository at this point in the history)
  • Loading branch information
maxbachmann authored Apr 13, 2022
1 parent 7f41822 commit e63a1e8
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions errant/en/classifier.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -214,7 +214,7 @@ def get_two_sided_type(o_toks, c_toks):
# Use string similarity to detect true spelling errors.
else:
# Normalised Lev distance works better than Lev ratio
- str_sim = 1-Levenshtein.normalized_distance(o_toks[0].lower_, c_toks[0].lower_)
+ str_sim = Levenshtein.normalized_similarity(o_toks[0].lower_, c_toks[0].lower_)
# WARNING: THIS IS AN APPROXIMATION.
# Thresholds tuned manually on FCE_train + W&I_train
# str_sim > 0.55 is almost always a true spelling error
Expand Down Expand Up @@ -328,7 +328,7 @@ def get_two_sided_type(o_toks, c_toks):
# These rules are quite language specific.
if o_toks[0].text.isalpha() and c_toks[0].text.isalpha():
# Normalised Lev distance works better than Lev ratio
- str_sim = 1-Levenshtein.normalized_distance(o_toks[0].lower_, c_toks[0].lower_)
+ str_sim = Levenshtein.normalized_similarity(o_toks[0].lower_, c_toks[0].lower_)
# WARNING: THIS IS AN APPROXIMATION.
# Thresholds tuned manually on FCE_train + W&I_train
# A. Short sequences are likely to be SPELL or function word errors
Expand Down

0 comments on commit e63a1e8

Please sign in to comment.