forked from apache/solr
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
LUCENE-9667: Hunspell: add spellchecker API, support BREAK and FORBIDDENWORD affix rules (apache#2207)
- Loading branch information
1 parent
a233ed2
commit 939699f
Showing
18 changed files
with
344 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
104 changes: 104 additions & 0 deletions
104
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.hunspell; | ||
|
||
import org.apache.lucene.util.BytesRef; | ||
|
||
/** | ||
* A spell checker based on Hunspell dictionaries. The objects of this class are not thread-safe | ||
* (but a single underlying Dictionary can be shared by multiple spell-checkers in different | ||
* threads). Not all Hunspell features are supported yet. | ||
*/ | ||
public class SpellChecker { | ||
private final Dictionary dictionary; | ||
private final BytesRef scratch = new BytesRef(); | ||
private final Stemmer stemmer; | ||
|
||
public SpellChecker(Dictionary dictionary) { | ||
this.dictionary = dictionary; | ||
stemmer = new Stemmer(dictionary); | ||
} | ||
|
||
/** @return whether the given word's spelling is considered correct according to Hunspell rules */ | ||
public boolean spell(String word) { | ||
char[] wordChars = word.toCharArray(); | ||
if (dictionary.isForbiddenWord(wordChars, scratch)) { | ||
return false; | ||
} | ||
|
||
if (!stemmer.stem(wordChars, word.length()).isEmpty()) { | ||
return true; | ||
} | ||
|
||
if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) { | ||
return tryBreaks(word); | ||
} | ||
|
||
return false; | ||
} | ||
|
||
private boolean tryBreaks(String word) { | ||
for (String br : dictionary.breaks.starting) { | ||
if (word.length() > br.length() && word.startsWith(br)) { | ||
if (spell(word.substring(br.length()))) { | ||
return true; | ||
} | ||
} | ||
} | ||
|
||
for (String br : dictionary.breaks.ending) { | ||
if (word.length() > br.length() && word.endsWith(br)) { | ||
if (spell(word.substring(0, word.length() - br.length()))) { | ||
return true; | ||
} | ||
} | ||
} | ||
|
||
for (String br : dictionary.breaks.middle) { | ||
int pos = word.indexOf(br); | ||
if (canBeBrokenAt(word, br, pos)) { | ||
return true; | ||
} | ||
|
||
// try to break at the second occurrence | ||
// to recognize dictionary words with a word break | ||
if (pos > 0 && canBeBrokenAt(word, br, word.indexOf(br, pos + 1))) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
private boolean hasTooManyBreakOccurrences(String word) { | ||
int occurrences = 0; | ||
for (String br : dictionary.breaks.middle) { | ||
int pos = 0; | ||
while ((pos = word.indexOf(br, pos)) >= 0) { | ||
if (++occurrences >= 10) return true; | ||
pos += br.length(); | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
private boolean canBeBrokenAt(String word, String breakStr, int breakPos) { | ||
return breakPos > 0 | ||
&& breakPos < word.length() - breakStr.length() | ||
&& spell(word.substring(0, breakPos)) | ||
&& spell(word.substring(breakPos + breakStr.length())); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
71 changes: 71 additions & 0 deletions
71
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.hunspell; | ||
|
||
import java.io.InputStream; | ||
import java.net.URL; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.Objects; | ||
import org.apache.lucene.store.ByteBuffersDirectory; | ||
import org.apache.lucene.util.IOUtils; | ||
|
||
public class SpellCheckerTest extends StemmerTestBase { | ||
|
||
public void testBreak() throws Exception { | ||
doTest("break"); | ||
} | ||
|
||
public void testBreakDefault() throws Exception { | ||
doTest("breakdefault"); | ||
} | ||
|
||
public void testBreakOff() throws Exception { | ||
doTest("breakoff"); | ||
} | ||
|
||
protected void doTest(String name) throws Exception { | ||
InputStream affixStream = | ||
Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name); | ||
InputStream dictStream = | ||
Objects.requireNonNull(getClass().getResourceAsStream(name + ".dic"), name); | ||
|
||
SpellChecker speller; | ||
try { | ||
Dictionary dictionary = | ||
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream); | ||
speller = new SpellChecker(dictionary); | ||
} finally { | ||
IOUtils.closeWhileHandlingException(affixStream); | ||
IOUtils.closeWhileHandlingException(dictStream); | ||
} | ||
|
||
URL good = StemmerTestBase.class.getResource(name + ".good"); | ||
if (good != null) { | ||
for (String word : Files.readAllLines(Path.of(good.toURI()))) { | ||
assertTrue("Unexpectedly considered misspelled: " + word, speller.spell(word)); | ||
} | ||
} | ||
|
||
URL wrong = StemmerTestBase.class.getResource(name + ".wrong"); | ||
if (wrong != null) { | ||
for (String word : Files.readAllLines(Path.of(wrong.toURI()))) { | ||
assertFalse("Unexpectedly considered correct: " + word, speller.spell(word)); | ||
} | ||
} | ||
} | ||
} |
10 changes: 10 additions & 0 deletions
10
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.aff
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# word break points test, recursive break at dash and n-dash | ||
SET UTF-8 | ||
|
||
BREAK 2 | ||
BREAK - | ||
BREAK – | ||
|
||
WORDCHARS -– | ||
|
||
FORBIDDENWORD ! |
7 changes: 7 additions & 0 deletions
7
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.dic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
6 | ||
foo | ||
bar | ||
baz | ||
fox-bax | ||
foo-baz/! | ||
12 changes: 12 additions & 0 deletions
12
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.good
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
foo | ||
bar | ||
fox-bax | ||
foo-bar | ||
foo–bar | ||
foo-bar-foo-bar | ||
foo-bar–foo-bar | ||
bar-baz | ||
baz-foo | ||
foo-bar-foo-bar-foo-bar-foo-bar-foo-bar | ||
e-mail-foo |
13 changes: 13 additions & 0 deletions
13
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/break.wrong
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
fox | ||
bax | ||
-foo | ||
bar- | ||
fox-bar | ||
foo-bax | ||
foo–bax | ||
fox–bar | ||
foo-bar-fox-bar | ||
foo-bax-foo-bar | ||
foo-bar–fox-bar | ||
foo-bax–foo-bar | ||
foo-baz |
Oops, something went wrong.