Skip to content

Commit

Permalink
hunspell: add Suggester#proceedPastRep to avoid losing relevant sugge…
Browse files Browse the repository at this point in the history
…stions (#13612)

* hunspell: add Suggester#proceedPastRep to avoid losing relevant suggestions
  • Loading branch information
donnerpeter committed Jul 27, 2024
1 parent 8d4f7a6 commit 481ca2d
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 9 deletions.
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,8 @@ API Changes

* GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov)

* GITHUB#13612: Hunspell: add Suggester#proceedPastRep to avoid losing relevant suggestions. (Peter Gromov)

New Features
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class ModifyingSuggester {
private final String misspelled;
private final WordCase wordCase;
private final FragmentChecker fragmentChecker;
private final boolean proceedPastRep;
private final char[] tryChars;
private final Hunspell speller;

Expand All @@ -39,13 +40,15 @@ class ModifyingSuggester {
LinkedHashSet<Suggestion> result,
String misspelled,
WordCase wordCase,
FragmentChecker checker) {
FragmentChecker checker,
boolean proceedPastRep) {
this.speller = speller;
tryChars = speller.dictionary.tryChars.toCharArray();
this.result = result;
this.misspelled = misspelled;
this.wordCase = wordCase;
fragmentChecker = checker;
this.proceedPastRep = proceedPastRep;
}

/**
Expand Down Expand Up @@ -125,9 +128,9 @@ private boolean tryVariationsOf(String word) {
boolean hasGoodSuggestions = trySuggestion(word.toUpperCase(Locale.ROOT));

GradedSuggestions repResult = tryRep(word);
if (repResult == GradedSuggestions.Best) return true;
if (repResult == GradedSuggestions.Best && !proceedPastRep) return true;

hasGoodSuggestions |= repResult == GradedSuggestions.Normal;
hasGoodSuggestions |= repResult != GradedSuggestions.None;

if (!speller.dictionary.mapTable.isEmpty()) {
enumerateMapReplacements(word, "", 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,21 @@ public class Suggester {
private final Dictionary dictionary;
private final SuggestibleEntryCache suggestibleCache;
private final FragmentChecker fragmentChecker;
private final boolean proceedPastRep;

public Suggester(Dictionary dictionary) {
this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE);
this(dictionary, null, FragmentChecker.EVERYTHING_POSSIBLE, false);
}

private Suggester(
Dictionary dictionary, SuggestibleEntryCache suggestibleCache, FragmentChecker checker) {
Dictionary dictionary,
SuggestibleEntryCache suggestibleCache,
FragmentChecker checker,
boolean proceedPastRep) {
this.dictionary = dictionary;
this.suggestibleCache = suggestibleCache;
this.fragmentChecker = checker;
this.proceedPastRep = proceedPastRep;
}

/**
Expand All @@ -71,16 +76,26 @@ private Suggester(
* entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
*/
public Suggester withSuggestibleEntryCache() {
return new Suggester(
dictionary, SuggestibleEntryCache.buildCache(dictionary.words), fragmentChecker);
SuggestibleEntryCache cache = SuggestibleEntryCache.buildCache(dictionary.words);
return new Suggester(dictionary, cache, fragmentChecker, proceedPastRep);
}

/**
* Returns a copy of this suggester instance with a {@link FragmentChecker} hint that can improve
* the performance of the "Modification" phase.
*/
public Suggester withFragmentChecker(FragmentChecker checker) {
return new Suggester(dictionary, suggestibleCache, checker);
return new Suggester(dictionary, suggestibleCache, checker, proceedPastRep);
}

/**
 * Creates a copy of this suggester that keeps looking for suggestions even after REP rules have
 * already produced acceptable words. By default, Hunspell stops as soon as REP rules yield any
 * acceptable word, which is not always desirable: e.g., given "REP i ea", "tims" would be
 * replaced only by "teams" and not by "times", which could also be meant.
 *
 * @return a new suggester with the same dictionary, suggestible-entry cache and fragment checker
 *     as this one, but with proceeding past REP suggestions enabled
 */
public Suggester proceedPastRep() {
  final boolean continuePastRep = true;
  return new Suggester(dictionary, suggestibleCache, fragmentChecker, continuePastRep);
}

/**
Expand Down Expand Up @@ -174,7 +189,8 @@ Root<CharsRef> findStem(
}

boolean hasGoodSuggestions =
new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase, fragmentChecker)
new ModifyingSuggester(
suggestionSpeller, suggestions, word, wordCase, fragmentChecker, proceedPastRep)
.suggest();

if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,14 @@ public void testAllcaps() throws Exception {

/**
 * Runs the standard "rep" fixture check, then compares the suggestions for "autos" from a default
 * suggester with those from a suggester created via {@code proceedPastRep()}.
 */
public void testRepSuggestions() throws Exception {
  doTest("rep");

  //noinspection DataFlowIssue
  Path affPath = Path.of(getClass().getResource("rep.aff").toURI());
  Dictionary repDictionary = TestAllDictionaries.loadDictionary(affPath);
  Suggester defaultSuggester = new Suggester(repDictionary);

  // Default suggester: only "auto's" is expected.
  List<String> expectedByDefault = List.of("auto's");
  assertEquals(expectedByDefault, defaultSuggester.suggestNoTimeout("autos", () -> {}));

  // After proceedPastRep(): "auto" is expected in addition to "auto's".
  List<String> expectedPastRep = List.of("auto's", "auto");
  Suggester pastRepSuggester = defaultSuggester.proceedPastRep();
  assertEquals(expectedPastRep, pastRepSuggester.suggestNoTimeout("autos", () -> {}));
}

public void testPhSuggestions() throws Exception {
Expand Down

0 comments on commit 481ca2d

Please sign in to comment.