Skip to content

Commit

Permalink
feat: Support multiple repMap combinations (#4270)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jason3S committed Mar 3, 2023
1 parent 298f5ed commit bbc3ed4
Show file tree
Hide file tree
Showing 9 changed files with 208 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Args: ["**/*.{md,tex}"]
Summary:
files: 1407
filesWithIssues: 484
issues: 4887
issues: 4877
errors: 0
Errors: []

Expand Down Expand Up @@ -143,7 +143,6 @@ issues:
- "documents/Analysis%20III/Kapitel-1.tex:146:23 Borelsche U \\textbf{Borelsche $\\sigma$-Algebra} auf"
- "documents/Analysis%20III/Kapitel-1.tex:148:54 Borel U Mengen} oder \\textbf{Borel-Mengen}."
- "documents/Analysis%20III/Kapitel-1.tex:476:73 Subadditivität U {j})}\\) (\\(\\sigma\\)-Subadditivität)"
- "documents/Analysis%20III/Kapitel-1.tex:484:14 muesste U % Eigentlich muesste es in folgender Zeile"
- "documents/Analysis%20III/Kapitel-10.tex:164:21 Integrationen U Die Reihenfolge der Integrationen darf beliebig vertauscht"
- "documents/Analysis%20III/Kapitel-11.tex:8:9 Diffeomorphismus U \\textbf{Diffeomorphismus} genau dann wenn \\("
- "documents/Analysis%20III/Kapitel-12.tex:41:150 Tangentialvektor U )\\in\\mdr^n$ \\textbf{Tangentialvektor} von $\\gamma$ in $t"
Expand Down Expand Up @@ -696,7 +695,6 @@ issues:
- "documents/kit-agb/kit-agb.tex:43:16 Vertiefungsfach U $\\boxtimes$ Vertiefungsfach \\\\"
- "documents/kit-agb/kit-agb.tex:45:19 Vertiefungs U %% Namen des Wahl/Vertiefungs/Ergaenzungsfachs hier"
- "documents/kit-agb/kit-agb.tex:61:9 gehts U %% Hier gehts weiter:"
- "documents/kit-agb/kit-agb.tex:91:10 Pruefungsdauer U %% Bitte Pruefungsdauer eintragen"
- "documents/kit-agb/kit-agb.tex:102:18 nachgehackt U immer wieder nachgehackt."
- "documents/kit-agb/kit-agb.tex:124:56 Komilitonen U 2 Treffen à 5h mit Komilitonen."
- "documents/kit-agb/kit-agb.tex:177:72 Metacost U Daten. Es wird z.b. von Metacost"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Repository: MartinThoma/LaTeX-examples
Url: "https://github.com/MartinThoma/LaTeX-examples.git"
Args: ["**/*.{md,tex}"]
Lines:
CSpell: Files checked: 1407, Issues found: 4887 in 484 files
CSpell: Files checked: 1407, Issues found: 4877 in 484 files
exit code: 1
./README.md:12:35 - Unknown word (Wikpedia) -- can be found on [my Wikpedia Commons user page](http
./README.md:20:26 - Unknown word (seperate) -- Every LaTeX file is in a seperate folder and has its own
Expand Down Expand Up @@ -138,7 +138,6 @@ Lines:
./documents/Analysis III/Kapitel-1.tex:146:23 - Unknown word (Borelsche) -- \textbf{Borelsche $\sigma$-Algebra} auf
./documents/Analysis III/Kapitel-1.tex:148:54 - Unknown word (Borel) -- Mengen} oder \textbf{Borel-Mengen}.
./documents/Analysis III/Kapitel-1.tex:476:73 - Unknown word (Subadditivität) -- {j})}\) (\(\sigma\)-Subadditivität)
./documents/Analysis III/Kapitel-1.tex:484:14 - Unknown word (muesste) -- % Eigentlich muesste es in folgender Zeile
./documents/Analysis III/Kapitel-10.tex:164:21 - Unknown word (Integrationen) -- Die Reihenfolge der Integrationen darf beliebig vertauscht
./documents/Analysis III/Kapitel-11.tex:8:9 - Unknown word (Diffeomorphismus) -- \textbf{Diffeomorphismus} genau dann wenn \(
./documents/Analysis III/Kapitel-12.tex:41:150 - Unknown word (Tangentialvektor) -- )\in\mdr^n$ \textbf{Tangentialvektor} von $\gamma$ in $t
Expand Down Expand Up @@ -908,7 +907,6 @@ Lines:
./documents/kit-agb/kit-agb.tex:43:16 - Unknown word (Vertiefungsfach) -- $\boxtimes$ Vertiefungsfach \\
./documents/kit-agb/kit-agb.tex:45:19 - Unknown word (Vertiefungs) -- %% Namen des Wahl/Vertiefungs/Ergaenzungsfachs hier
./documents/kit-agb/kit-agb.tex:61:9 - Unknown word (gehts) -- %% Hier gehts weiter:
./documents/kit-agb/kit-agb.tex:91:10 - Unknown word (Pruefungsdauer) -- %% Bitte Pruefungsdauer eintragen
./documents/kit-mathe-template/README.md:8:44 - Unknown word (Kühnleins) -- template, sign it, add Dr. Kühnleins permission and give
./documents/kit-mathe-template/README.md:9:23 - Unknown word (Gheta) -- to ... hm ... [Dr. Gheta](http:https://www.informatik
./documents/kit-mathe-template/kit-mathe-template.tex:53:24 - Unknown word (MATHBAST) -- \item \texttt{[MATHBAST01]} Einführung in die
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,13 @@ export interface SpellingDictionary extends DictionaryInfo {
suggest(word: string, suggestOptions: SuggestOptions): SuggestionResult[];
genSuggestions(collector: SuggestionCollector, suggestOptions: SuggestOptions): void;
mapWord(word: string): string;
/**
* Generates all possible word combinations by applying `repMap`.
* This acts a bit like brace expansions in globs.
* @param word - the word to map
* @returns array of adjusted words.
*/
remapWord?: (word: string) => string[];
readonly size: number;
readonly isDictionaryCaseSensitive: boolean;
getErrors?(): Error[];
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,28 @@
import { createMapper } from '../util/repMap';
import { __testing__ } from './SpellingDictionaryFromTrie';
import { buildTrieFast } from 'cspell-trie-lib';

import { __testing__, SpellingDictionaryFromTrie } from './SpellingDictionaryFromTrie';

const { outerWordForms } = __testing__;

// cspell:ignore guenstig günstig Bundesstaat Bundeßtaat
// cspell:ignore Goerresstraße, Goerreßtraße, Görresstraße, Görreßtraße

describe('SpellingDictionaryFromTrie', () => {
test.each`
word | repMap | expected
${'hello'} | ${undefined} | ${['hello']}
${'guenstig'} | ${[['ae', 'ä'], ['oe', 'ö'], ['ue', 'ü'], ['ss', 'ß']]} | ${['guenstig', 'günstig']}
${'günstig'} | ${[['ae', 'ä'], ['oe', 'ö'], ['ue', 'ü'], ['ss', 'ß']]} | ${['günstig', 'günstig'.normalize('NFD')]}
${'Bundesstaat'} | ${[['ae', 'ä'], ['oe', 'ö'], ['ue', 'ü'], ['ss', 'ß']]} | ${['Bundesstaat', 'Bundeßtaat']}
word | repMap | expected
${'hello'} | ${undefined} | ${['hello']}
${'guenstig'} | ${[['ae', 'ä'], ['oe', 'ö'], ['ue', 'ü'], ['ss', 'ß']]} | ${['guenstig', 'günstig']}
${'günstig'} | ${[['ae', 'ä'], ['oe', 'ö'], ['ue', 'ü'], ['ss', 'ß']]} | ${['günstig', N('günstig')]}
${'Bundesstaat'} | ${[['ae', 'ä'], ['oe', 'ö'], ['ue', 'ü'], ['ss', 'ß']]} | ${['Bundesstaat', 'Bundeßtaat']}
${'Goerresstraße'} | ${[['ae', 'ä'], ['oe', 'ö'], ['ue', 'ü'], ['ss', 'ß']]} | ${['Goerresstraße', 'Goerreßtraße', 'Görresstraße', 'Görreßtraße']}
`('outerWordForms $word', ({ word, repMap, expected }) => {
const mapWord = createMapper(repMap);
expect(outerWordForms(word, mapWord ?? ((a) => a))).toEqual(new Set(expected));
const trie = buildTrieFast([]);
const dict = new SpellingDictionaryFromTrie(trie, 'test', { repMap });
const mapWord = dict.remapWord || ((a) => [dict.mapWord(a)]);
expect(outerWordForms(word, mapWord)).toEqual(new Set(expected));
});
});

/**
 * Test helper: return `s` in the requested Unicode normalization form.
 * @param s - the text to normalize
 * @param mode - normalization form, decomposed (`NFD`) unless stated otherwise
 * @returns the normalized text
 */
function N(s: string, mode: 'NFD' | 'NFC' = 'NFD') {
    const normalized = s.normalize(mode);
    return normalized;
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import { CompoundWordsMethod, importTrie, suggestionCollector, Trie } from 'cspe

import { autoCache, createCache01 } from '../util/AutoCache';
import { clean } from '../util/clean';
import { createMapper } from '../util/repMap';
import { createMapper, createRepMapper } from '../util/repMap';
import * as Defaults from './defaults';
import type {
FindResult,
Expand Down Expand Up @@ -39,6 +39,7 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary {
readonly knownWords = new Set<string>();
readonly unknownWords = new Set<string>();
readonly mapWord: (word: string) => string;
readonly remapWord: (word: string) => string[];
readonly type = 'SpellingDictionaryFromTrie';
readonly isDictionaryCaseSensitive: boolean;
readonly containsNoSuggestWords: boolean;
Expand All @@ -53,6 +54,7 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary {
size?: number
) {
this.mapWord = createMapper(options.repMap, options.dictionaryInformation?.ignore);
this.remapWord = createRepMapper(options.repMap, options.dictionaryInformation?.ignore);
this.isDictionaryCaseSensitive = options.caseSensitive ?? !trie.isLegacy;
this.containsNoSuggestWords = options.noSuggest || false;
this._size = size || 0;
Expand Down Expand Up @@ -109,7 +111,7 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary {
useCompounds: number | boolean | undefined,
ignoreCase: boolean
): FindAnyFormResult | undefined {
const outerForms = outerWordForms(word, this.mapWord);
const outerForms = outerWordForms(word, this.remapWord ? this.remapWord : (word) => [this.mapWord(word)]);

for (const form of outerForms) {
const r = this._findAnyForm(form, useCompounds, ignoreCase);
Expand Down Expand Up @@ -265,11 +267,11 @@ function findCache(fn: FindFunction, size = 2000): FindFunction {
return find;
}

function outerWordForms(word: string, mapWord: (word: string) => string): Set<string> {
function outerWordForms(word: string, mapWord: (word: string) => string[]): Set<string> {
const forms = pipe(
[word],
opConcatMap((word) => [word, word.normalize('NFC'), word.normalize('NFD')]),
opConcatMap((word) => [word, mapWord(word)])
opConcatMap((word) => [word, ...mapWord(word)])
);

return new Set(forms);
Expand Down
50 changes: 49 additions & 1 deletion packages/cspell-dictionary/src/util/repMap.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { __testing__, createMapper } from './repMap';

const { createMapperRegExp, charsetToRepMap } = __testing__;
const { createMapperRegExp, charsetToRepMap, createTrie, calcAllEdits, applyEdits } = __testing__;

describe('ReMap Tests', () => {
test('empty replace map', () => {
Expand Down Expand Up @@ -109,3 +109,51 @@ describe('ReMap Tests', () => {
expect(reg).toEqual(expected);
});
});

// Table-driven tests for the trie based replacement mapper helpers
// exposed through `__testing__`: `createTrie`, `calcAllEdits` and `applyEdits`.
describe('RepMapper', () => {
// createTrie: each row gives a replacement map plus an optional ignore
// character set and the trie structure that is expected to be built from them.
test.each`
repMap | ignoreChars | expected
${undefined} | ${undefined} | ${{}}
${[['a', 'b']]} | ${undefined} | ${{ children: { a: { rep: ['b'] } } }}
${[['a', 'b'], ['a', 'b']]} | ${undefined} | ${{ children: { a: { rep: ['b'] } } }}
${[['a', 'b'], ['a', 'c']]} | ${undefined} | ${{ children: { a: { rep: ['b', 'c'] } } }}
${[['a', 'b'], ['a', 'c']]} | ${'a'} | ${{ children: { a: { rep: ['b', 'c', ''] } } }}
${[['a', 'b'], ['a', 'c']]} | ${'i'} | ${{ children: { a: { rep: ['b', 'c'] }, i: { rep: [''] } } }}
${[['a', 'b'], ['a', 'c']]} | ${'i'} | ${{ children: { a: { rep: ['b', 'c'] }, i: { rep: [''] } } }}
${[['a|i', 'b'], ['a', 'c']]} | ${'i'} | ${{ children: { a: { rep: ['b', 'c'] }, i: { rep: ['b', ''] } } }}
`('createTrie', ({ repMap, ignoreChars, expected }) => {
expect(createTrie(repMap, ignoreChars)).toEqual(expected);
});

// calcAllEdits: each expected edit is `{ b, e, r }`, i.e. replace the
// characters of `word` from index `b` up to (not including) `e` with `r`.
test.each`
repMap | ignoreChars | word | expected
${undefined} | ${undefined} | ${'hello'} | ${[]}
${[['e', 'é']]} | ${undefined} | ${'hello'} | ${[{ b: 1, e: 2, r: 'é' }]}
${[['e', 'é'], ['o', 'ó']]} | ${undefined} | ${'hello'} | ${[{ b: 1, e: 2, r: 'é' }, { b: 4, e: 5, r: 'ó' }]}
${[['ll', 'y'], ['ll', 'el']]} | ${undefined} | ${'hello'} | ${[{ b: 2, e: 4, r: 'y' }, { b: 2, e: 4, r: 'el' }]}
${[['f', 'ph'], ['ph', 'f']]} | ${undefined} | ${'phone'} | ${[{ b: 0, e: 2, r: 'f' }]}
`('calcAllEdits', ({ repMap, ignoreChars, word, expected }) => {
const root = createTrie(repMap, ignoreChars);
expect(calcAllEdits(root, word)).toEqual(expected);
});

// applyEdits: the expected list starts with the unchanged word followed by
// every combination of the edits found for it.
// cspell:ignore héllo helló hélló heyo heelo fone phoné phöne
test.each`
repMap | ignoreChars | word | expected
${undefined} | ${undefined} | ${'hello'} | ${['hello']}
${[['e', 'é']]} | ${undefined} | ${'hello'} | ${['hello', 'héllo']}
${[['e', 'é'], ['o', 'ó']]} | ${undefined} | ${'hello'} | ${['hello', 'helló', 'héllo', 'hélló']}
${[['ll', 'y'], ['ll', 'el']]} | ${undefined} | ${'hello'} | ${['hello', 'heyo', 'heelo']}
${[['f', 'ph'], ['ph', 'f']]} | ${undefined} | ${'phone'} | ${['phone', 'fone']}
${[]} | ${'\u0300-\u0308'} | ${N('phoné')} | ${[N('phoné'), 'phone']}
${[]} | ${'\u0300-\u0308'} | ${N('phöne')} | ${[N('phöne'), 'phone']}
`('applyEdits', ({ repMap, ignoreChars, word, expected }) => {
const root = createTrie(repMap, ignoreChars);
const edits = calcAllEdits(root, word);
expect(applyEdits(word, edits)).toEqual(expected);
});
});

/**
 * Test helper: return `s` in the requested Unicode normalization form.
 * @param s - the text to normalize
 * @param mode - normalization form, decomposed (`NFD`) unless stated otherwise
 * @returns the normalized text
 */
function N(s: string, mode: 'NFD' | 'NFC' = 'NFD') {
    const normalized = s.normalize(mode);
    return normalized;
}
128 changes: 124 additions & 4 deletions packages/cspell-dictionary/src/util/repMap.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import type { CharacterSet, ReplaceMap } from '@cspell/cspell-types';
import type { CharacterSet, ReplaceEntry, ReplaceMap } from '@cspell/cspell-types';
import { expandCharacterSet } from 'cspell-trie-lib';

import { escapeRegEx } from './regexHelper';
import { isDefined } from './util';

export type ReplaceMapper = (src: string) => string;

export function createMapper(repMap: ReplaceMap | undefined, ignoreCharset?: string): ReplaceMapper {
if (!repMap && !ignoreCharset) return (a) => a;
repMap = repMap || [];
const charsetMap = charsetToRepMap(ignoreCharset);
const charsetMap = charsetToRepMapRegEx(ignoreCharset);
if (charsetMap) {
repMap = repMap.concat(charsetMap);
}
Expand All @@ -30,14 +32,29 @@ export function createMapper(repMap: ReplaceMap | undefined, ignoreCharset?: str
};
}

function charsetToRepMap(charset: CharacterSet | undefined, replaceWith = ''): ReplaceMap | undefined {
function charsetToRepMapRegEx(charset: CharacterSet | undefined, replaceWith = ''): ReplaceMap | undefined {
if (!charset) return undefined;

return charset
.split('|')
.map((chars) => `[${chars.replace(/[\][\\]/g, '\\$&')}]`)
.map((map) => [map, replaceWith]);
}
function charsetToRepMap(charset: undefined, replaceWith?: string): undefined;
function charsetToRepMap(charset: CharacterSet, replaceWith?: string): ReplaceMap;
function charsetToRepMap(charset: CharacterSet | undefined, replaceWith?: string): ReplaceMap | undefined;
/**
 * Turn a character set into a replacement map with one entry per character.
 *
 * The set is split on `|`; each part is expanded with `expandCharacterSet`
 * and every resulting character is paired with `replaceWith`.
 *
 * @param charset - the character set to expand; nothing is produced when it is empty or undefined.
 * @param replaceWith - the replacement used for every character, the empty string by default.
 * @returns the replacement entries, or `undefined` when no character set was given.
 */
function charsetToRepMap(charset: CharacterSet | undefined, replaceWith = ''): ReplaceMap | undefined {
    if (!charset) {
        return undefined;
    }

    const entries: ReplaceMap = [];
    for (const part of charset.split('|')) {
        for (const char of expandCharacterSet(part)) {
            entries.push([char, replaceWith]);
        }
    }
    return entries;
}

/**
 * Split `|`-separated match alternatives into individual entries.
 *
 * An entry such as `['a|i', 'b']` becomes `['a', 'b']` and `['i', 'b']`;
 * the order of entries and of alternatives is kept.
 *
 * @param repMap - replacement entries whose match side may list alternatives.
 * @returns a new map with exactly one match string per entry.
 */
function expandReplaceMap(repMap: ReplaceMap): ReplaceMap {
    const expanded: ReplaceMap = [];
    for (const [from, replaceWith] of repMap) {
        for (const alternative of from.split('|')) {
            expanded.push([alternative, replaceWith]);
        }
    }
    return expanded;
}

function createMapperRegExp(repMap: ReplaceMap): RegExp {
const filteredMap = repMap.filter(([match, _]) => !!match);
Expand Down Expand Up @@ -66,7 +83,110 @@ function createMapperRegExp(repMap: ReplaceMap): RegExp {
return regEx;
}

/**
 * A node in the trie of match strings built from a replacement map.
 * The path of characters from the root to a node spells a match string.
 */
interface RepTrieNode {
/** Replacement strings for the match string that ends at this node. */
rep?: string[];
/** Child nodes keyed by the next single character of the match string. */
children?: Record<string, RepTrieNode>;
}

/**
 * A single replacement that can be applied to a word.
 */
interface Edit {
/** Index in the word where the replaced span begins. */
b: number;
/** Index in the word just past the end of the replaced span. */
e: number;
/** Text that replaces the span from `b` up to `e`. */
r: string;
}

/**
 * Create a function that expands a word into its remapped forms.
 *
 * @param repMap - replacement entries; may be undefined.
 * @param ignoreCharset - optional set of characters that is added to the replacement trie.
 * @returns a function producing the list of forms for a word. When neither a
 *   map nor a character set is given, it returns just the word itself.
 */
export function createRepMapper(repMap: ReplaceMap | undefined, ignoreCharset?: string): (word: string) => string[] {
    const hasRules = Boolean(repMap) || Boolean(ignoreCharset);
    if (!hasRules) {
        return (word) => [word];
    }

    // Build the lookup trie once; it is shared by every call of the mapper.
    const root = createTrie(repMap, ignoreCharset);

    return function remap(word: string): string[] {
        return applyEdits(word, calcAllEdits(root, word));
    };
}

/**
 * Generate every distinct form of `word` that results from applying any
 * combination of the given edits.
 *
 * The word is processed from its end towards its start. For each position the
 * list of possible suffixes is built by prefixing the replacement text of
 * every edit starting there (including the implicit "keep this letter" edit)
 * to the suffixes already known for the position where that edit ends.
 *
 * Edits that do not consume at least one character or that reach outside the
 * word are ignored: a zero-width edit would refer to its own suffix list and
 * grow it without end, and an out-of-range edit has no position to attach to.
 *
 * @param word - the word to transform.
 * @param edits - the candidate edits, using begin (inclusive) / end (exclusive) indexes into `word`.
 * @returns the distinct forms; the unchanged word comes first.
 */
function applyEdits(word: string, edits: Edit[]): string[] {
    if (!edits.length) return [word];

    // One slot per letter, seeded with the identity edit that keeps the letter.
    const letterEdits: { edits: Edit[]; suffixes: string[] }[] = [];
    for (let i = 0; i < word.length; ++i) {
        letterEdits[i] = { edits: [{ b: i, e: i + 1, r: word[i] }], suffixes: [] };
    }
    // Sentinel slot past the last letter: the only suffix there is the empty string.
    letterEdits[word.length] = { edits: [], suffixes: [''] };

    // Attach each usable edit to the slot where it begins.
    for (const edit of edits) {
        const isUsable = edit.b >= 0 && edit.e > edit.b && edit.e <= word.length;
        if (!isUsable) continue;
        letterEdits[edit.b].edits.push(edit);
    }

    // Walk backwards so the suffixes at `edit.e` are complete before they are needed.
    for (let i = word.length - 1; i >= 0; --i) {
        const le = letterEdits[i];
        const sfx = le.suffixes;
        for (const edit of le.edits) {
            const pfx = edit.r;
            const nSfx = letterEdits[edit.e].suffixes;
            for (const s of nSfx) {
                sfx.push(pfx + s);
            }
        }
    }

    // Different edit combinations can produce the same text; keep each form once.
    return [...new Set(letterEdits[0].suffixes)];
}

/**
 * Find every replacement that matches somewhere in `word`.
 *
 * For each start position the trie is followed one character at a time for as
 * long as the word keeps matching; every node on the way that carries
 * replacements contributes one edit per replacement.
 *
 * @param root - root of the replacement trie.
 * @param word - the word to scan.
 * @returns the edits ordered by start position, then by match length.
 */
function calcAllEdits(root: RepTrieNode, word: string): Edit[] {
    const found: Edit[] = [];

    for (let start = 0; start < word.length; ++start) {
        let node: RepTrieNode | undefined = root;
        let end = start;
        while (node) {
            if (node.rep) {
                for (const r of node.rep) {
                    found.push({ b: start, e: end, r });
                }
            }
            // Stop at the end of the word or when the trie has no deeper matches.
            if (end === word.length || !node.children) break;
            node = node.children[word[end]];
            end += 1;
        }
    }

    return found;
}

/**
 * Build the replacement trie from a replacement map and an optional
 * character set.
 *
 * The entries of `repMap` come first, followed by the entries derived from
 * `ignoreCharset` via `charsetToRepMap`. `|`-separated alternatives are
 * expanded before the entries are inserted.
 *
 * @param repMap - replacement entries; may be undefined.
 * @param ignoreCharset - optional character set converted with `charsetToRepMap`.
 * @returns the root node of the trie; it has no properties when there is nothing to insert.
 */
function createTrie(repMap: ReplaceMap | undefined, ignoreCharset?: string): RepTrieNode {
    const sources = [repMap, charsetToRepMap(ignoreCharset)].filter(isDefined);

    let entries: ReplaceMap = [];
    for (const source of sources) {
        entries = entries.concat(source);
    }

    const trieRoot: RepTrieNode = Object.create(null);
    for (const [match, replaceWith] of expandReplaceMap(entries)) {
        addToTrie(trieRoot, match, replaceWith);
    }
    return trieRoot;
}

/**
 * Register `replaceWith` as a replacement for `match` in the trie below `node`.
 *
 * Nodes are created as needed, one per UTF-16 code unit of `match`, and the
 * replacement is added to the node that ends the match. A replacement that is
 * already present is not added twice.
 *
 * An empty `match` is ignored. Storing it would put a replacement on the
 * starting node itself, which matches zero characters at every position of
 * every word.
 *
 * @param node - the node to start from, normally the trie root.
 * @param match - the text to match.
 * @param replaceWith - the text to substitute for `match`.
 */
function addToTrie(node: RepTrieNode, match: string, replaceWith: string) {
    if (!match) return;

    for (let i = 0; i < match.length; ++i) {
        const children: Record<string, RepTrieNode> = node.children || (node.children = Object.create(null));
        const k = match[i];
        const childNode = children[k] || (children[k] = Object.create(null));
        node = childNode;
    }

    // Use a Set so a repeated entry does not add a duplicate; insertion order is kept.
    const s = new Set(node.rep || []);
    s.add(replaceWith);
    node.rep = [...s];
}

export const __testing__ = {
charsetToRepMap,
charsetToRepMap: charsetToRepMapRegEx,
createMapperRegExp,
createTrie,
calcAllEdits,
applyEdits,
};
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ exports[`Validate index.ts > Track changes to the API. 1`] = `
"defaultTrieOptions",
"editDistance",
"editDistanceWeighted",
"expandCharacterSet",
"findNode",
"has",
"hintedWalker",
Expand Down
1 change: 1 addition & 0 deletions packages/cspell-trie-lib/src/lib/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,6 @@ export { isDefined } from './utils/isDefined.js';
export { mergeDefaults } from './utils/mergeDefaults.js';
export { mergeOptionalWithDefaults } from './utils/mergeOptionalWithDefaults.js';
export { normalizeWord, normalizeWordForCaseInsensitive, normalizeWordToLowercase } from './utils/normalizeWord.js';
export { expandCharacterSet } from './utils/text.js';
export type { HintedWalkerIterator, Hinting, WalkerIterator, YieldResult } from './walker/index.js';
export { CompoundWordsMethod, hintedWalker, JOIN_SEPARATOR, walker, WORD_SEPARATOR } from './walker/index.js';

0 comments on commit bbc3ed4

Please sign in to comment.