Skip to content

Commit

Permalink
added block list features to js api
Browse files Browse the repository at this point in the history
  • Loading branch information
RicBent committed Jun 19, 2024
1 parent ef1b33f commit 9759afb
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 14 deletions.
90 changes: 86 additions & 4 deletions bindings/wasm/kiwi_wasm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,71 @@ int nextInstanceId() {
}


// Registry of morpheme sets (block lists) that were created through the API.
// Keys are the ids handed out by nextMorphemeSetId(); entries are inserted by
// kiwiCreateMorphemeSet and removed by kiwiDestroyMorphemeSet.
// NOTE(review): the registry is a single file-level map while the stored
// Morpheme pointers come from one particular Kiwi instance — presumably an id
// must only be used with the instance that created it; confirm.
static std::map<int, std::unordered_set<const Morpheme*>> morphemeSets;

// Hands out morpheme-set ids: 0 on the first call, then 1, 2, ... on each
// subsequent call. The counter lives for the whole program run.
int nextMorphemeSetId() {
    static int nextId = 0;
    const int issued = nextId;
    ++nextId;
    return issued;
}


template<typename T>
inline T getAtOrDefault(const json& args, size_t index, const T& defaultValue) {
return args.size() > index ? args.at(index).get<T>() : defaultValue;
}


inline std::unordered_set<const Morpheme*> parseMorphemeSet(const Kiwi& kiwi, const json& morphs) {
std::unordered_set<const Morpheme*> set;

for (const auto& morph : morphs) {
const std::string form8 = morph["form"];
const std::u16string form = utf8To16(form8);

POSTag tag = POSTag::unknown;
if (morph.contains("tag")) {
const std::string tagStr8 = morph["tag"];
const std::u16string tagStr = utf8To16(tagStr8);
tag = toPOSTag(tagStr);
}

auto matches = kiwi.findMorpheme(form, tag);
set.insert(matches.begin(), matches.end());
}

return set;
}


// Interprets one optional API argument as a block list.
//
// The argument at `index` may be
//   - an integer: the id of a set stored in `morphemeSets`,
//   - an array:   morpheme descriptions, parsed on the spot with
//                 parseMorphemeSet() and owned by this object,
//   - absent or anything else: no block list.
//
// setPtr() returns a pointer that is only valid while this object (for an
// inline array) or the registry entry (for an id) is alive, so keep the
// BlockListArg in scope for the duration of the call that uses the pointer.
class BlockListArg {
    std::unordered_set<const Morpheme*> tempSet;
    int blockListId;

public:
    BlockListArg(const Kiwi& kiwi, const json& args, size_t index) : blockListId(-1) {
        if (args.size() <= index) {
            return;
        }
        const auto& arg = args.at(index);
        if (arg.is_number_integer()) {
            blockListId = arg.get<int>();
        } else if (arg.is_array()) {
            tempSet = parseMorphemeSet(kiwi, arg);
        }
    }

    // Returns the set to block, or nullptr when there is nothing to block.
    const std::unordered_set<const Morpheme*>* setPtr() const {
        if (blockListId >= 0) {
            // Look the id up with find(): operator[] would silently insert an
            // empty set for an unknown or already destroyed id, growing the
            // registry on every such call. An unknown id means "no block
            // list", the same as an empty inline array.
            const auto found = morphemeSets.find(blockListId);
            if (found != morphemeSets.end()) {
                return &found->second;
            }
            return nullptr;
        }
        if (!tempSet.empty()) {
            return &tempSet;
        }
        return nullptr;
    }
};


inline json serializeTokenInfo(const TokenInfo& tokenInfo) {
return {
{ "str", utf16To8(tokenInfo.str) },
Expand Down Expand Up @@ -137,8 +196,9 @@ json kiwiIsTypoTolerant(Kiwi& kiwi, const json& args) {
// API method "analyze": analyses one string and returns the serialized
// TokenResult.
//   args[0] - text to analyse
//   args[1] - optional match options; Match::allWithNormalizing when omitted
//   args[2] - optional block list: a morpheme-set id or an array of morpheme
//             descriptions (see BlockListArg)
json kiwiAnalyze(Kiwi& kiwi, const json& args) {
const std::string str = args[0];
const Match matchOptions = getAtOrDefault(args, 1, Match::allWithNormalizing);
// Must outlive the analyze() call: setPtr() may point into this object.
const BlockListArg blockListArg(kiwi, args, 2);

// NOTE(review): this text is a rendered diff — the two declarations below are
// presumably the removed and the added version of the same statement; only the
// one that passes blockListArg.setPtr() should remain. Confirm before compiling.
const TokenResult tokenResult = kiwi.analyze(str, (Match)matchOptions);
const TokenResult tokenResult = kiwi.analyze(str, (Match)matchOptions, blockListArg.setPtr());

return serializeTokenResult(tokenResult);
}
Expand All @@ -147,17 +207,19 @@ json kiwiAnalyzeTopN(Kiwi& kiwi, const json& args) {
const std::string str = args[0];
const int topN = args[1];
const Match matchOptions = getAtOrDefault(args, 2, Match::allWithNormalizing);
const BlockListArg blockListArg(kiwi, args, 3);

const std::vector<TokenResult> tokenResults = kiwi.analyze(str, topN, matchOptions);
const std::vector<TokenResult> tokenResults = kiwi.analyze(str, topN, matchOptions, blockListArg.setPtr());

return serializeTokenResultVec(tokenResults);
}

// API method "tokenize": analyses one string and returns only the serialized
// token list (tokenResult.first).
//   args[0] - text to tokenize
//   args[1] - optional match options; Match::allWithNormalizing when omitted
//   args[2] - optional block list: a morpheme-set id or an array of morpheme
//             descriptions (see BlockListArg)
json kiwiTokenize(Kiwi& kiwi, const json& args) {
const std::string str = args[0];
const Match matchOptions = getAtOrDefault(args, 1, Match::allWithNormalizing);
// Must outlive the analyze() call: setPtr() may point into this object.
const BlockListArg blockListArg(kiwi, args, 2);

// NOTE(review): this text is a rendered diff — the two declarations below are
// presumably the removed and the added version of the same statement; only the
// one that passes blockListArg.setPtr() should remain. Confirm before compiling.
const TokenResult tokenResult = kiwi.analyze(str, (Match)matchOptions);
const TokenResult tokenResult = kiwi.analyze(str, (Match)matchOptions, blockListArg.setPtr());

return serializeTokenInfoVec(tokenResult.first);
}
Expand All @@ -166,8 +228,9 @@ json kiwiTokenizeTopN(Kiwi& kiwi, const json& args) {
const std::string str = args[0];
const int topN = args[1];
const Match matchOptions = getAtOrDefault(args, 2, Match::allWithNormalizing);
const BlockListArg blockListArg(kiwi, args, 3);

const std::vector<TokenResult> tokenResults = kiwi.analyze(str, topN, matchOptions);
const std::vector<TokenResult> tokenResults = kiwi.analyze(str, topN, matchOptions, blockListArg.setPtr());

json result = json::array();
for (const TokenResult& tokenResult : tokenResults) {
Expand Down Expand Up @@ -308,6 +371,23 @@ json kiwiSetIntegrateAllomorph(Kiwi& kiwi, const json& args) {
return nullptr;
}

// API method "createMorphemeSet": parses the morpheme descriptions in args[0]
// into a set, stores it in `morphemeSets` under a freshly issued id and
// returns that id. The id can later be passed as a block list and must be
// released with "destroyMorphemeSet".
json kiwiCreateMorphemeSet(Kiwi& kiwi, const json& args) {
    const int id = nextMorphemeSetId();

    // args[0] is read in place and the parsed set is handed to emplace() as a
    // temporary, so neither the JSON array nor the set is copied.
    morphemeSets.emplace(id, parseMorphemeSet(kiwi, args[0]));

    return id;
}

// API method "destroyMorphemeSet": drops the registry entry whose id is given
// in args[0]. Erasing an id that is not registered does nothing. The `kiwi`
// parameter is unused; it is part of the InstanceApiMethod signature.
// Always returns JSON null.
json kiwiDestroyMorphemeSet(Kiwi& kiwi, const json& args) {
    const int doomedId = args[0].get<int>();
    morphemeSets.erase(doomedId);
    return nullptr;
}


using ApiMethod = json(*)(const json&);
using InstanceApiMethod = json(*)(Kiwi&, const json&);
Expand Down Expand Up @@ -342,6 +422,8 @@ std::map<std::string, InstanceApiMethod> instanceApiMethods = {
{ "setTypoCostWeight", kiwiSetTypoCostWeight },
{ "getIntegrateAllomorph", kiwiGetIntegrateAllomorph },
{ "setIntegrateAllomorph", kiwiSetIntegrateAllomorph },
{ "createMorphemeSet", kiwiCreateMorphemeSet },
{ "destroyMorphemeSet", kiwiDestroyMorphemeSet },
};


Expand Down
37 changes: 27 additions & 10 deletions bindings/wasm/package/src/kiwi.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { AsyncMethods } from './util.js';

interface TokenInfo {
export interface TokenInfo {
str: string;
position: number;
wordPosition: number;
Expand All @@ -14,7 +14,7 @@ interface TokenInfo {
subSentPosition: number;
}

interface TokenResult {
export interface TokenResult {
tokens: TokenInfo[];
score: number;
}
Expand Down Expand Up @@ -45,12 +45,12 @@ export enum Match {
allWithNormalizing = all | normalizeCoda,
}

interface SentenceSpan {
/**
 * A range described by a `start` and an `end` number.
 * NOTE(review): presumably offsets into the analysed string — confirm the unit
 * and whether `end` is exclusive.
 */
export interface SentenceSpan {
start: number;
end: number;
}

interface SentenceSplitResult {
/**
 * Result of splitting a text into sentences.
 */
export interface SentenceSplitResult {
/** One span per sentence. */
spans: SentenceSpan[];
/** Token-level result accompanying the split, or `null` when none is provided. */
tokenResult: TokenResult | null;
}
Expand All @@ -61,31 +61,46 @@ export enum Space {
insertSpace = 2,
}

export interface SentenceJoinMorph {
/**
 * A morpheme described by its form and its part-of-speech tag.
 * Used for block-list entries (`Morph[]`) and as the base of `SentenceJoinMorph`.
 */
export interface Morph {
/** Morpheme form. */
form: string;
/**
 * Part-of-speech tag name.
 * NOTE(review): the native block-list parser accepts entries without `tag`
 * (it falls back to `POSTag::unknown`), yet the field is required here —
 * confirm whether it should be optional for block lists.
 */
tag: string;
}

/**
 * A `Morph` with an optional `Space` setting attached.
 */
export interface SentenceJoinMorph extends Morph {
/** Optional spacing option for this morpheme (see `Space`). */
space?: Space;
}

interface SentenceJoinResult {
/**
 * Result of joining morphemes into a sentence.
 */
export interface SentenceJoinResult {
/** The joined text. */
str: string;
/** Spans associated with the joined text, or `null` when none are provided. */
ranges: SentenceSpan[] | null;
}

/**
 * Handle of a morpheme set: the numeric id returned by `createMorphemeSet`.
 * It can be passed wherever a block list is accepted and is released with
 * `destroyMorphemeSet`.
 */
export type MorphemeSet = number;

export interface Kiwi {
ready: () => boolean;
isTypoTolerant: () => boolean;
analyze: (str: string, matchOptions?: Match) => TokenResult;
analyze: (
str: string,
matchOptions?: Match,
blockList?: Morph[] | MorphemeSet
) => TokenResult;
analyzeTopN: (
str: string,
n: number,
matchOptions?: Match
matchOptions?: Match,
blockList?: Morph[] | MorphemeSet
) => TokenResult[];
tokenize: (str: string, matchOptions?: Match) => TokenInfo[];
tokenize: (
str: string,
matchOptions?: Match,
blockList?: Morph[] | MorphemeSet
) => TokenInfo[];
tokenizeTopN: (
str: string,
n: number,
matchOptions?: Match
matchOptions?: Match,
blockList?: Morph[] | MorphemeSet
) => TokenInfo[][];
splitIntoSents: (
str: string,
Expand Down Expand Up @@ -113,6 +128,8 @@ export interface Kiwi {
setTypoCostWeight: (v: number) => void;
getIntegrateAllomorphic: () => boolean;
setIntegrateAllomorphic: (v: boolean) => void;
createMorphemeSet: (morphs: Morph[]) => MorphemeSet;
destroyMorphemeSet: (id: MorphemeSet) => void;
}

export type KiwiAsync = AsyncMethods<Kiwi>;

0 comments on commit 9759afb

Please sign in to comment.