-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
First commit -- the parser and the Rule, Sym classes
The parser implements an Earley top-down chart parser and uses depth-first search to construct the parse tree. The Rule class is a simple Array extension that carries an 'lhs' property. The Sym class is just an object with a 'name' attribute. Nothing fancy.
- Loading branch information
0 parents
commit 2b963f2
Showing
4 changed files
with
381 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,245 @@ | ||
/*eslint no-console:0*/ | ||
'use strict'; | ||
|
||
const rules = require('./rules'); | ||
const interp = require('./interp'); | ||
const tokenize = require('./tokenizer'); | ||
|
||
/**
 * Parses a sentence using the Earley top-down chart algorithm, then builds
 * a parse tree with a depth-first search over the completed items.
 *
 * @param {String} sent - sentence to parse
 * @param {Function} [tokenizer] - optional tokenizer; defaults to the value
 *   imported from './tokenizer'. NOTE(review): that module appears to export
 *   the function as a `.tokenize` property rather than directly — confirm
 *   the default is actually callable.
 * @returns {Object} parse tree ({ item, children })
 */
function parse(sent, tokenizer) {
  let tokens = (tokenizer || tokenize)(sent);
  // states[i] holds the Earley items whose progress dot sits just before
  // token i; there is one extra state for the end of input.
  let states = Array.from({ length: tokens.length + 1 }, () => []);

  // Seed state 0 with every grammar rule, dot at position 0, origin 0.
  let rulePairs = rules.rules.map((rule) => ({
    name    : rule.lhs.name,
    rule    : rule,
    position: 0,
    origin  : 0
  }));

  states[0].push(...rulePairs);

  // Standard Earley loop: each item in each state is predicted, scanned,
  // and completed. States grow while being iterated, so the inner loop
  // re-reads states[i].length each pass.
  for (let i = 0; i <= tokens.length; i += 1) {
    for (let j = 0; j < states[i].length; j += 1) {
      predict(tokens, states, i, j);
      scan(tokens, states, i, j);
      complete(tokens, states, i, j);
    }
  }

  return dfs(swap(removeUnfinishedItems(states)), tokens);
}
|
||
|
||
/**
 * Earley prediction step: if the symbol after the current item's dot is a
 * non-terminal, add the grammar's rules to the current state (dot at 0,
 * origin i), skipping rules already predicted in this state.
 *
 * NOTE: standard Earley predicts only rules whose lhs equals the expected
 * non-terminal; this implementation predicts every rule, which is redundant
 * but harmless for correctness.
 *
 * @param {Array} tokens - token list (unused; kept for the uniform
 *   predict/scan/complete step signature)
 * @param {Array} states - chart of Earley item lists
 * @param {Number} i - index of the current state
 * @param {Number} j - index of the current item within states[i]
 */
function predict(tokens, states, i, j) {
  let curr = states[i][j];

  // Only predict when the dot sits before a non-terminal symbol.
  if (!(curr.rule[curr.position] instanceof rules.Sym)) {
    return;
  }

  rules.rules.forEach((rule) => {
    // A freshly predicted item always has position 0, so duplicates are
    // detected by comparing against 0. (The original compared against
    // curr.position, which missed duplicates whenever curr.position !== 0
    // and let states grow without bound — the old `> 100` debug throw was
    // a symptom of that bug.)
    let alreadyPredicted = states[i].some((earleyItem) => {
      return earleyItem.rule === rule && earleyItem.position === 0;
    });

    if (!alreadyPredicted) {
      states[i].push({
        name    : rule.lhs.name,
        rule    : rule,
        position: 0,
        origin  : i
      });
    }
  });
}
|
||
|
||
/**
 * Earley scanning step: if the symbol after the current item's dot is a
 * terminal (a RegExp or a literal string) that matches the next token,
 * advance the dot and push the resulting item into the next state.
 *
 * @param {Array} tokens - token list
 * @param {Array} states - chart of Earley item lists
 * @param {Number} i - index of the current state
 * @param {Number} j - index of the current item within states[i]
 */
function scan(tokens, states, i, j) {
  let curr = states[i][j]
    , expected = curr.rule[curr.position]
    , matches;

  // No token remains to scan. (states.length === tokens.length + 1, so the
  // original guard `i < states.length` could never fail inside the parse
  // loop; `i < tokens.length` is the bound that actually prevents testing
  // terminals against an undefined token.)
  if (i >= tokens.length) {
    return;
  }

  if (expected instanceof RegExp) {
    // regex matches token
    matches = expected.test(tokens[i]);
  } else if (typeof expected === 'string') {
    // string equals token
    matches = expected === tokens[i];
  } else {
    return;
  }

  if (matches) {
    let newItem = Object.assign({}, curr);
    newItem.position += 1;
    states[i + 1].push(newItem);
  }
}
|
||
/**
 * Earley completion step: when the current item's dot has reached the end
 * of its rule, advance every item in the origin state that was waiting on
 * this rule's left-hand side, pushing the advanced copies into the current
 * state (skipping ones already present).
 *
 * @param {Array} tokens - token list (unused; uniform step signature)
 * @param {Array} states - chart of Earley item lists
 * @param {Number} i - index of the current state
 * @param {Number} j - index of the current item within states[i]
 */
function complete(tokens, states, i, j) {
  let curr = states[i][j];

  // The position may run past the rule's end, so check completion first.
  if (curr.position < curr.rule.length) {
    return;
  }

  states[curr.origin].forEach((earleyItem) => {
    if (earleyItem.rule[earleyItem.position] !== curr.rule.lhs) {
      return;
    }

    // Skip items whose advanced form already exists in this state.
    let isDuplicate = states[i].some((ei) => {
      return ei.rule === earleyItem.rule &&
        ei.position === earleyItem.position + 1;
    });

    if (!isDuplicate) {
      let advanced = Object.assign({}, earleyItem);
      advanced.position += 1;
      states[i].push(advanced);
    }
  });
}
|
||
/**
 * Drops every Earley item whose dot has not reached the end of its rule,
 * leaving only completed items in each state.
 *
 * @param {Array} states - chart of Earley item lists
 * @returns {Array} a new chart containing only completed items
 */
function removeUnfinishedItems(states) {
  return states.map(function (state) {
    return state.filter(function (item) {
      return item.position >= item.rule.length;
    });
  });
}
|
||
/**
 * Re-indexes completed items by where their derivation began: each item is
 * moved into the state matching its `origin`, and its `origin` field is then
 * overwritten (in place) with the index of the state where it finished.
 *
 * @param {Array} states - chart of completed Earley items
 * @returns {Array} the re-indexed chart
 */
function swap(states) {
  let newStates = states.map(() => []);

  states.forEach(function (state, finishedAt) {
    state.forEach(function (item) {
      newStates[item.origin].push(item);
      item.origin = finishedAt;
    });
  });
  return newStates;
}
|
||
/**
 * Builds a parse tree from the swapped chart by choosing the completed item
 * in state 0 that spans the most tokens and expanding it depth-first.
 *
 * @param {Array} states - chart produced by swap(removeUnfinishedItems(...))
 * @param {Array} tokens - token list
 * @returns {Object} parse tree root ({ item, children })
 * @throws {SyntaxError} when no completed item spans the whole input
 */
function dfs(states, tokens) {
  // Pick the item starting at token 0 with the greatest end index
  // (origin holds the finishing state index after swap()).
  let root = states[0].reduce((best, curr) => {
    if (best == null || curr.origin > best.origin) {
      return curr;
    }
    return best;
  }, null);

  // With no completed item at all, the original crashed with a TypeError on
  // `root.origin`; report it as the parse failure it really is.
  if (root == null) {
    throw new SyntaxError(`Parsing error near '${tokens[0]}' `);
  }
  if (root.origin !== tokens.length) {
    throw new SyntaxError(`Parsing error near '${tokens[root.origin]}' `);
  }

  return {
    item    : root.name,
    children: dfsHelper(states, root, 0, 0, tokens)
  };
}
|
||
|
||
/**
 * Depth-first expansion of a completed Earley item into parse-tree children.
 *
 * Walks root.rule symbol by symbol (index `depth`), consuming input starting
 * at chart index `state`. After swap(), an item stored in states[k] began at
 * token k and its `origin` field holds the index where it finished.
 *
 * @param {Array}  states - swapped chart of completed Earley items
 * @param {Object} root   - item currently being expanded
 * @param {Number} state  - chart index where the symbol at `depth` must start
 * @param {Number} depth  - index of the rule symbol being matched
 * @param {Array}  tokens - token list
 * @returns {Array|null} children matched from `depth` to the rule's end, or
 *   null when no consistent match exists
 */
function dfsHelper(states, root, state, depth, tokens) {
  var edges;

  // Base case: we finished the root rule exactly where this item ends
  if (state === root.origin && depth === root.rule.length) {
    return [];
  }

  // If the current production symbol is a terminal: it consumes exactly one
  // token, then matching continues with the next symbol at the next state.
  if (root.rule[depth] instanceof RegExp) {
    if (root.rule[depth].test(tokens[state])) {
      let subMatch = dfsHelper(states, root, state + 1, depth + 1, tokens);

      // subMatch may be [] (a valid empty tail), so truthiness — not
      // length — is the match test here.
      if (subMatch) {
        return [tokens[state]].concat(subMatch);
      }
    }
    return null;
  } else if (typeof root.rule[depth] === 'string') {
    if (root.rule[depth] === tokens[state]) {
      let subMatch = dfsHelper(states, root, state + 1, depth + 1, tokens);

      if (subMatch) {
        return [tokens[state]].concat(subMatch);
      }
    }
    return null;
  }

  // Otherwise, it must be a non-terminal: try every completed item starting
  // at `state` whose lhs is the expected symbol. The candidate child ends at
  // item.origin (post-swap), so the parent resumes matching there, while the
  // child itself is expanded from `state` with depth 0.
  edges = states[state]
    .filter((item) => item.rule.lhs === root.rule[depth])
    .map((item) => {
      let subMatch = dfsHelper(states, root, item.origin, depth + 1, tokens);

      if (subMatch) {
        return [{
          item    : item,
          children: dfsHelper(states, item, state, 0, tokens)
        }].concat(subMatch);
      }
      return null;
    })
    .filter((list) => list);

  // More than one surviving expansion that differs structurally means the
  // grammar is ambiguous here; report it but proceed with the first parse.
  if (edges.length > 1) {
    let diffs = edges.filter(
      (tree) => JSON.stringify(tree) !== JSON.stringify(edges[0])
    );

    if (diffs.length > 0) {
      console.log('Ambiguity\n' + JSON.stringify(edges, null, 2));
    }
  }

  return edges[0];
}
|
||
|
||
/**
 * Evaluates a parse tree bottom-up: token strings (leaves) are returned
 * as-is, and each internal node is folded by interp.valueOf() over its
 * children's non-null values.
 *
 * @param {Object|String} parseTree - tree node, token string, or null
 * @returns {*} the computed value for this subtree
 */
function interpret(parseTree) {
  if (typeof parseTree === 'string' || parseTree == null) {
    return parseTree;
  }

  let childValues = [];

  parseTree.children.forEach((subtree) => {
    let value = interpret(subtree);

    if (value != null) {
      childValues.push(value);
    }
  });

  return interp.valueOf(parseTree.item, childValues);
}
|
||
|
||
|
||
// ---------------------------------------------------------------------------
// Demo script: parses a sample expression and prints the resulting tree and
// its interpreted value.
//
// NOTE(review): `tokenize` is the value of require('./tokenizer'); that module
// seems to export the function as a `.tokenize` property, and it is invoked
// below without the `terminals` argument its definition expects — confirm
// these top-level calls actually execute.

//let sentence = '23 + ( 32 * 46 )';
//let sentence = '( 23 + 32 ) * 46';
//let sentence = '23 + 32 * 46';
//let sentence = '( ( 12 ) )';
//let sentence = '1 * 2 + 3 * 4 + 5';
//let sentence = '1 + 2 + 3';
let sentence = '1^3 + 2 * 3(3)';

// NOTE(review): parse() returns a parse tree (it already runs dfs), yet its
// result is passed to dfs() again below as if it were the chart — confirm
// this demo is in sync with parse()'s return value.
let states = parse(sentence);

console.log('\n\n--FINAL--');
//printStates(states, sentence);
console.log('\n');
console.log('input:', sentence);
console.log('=================');

// Pretty-print the parse tree found by the chart DFS.
console.log('\n~~ dfs ~~');
console.log(JSON.stringify(
  dfs(states, tokenize(sentence)),
  null, 2
));

// Evaluate the tree via the interpreter's semantic actions.
console.log(interpret(dfs(states, tokenize(sentence))));

module.exports.parse = parse;
module.exports.interpret = interpret;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
|
||
/**
 * Constructs a grammar production rule. A Rule is an Array of right-hand-side
 * symbols (Syms, literal strings, or RegExps) carrying two extra properties:
 * `lhs` (the non-terminal it produces) and `evaluate` (optional semantic
 * action — callers below pass one as a third argument, which the original
 * constructor silently discarded).
 *
 * @param {Sym} lhs - left-hand-side non-terminal
 * @param {Array} rhs - right-hand-side symbols (must be non-empty)
 * @param {Function} [evaluate] - semantic action for this rule
 * @returns {Rule} an Array subclass instance
 * @throws {Error} when rhs is missing or empty
 */
function Rule(lhs, rhs, evaluate) {
  let arr = [];

  if (!rhs || rhs.length === 0) {
    throw new Error('Rule does not produce anything');
  }
  arr.push(...rhs);
  arr.lhs = lhs;
  // Keep the semantic action instead of dropping it.
  arr.evaluate = evaluate || null;
  // Object.setPrototypeOf replaces the deprecated __proto__ assignment.
  Object.setPrototypeOf(arr, Rule.prototype);

  return arr;
}
// Rule instances behave like Arrays (filter, forEach, length, indexing).
Object.setPrototypeOf(Rule.prototype, Array.prototype);
|
||
|
||
/**
 * Constructs a grammar symbol (non-terminal) identified by name.
 *
 * @param {String} name - the symbol's name
 * @returns {Sym} an object with a `name` attribute and Sym.prototype chain
 */
function Sym(name) {
  // Object.create replaces the deprecated __proto__ assignment.
  let symbol = Object.create(Sym.prototype);
  symbol.name = name;
  return symbol;
}
|
||
// Non-terminal symbols of the arithmetic grammar.
const sum = new Sym('sum');
const prod = new Sym('prod');
const factor = new Sym('factor');
const exp = new Sym('exp');

// Grammar productions. Each Rule pairs a non-terminal with its right-hand
// side; the trailing lambda is the rule's semantic action.
let rules = [
  // sum
  new Rule(sum    , [sum, '+', prod]      , (x, _, y) => x + y),
  new Rule(sum    , [prod]                , (x) => x),

  // product
  new Rule(prod   , [prod, '*', exp]      , (x, _, y) => x * y),
  new Rule(prod   , [exp, '*', prod]      , (x, _, y) => x * y),
  new Rule(prod   , [exp]                 , (x) => x),

  // distributive product, e.g. "2(3)" or "(3)2"
  new Rule(prod   , [exp, '(', exp, ')']  , (x, _, y) => x * y),
  new Rule(prod   , ['(', exp, ')', exp]  , (_, x, __, y) => x * y),

  // exponent
  new Rule(exp    , [factor, '^', factor] , (x, _, y) => Math.pow(x, y)),
  new Rule(exp    , [factor]              , (x) => x),

  // factor
  new Rule(factor , ['(', sum, ')']       , (_, x) => x),
  new Rule(factor , [/\d+/]               , (n) => parseFloat(n))
];
|
||
/**
 * Collects every terminal (literal string or RegExp) appearing on the
 * right-hand side of any grammar rule, in rule order (duplicates included).
 *
 * @returns {Array} the grammar's terminal symbols
 */
function getTokens() {
  let terminals = [];

  rules.forEach((rule) => {
    rule.forEach((sym) => {
      if (typeof sym === 'string' || sym instanceof RegExp) {
        terminals.push(sym);
      }
    });
  });
  return terminals;
}
|
||
module.exports = { | ||
rules : rules, | ||
Rule : Rule, | ||
Sym : Sym, | ||
getTokens : getTokens | ||
}; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
|
||
/**
 * Splits a sentence into tokens by building a delimiter regex from the
 * grammar's terminals (literal strings are regex-escaped; RegExp terminals
 * contribute their source), splitting on it, and dropping empty pieces.
 *
 * @param {String} sent - sentence to tokenize
 * @param {Array} terminals - literal strings and RegExps to split on
 * @returns {Array} non-empty, trimmed token strings
 * @throws {TypeError} when terminals is missing or not an array (the
 *   original crashed with an unhelpful "cannot read property 'map' of
 *   undefined")
 */
module.exports.tokenize = (sent, terminals) => {
  if (!Array.isArray(terminals)) {
    throw new TypeError('tokenize requires an array of terminal symbols');
  }

  let tokens = terminals.map((token) => {
    if (typeof token === 'string') {
      // Escape regex metacharacters so literal terminals match verbatim.
      return token.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, '\\$&');
    }
    return token.source;
  });
  // Capturing group keeps the matched delimiters in the split() output.
  let delims = RegExp('(' + tokens.join('|') + ')');

  return sent
    .split(delims)
    .map((item) => item.trim())
    .filter((item) => item !== '');
};
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
{ | ||
"name": "parsey", | ||
"version": "0.0.0", | ||
"description": "Parser for context-free grammars", | ||
"main": "parser.js", | ||
"scripts": { | ||
"test": "jasmine" | ||
}, | ||
"repository": { | ||
"type": "git", | ||
"url": "git+https://github.com/patgrasso/parsey.git" | ||
}, | ||
"keywords": [ | ||
"parse", | ||
"grammar", | ||
"cfg", | ||
"ast", | ||
"earley" | ||
], | ||
"author": "Pat Grasso", | ||
"license": "MIT", | ||
"bugs": { | ||
"url": "https://github.com/patgrasso/parsey/issues" | ||
}, | ||
"homepage": "https://github.com/patgrasso/parsey#readme" | ||
} |