-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
First commit -- the parser and the Rule, Sym classes
The parser implements an Earley top-down chart parser and uses depth-first search to construct the parse tree. The Rule class is a simple Array extension that carries an 'lhs' property. The Sym class is just an object with a 'name' attribute. Nothing fancy.
- Loading branch information
0 parents
commit 2b963f2
Showing
4 changed files
with
381 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,245 @@ | ||
/*eslint no-console:0*/ | ||
'use strict'; | ||
|
||
const rules = require('./rules'); | ||
const interp = require('./interp'); | ||
const tokenize = require('./tokenizer'); | ||
|
||
/**
 * Parses a sentence using the Earley top-down chart algorithm, then builds
 * a parse tree with a depth-first search over the completed items.
 *
 * @param {String} sent - sentence to parse
 * @param {Function} [tokenizer] - optional tokenizer; defaults to the value
 *   imported from './tokenizer'. NOTE(review): that module appears to export
 *   the function as a `.tokenize` property rather than directly — confirm
 *   the default is actually callable.
 * @returns {Object} parse tree ({ item, children })
 */
function parse(sent, tokenizer) {
  let tokens = (tokenizer || tokenize)(sent);
  // states[i] holds the Earley items whose progress dot sits just before
  // token i; there is one extra state for the end of input.
  let states = Array.from({ length: tokens.length + 1 }, () => []);

  // Seed state 0 with every grammar rule, dot at position 0, origin 0.
  let rulePairs = rules.rules.map((rule) => ({
    name    : rule.lhs.name,
    rule    : rule,
    position: 0,
    origin  : 0
  }));

  states[0].push(...rulePairs);

  // Standard Earley loop: each item in each state is predicted, scanned,
  // and completed. States grow while being iterated, so the inner loop
  // re-reads states[i].length each pass.
  for (let i = 0; i <= tokens.length; i += 1) {
    for (let j = 0; j < states[i].length; j += 1) {
      predict(tokens, states, i, j);
      scan(tokens, states, i, j);
      complete(tokens, states, i, j);
    }
  }

  return dfs(swap(removeUnfinishedItems(states)), tokens);
}
|
||
|
||
/**
 * Earley prediction step: if the symbol after the current item's dot is a
 * non-terminal, add the grammar's rules to the current state (dot at 0,
 * origin i), skipping rules already predicted in this state.
 *
 * NOTE: standard Earley predicts only rules whose lhs equals the expected
 * non-terminal; this implementation predicts every rule, which is redundant
 * but harmless for correctness.
 *
 * @param {Array} tokens - token list (unused; kept for the uniform
 *   predict/scan/complete step signature)
 * @param {Array} states - chart of Earley item lists
 * @param {Number} i - index of the current state
 * @param {Number} j - index of the current item within states[i]
 */
function predict(tokens, states, i, j) {
  let curr = states[i][j];

  // Only predict when the dot sits before a non-terminal symbol.
  if (!(curr.rule[curr.position] instanceof rules.Sym)) {
    return;
  }

  rules.rules.forEach((rule) => {
    // A freshly predicted item always has position 0, so duplicates are
    // detected by comparing against 0. (The original compared against
    // curr.position, which missed duplicates whenever curr.position !== 0
    // and let states grow without bound — the old `> 100` debug throw was
    // a symptom of that bug.)
    let alreadyPredicted = states[i].some((earleyItem) => {
      return earleyItem.rule === rule && earleyItem.position === 0;
    });

    if (!alreadyPredicted) {
      states[i].push({
        name    : rule.lhs.name,
        rule    : rule,
        position: 0,
        origin  : i
      });
    }
  });
}
|
||
|
||
/**
 * Earley scanning step: if the symbol after the current item's dot is a
 * terminal (a RegExp or a literal string) that matches the next token,
 * advance the dot and push the resulting item into the next state.
 *
 * @param {Array} tokens - token list
 * @param {Array} states - chart of Earley item lists
 * @param {Number} i - index of the current state
 * @param {Number} j - index of the current item within states[i]
 */
function scan(tokens, states, i, j) {
  let curr = states[i][j]
    , expected = curr.rule[curr.position]
    , matches;

  // No token remains to scan. (states.length === tokens.length + 1, so the
  // original guard `i < states.length` could never fail inside the parse
  // loop; `i < tokens.length` is the bound that actually prevents testing
  // terminals against an undefined token.)
  if (i >= tokens.length) {
    return;
  }

  if (expected instanceof RegExp) {
    // regex matches token
    matches = expected.test(tokens[i]);
  } else if (typeof expected === 'string') {
    // string equals token
    matches = expected === tokens[i];
  } else {
    return;
  }

  if (matches) {
    let newItem = Object.assign({}, curr);
    newItem.position += 1;
    states[i + 1].push(newItem);
  }
}
|
||
/**
 * Earley completion step: when the current item's dot has reached the end
 * of its rule, advance every item in the origin state that was waiting on
 * this rule's left-hand side, pushing the advanced copies into the current
 * state (skipping ones already present).
 *
 * @param {Array} tokens - token list (unused; uniform step signature)
 * @param {Array} states - chart of Earley item lists
 * @param {Number} i - index of the current state
 * @param {Number} j - index of the current item within states[i]
 */
function complete(tokens, states, i, j) {
  let curr = states[i][j];

  // The position may run past the rule's end, so check completion first.
  if (curr.position < curr.rule.length) {
    return;
  }

  states[curr.origin].forEach((earleyItem) => {
    if (earleyItem.rule[earleyItem.position] !== curr.rule.lhs) {
      return;
    }

    // Skip items whose advanced form already exists in this state.
    let isDuplicate = states[i].some((ei) => {
      return ei.rule === earleyItem.rule &&
        ei.position === earleyItem.position + 1;
    });

    if (!isDuplicate) {
      let advanced = Object.assign({}, earleyItem);
      advanced.position += 1;
      states[i].push(advanced);
    }
  });
}
|
||
/**
 * Drops every Earley item whose dot has not reached the end of its rule,
 * leaving only completed items in each state.
 *
 * @param {Array} states - chart of Earley item lists
 * @returns {Array} a new chart containing only completed items
 */
function removeUnfinishedItems(states) {
  return states.map(function (state) {
    return state.filter(function (item) {
      return item.position >= item.rule.length;
    });
  });
}
|
||
/**
 * Re-indexes completed items by where their derivation began: each item is
 * moved into the state matching its `origin`, and its `origin` field is then
 * overwritten (in place) with the index of the state where it finished.
 *
 * @param {Array} states - chart of completed Earley items
 * @returns {Array} the re-indexed chart
 */
function swap(states) {
  let newStates = states.map(() => []);

  states.forEach(function (state, finishedAt) {
    state.forEach(function (item) {
      newStates[item.origin].push(item);
      item.origin = finishedAt;
    });
  });
  return newStates;
}
|
||
/**
 * Builds a parse tree from the swapped chart by choosing the completed item
 * in state 0 that spans the most tokens and expanding it depth-first.
 *
 * @param {Array} states - chart produced by swap(removeUnfinishedItems(...))
 * @param {Array} tokens - token list
 * @returns {Object} parse tree root ({ item, children })
 * @throws {SyntaxError} when no completed item spans the whole input
 */
function dfs(states, tokens) {
  // Pick the item starting at token 0 with the greatest end index
  // (origin holds the finishing state index after swap()).
  let root = states[0].reduce((best, curr) => {
    if (best == null || curr.origin > best.origin) {
      return curr;
    }
    return best;
  }, null);

  // With no completed item at all, the original crashed with a TypeError on
  // `root.origin`; report it as the parse failure it really is.
  if (root == null) {
    throw new SyntaxError(`Parsing error near '${tokens[0]}' `);
  }
  if (root.origin !== tokens.length) {
    throw new SyntaxError(`Parsing error near '${tokens[root.origin]}' `);
  }

  return {
    item    : root.name,
    children: dfsHelper(states, root, 0, 0, tokens)
  };
}
|
||
|
||
/**
 * Depth-first expansion of a completed Earley item into parse-tree children.
 *
 * Walks root.rule symbol by symbol (index `depth`), consuming input starting
 * at chart index `state`. After swap(), an item stored in states[k] began at
 * token k and its `origin` field holds the index where it finished.
 *
 * @param {Array}  states - swapped chart of completed Earley items
 * @param {Object} root   - item currently being expanded
 * @param {Number} state  - chart index where the symbol at `depth` must start
 * @param {Number} depth  - index of the rule symbol being matched
 * @param {Array}  tokens - token list
 * @returns {Array|null} children matched from `depth` to the rule's end, or
 *   null when no consistent match exists
 */
function dfsHelper(states, root, state, depth, tokens) {
  var edges;

  // Base case: we finished the root rule exactly where this item ends
  if (state === root.origin && depth === root.rule.length) {
    return [];
  }

  // If the current production symbol is a terminal: it consumes exactly one
  // token, then matching continues with the next symbol at the next state.
  if (root.rule[depth] instanceof RegExp) {
    if (root.rule[depth].test(tokens[state])) {
      let subMatch = dfsHelper(states, root, state + 1, depth + 1, tokens);

      // subMatch may be [] (a valid empty tail), so truthiness — not
      // length — is the match test here.
      if (subMatch) {
        return [tokens[state]].concat(subMatch);
      }
    }
    return null;
  } else if (typeof root.rule[depth] === 'string') {
    if (root.rule[depth] === tokens[state]) {
      let subMatch = dfsHelper(states, root, state + 1, depth + 1, tokens);

      if (subMatch) {
        return [tokens[state]].concat(subMatch);
      }
    }
    return null;
  }

  // Otherwise, it must be a non-terminal: try every completed item starting
  // at `state` whose lhs is the expected symbol. The candidate child ends at
  // item.origin (post-swap), so the parent resumes matching there, while the
  // child itself is expanded from `state` with depth 0.
  edges = states[state]
    .filter((item) => item.rule.lhs === root.rule[depth])
    .map((item) => {
      let subMatch = dfsHelper(states, root, item.origin, depth + 1, tokens);

      if (subMatch) {
        return [{
          item    : item,
          children: dfsHelper(states, item, state, 0, tokens)
        }].concat(subMatch);
      }
      return null;
    })
    .filter((list) => list);

  // More than one surviving expansion that differs structurally means the
  // grammar is ambiguous here; report it but proceed with the first parse.
  if (edges.length > 1) {
    let diffs = edges.filter(
      (tree) => JSON.stringify(tree) !== JSON.stringify(edges[0])
    );

    if (diffs.length > 0) {
      console.log('Ambiguity\n' + JSON.stringify(edges, null, 2));
    }
  }

  return edges[0];
}
|
||
|
||
/**
 * Evaluates a parse tree bottom-up: token strings (leaves) are returned
 * as-is, and each internal node is folded by interp.valueOf() over its
 * children's non-null values.
 *
 * @param {Object|String} parseTree - tree node, token string, or null
 * @returns {*} the computed value for this subtree
 */
function interpret(parseTree) {
  if (typeof parseTree === 'string' || parseTree == null) {
    return parseTree;
  }

  let childValues = [];

  parseTree.children.forEach((subtree) => {
    let value = interpret(subtree);

    if (value != null) {
      childValues.push(value);
    }
  });

  return interp.valueOf(parseTree.item, childValues);
}
|
||
|
||
|
||
// ---------------------------------------------------------------------------
// Demo script: parses a sample expression and prints the resulting tree and
// its interpreted value.
//
// NOTE(review): `tokenize` is the value of require('./tokenizer'); that module
// seems to export the function as a `.tokenize` property, and it is invoked
// below without the `terminals` argument its definition expects — confirm
// these top-level calls actually execute.

//let sentence = '23 + ( 32 * 46 )';
//let sentence = '( 23 + 32 ) * 46';
//let sentence = '23 + 32 * 46';
//let sentence = '( ( 12 ) )';
//let sentence = '1 * 2 + 3 * 4 + 5';
//let sentence = '1 + 2 + 3';
let sentence = '1^3 + 2 * 3(3)';

// NOTE(review): parse() returns a parse tree (it already runs dfs), yet its
// result is passed to dfs() again below as if it were the chart — confirm
// this demo is in sync with parse()'s return value.
let states = parse(sentence);

console.log('\n\n--FINAL--');
//printStates(states, sentence);
console.log('\n');
console.log('input:', sentence);
console.log('=================');

// Pretty-print the parse tree found by the chart DFS.
console.log('\n~~ dfs ~~');
console.log(JSON.stringify(
  dfs(states, tokenize(sentence)),
  null, 2
));

// Evaluate the tree via the interpreter's semantic actions.
console.log(interpret(dfs(states, tokenize(sentence))));

module.exports.parse = parse;
module.exports.interpret = interpret;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
|
||
/**
 * Constructs a grammar production rule. A Rule is an Array of right-hand-side
 * symbols (Syms, literal strings, or RegExps) carrying two extra properties:
 * `lhs` (the non-terminal it produces) and `evaluate` (optional semantic
 * action — callers below pass one as a third argument, which the original
 * constructor silently discarded).
 *
 * @param {Sym} lhs - left-hand-side non-terminal
 * @param {Array} rhs - right-hand-side symbols (must be non-empty)
 * @param {Function} [evaluate] - semantic action for this rule
 * @returns {Rule} an Array subclass instance
 * @throws {Error} when rhs is missing or empty
 */
function Rule(lhs, rhs, evaluate) {
  let arr = [];

  if (!rhs || rhs.length === 0) {
    throw new Error('Rule does not produce anything');
  }
  arr.push(...rhs);
  arr.lhs = lhs;
  // Keep the semantic action instead of dropping it.
  arr.evaluate = evaluate || null;
  // Object.setPrototypeOf replaces the deprecated __proto__ assignment.
  Object.setPrototypeOf(arr, Rule.prototype);

  return arr;
}
// Rule instances behave like Arrays (filter, forEach, length, indexing).
Object.setPrototypeOf(Rule.prototype, Array.prototype);
|
||
|
||
/**
 * Constructs a grammar symbol (non-terminal) identified by name.
 *
 * @param {String} name - the symbol's name
 * @returns {Sym} an object with a `name` attribute and Sym.prototype chain
 */
function Sym(name) {
  // Object.create replaces the deprecated __proto__ assignment.
  let symbol = Object.create(Sym.prototype);
  symbol.name = name;
  return symbol;
}
|
||
// Non-terminal symbols of the arithmetic grammar.
const sum = new Sym('sum');
const prod = new Sym('prod');
const factor = new Sym('factor');
const exp = new Sym('exp');

// Grammar productions. Each Rule pairs a non-terminal with its right-hand
// side; the trailing lambda is the rule's semantic action.
let rules = [
  // sum
  new Rule(sum    , [sum, '+', prod]      , (x, _, y) => x + y),
  new Rule(sum    , [prod]                , (x) => x),

  // product
  new Rule(prod   , [prod, '*', exp]      , (x, _, y) => x * y),
  new Rule(prod   , [exp, '*', prod]      , (x, _, y) => x * y),
  new Rule(prod   , [exp]                 , (x) => x),

  // distributive product, e.g. "2(3)" or "(3)2"
  new Rule(prod   , [exp, '(', exp, ')']  , (x, _, y) => x * y),
  new Rule(prod   , ['(', exp, ')', exp]  , (_, x, __, y) => x * y),

  // exponent
  new Rule(exp    , [factor, '^', factor] , (x, _, y) => Math.pow(x, y)),
  new Rule(exp    , [factor]              , (x) => x),

  // factor
  new Rule(factor , ['(', sum, ')']       , (_, x) => x),
  new Rule(factor , [/\d+/]               , (n) => parseFloat(n))
];
|
||
/**
 * Collects every terminal (literal string or RegExp) appearing on the
 * right-hand side of any grammar rule, in rule order (duplicates included).
 *
 * @returns {Array} the grammar's terminal symbols
 */
function getTokens() {
  let terminals = [];

  rules.forEach((rule) => {
    rule.forEach((sym) => {
      if (typeof sym === 'string' || sym instanceof RegExp) {
        terminals.push(sym);
      }
    });
  });
  return terminals;
}
|
||
module.exports = { | ||
rules : rules, | ||
Rule : Rule, | ||
Sym : Sym, | ||
getTokens : getTokens | ||
}; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
|
||
/**
 * Splits a sentence into tokens by building a delimiter regex from the
 * grammar's terminals (literal strings are regex-escaped; RegExp terminals
 * contribute their source), splitting on it, and dropping empty pieces.
 *
 * @param {String} sent - sentence to tokenize
 * @param {Array} terminals - literal strings and RegExps to split on
 * @returns {Array} non-empty, trimmed token strings
 * @throws {TypeError} when terminals is missing or not an array (the
 *   original crashed with an unhelpful "cannot read property 'map' of
 *   undefined")
 */
module.exports.tokenize = (sent, terminals) => {
  if (!Array.isArray(terminals)) {
    throw new TypeError('tokenize requires an array of terminal symbols');
  }

  let tokens = terminals.map((token) => {
    if (typeof token === 'string') {
      // Escape regex metacharacters so literal terminals match verbatim.
      return token.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g, '\\$&');
    }
    return token.source;
  });
  // Capturing group keeps the matched delimiters in the split() output.
  let delims = RegExp('(' + tokens.join('|') + ')');

  return sent
    .split(delims)
    .map((item) => item.trim())
    .filter((item) => item !== '');
};
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
{ | ||
"name": "parsey", | ||
"version": "0.0.0", | ||
"description": "Parser for context-free grammars", | ||
"main": "parser.js", | ||
"scripts": { | ||
"test": "jasmine" | ||
}, | ||
"repository": { | ||
"type": "git", | ||
"url": "git+https://github.com/patgrasso/parsey.git" | ||
}, | ||
"keywords": [ | ||
"parse", | ||
"grammar", | ||
"cfg", | ||
"ast", | ||
"earley" | ||
], | ||
"author": "Pat Grasso", | ||
"license": "MIT", | ||
"bugs": { | ||
"url": "https://github.com/patgrasso/parsey/issues" | ||
}, | ||
"homepage": "https://github.com/patgrasso/parsey#readme" | ||
} |