Skip to content

Commit

Permalink
Document everything, pretty much
Browse files Browse the repository at this point in the history
  • Loading branch information
patgrasso committed Aug 3, 2016
1 parent f142ca1 commit c0de0ed
Show file tree
Hide file tree
Showing 4 changed files with 254 additions and 11 deletions.
183 changes: 172 additions & 11 deletions lib/parser.js
Original file line number Diff line number Diff line change
@@ -1,17 +1,69 @@
/**
* Provides functions for parsing sentences with a pre-constructed grammar.
* The parsing algorithm is an implementation of the earley top-down chart
* parser. The parse tree is constructed from the chart using depth-first search
*
* @module lib/parser
*/
/*eslint no-console:0*/

'use strict';

const rules = require('./rules');
const tokenize = require('./tokenizer');


/**
 * Entry point of the parser: splits the input string into tokens, runs the
 * earley recognizer over those tokens, and then builds a parse tree from the
 * resulting chart. The tree is made of plain objects; see
 * {@link module:lib/parser.dfs|dfs()} for an example of its structure
 *
 * @function parse
 * @memberof module:lib/parser
 * @param {string} sent - Input string to parse
 * @param {module:lib/rules~Rule[]} grammar - Set of
 *    {@link module:lib/rules~Rule|Rules} that define a language. When falsy,
 *    the default rule set exported by the rules module is used
 * @param {Function} [tokenizer={@link module:lib/tokenizer|tokenize}] -
 *    Function that accepts a string and a grammar (optionally) and splits
 *    the input string into tokens, each representing a symbol in the
 *    language
 * @return {object} Root node of the parse tree
 */
function parse(sent, grammar, tokenizer) {
  // Fall back to the module defaults for anything the caller left out
  const splitIntoTokens = tokenizer || tokenize;
  const activeGrammar = grammar || rules.rules;

  const tokens = splitIntoTokens(sent, activeGrammar);
  const chart = earley(tokens, activeGrammar);

  return dfs(chart, tokens);
}


/**
* Parses the input tokens using the earley top-down chart parsing algorithm
* to produce a set of states, each containing a list of earley items
*
* @function earley
* @memberof module:lib/parser
* @param {string[]} tokens - Sequence of symbols to be parsed
* @param {module:lib/rules~Rule[]} grammar - Set of rules that define a
* language
* @return {state[]} Set of 'states', each of which contains a list of earley
* items. Each earley item looks something like this:
* <pre><code>
* {
* name: [string],
* rule: [Rule],
* position: [number],
* origin: [number]
* }
* </code></pre>
* An earley item represents a completed parse of some individual rule. The
* position should be equivalent to rule.length, and the origin, despite its
* name, describes the state at which parse finished.
*
* This means that an earley item <i>should</i> exist in state 0 with an
* origin equivalent to the number of tokens passed in to indicate that the
* entire input was parsed successfully for some rule
*/
function earley(tokens, grammar) {
let states = Array.apply(null, Array(tokens.length + 1)).map(() => []);
var i, j;
Expand All @@ -37,6 +89,22 @@ function earley(tokens, grammar) {
}


/**
* Prediction stage in the earley algorithm. Given an earley item, determine
* if the next symbol to be processed is a non-terminal, and if so add all
* rules whose LHS equals that symbol to the current earley state.
*
* This also avoids adding duplicate rules to a state, a pitfall caused by
* left-recursive grammars
*
* @function predict
* @param {string[]} tokens - Input tokens being parsed
* @param {state[]} states - Set of lists of earley items
* @param {number} i - Index of the earley state to be processed
* @param {number} j - Index of the earley item to be processed within the state
* @param {module:lib/rules~Rule[]} grammar - Set of rules that define the
* language
*/
function predict(tokens, states, i, j, grammar) {
let curr = states[i][j];

Expand All @@ -61,6 +129,19 @@ function predict(tokens, states, i, j, grammar) {
}


/**
* Scanning stage in the earley algorithm. Given an earley item, determine if
* the next symbol to be processed is a terminal, and if so see if it matches
* the tokens at the state/index described by `i`. If the token matches, add
* an earley item to the next state that is a duplicate of this one, except
* whose position is one greater
*
* @function scan
* @param {string[]} tokens - Input tokens being parsed
* @param {state[]} states - Set of lists of earley items
* @param {number} i - Index of the earley state to be processed
* @param {number} j - Index of the earley item to be processed within the state
*/
function scan(tokens, states, i, j) {
let newItem
, curr = states[i][j];
Expand All @@ -84,6 +165,23 @@ function scan(tokens, states, i, j) {
}
}


/**
* Completion stage in the earley algorithm. If the current earley item's
* position is >= the length of its rule, it has successfully parsed the
* rule it represents!
*
* Once this has occurred, go back to the state in which the earley item
* originated and find all earley items whose next symbol to match matches our
* earley item, and add them to the current state, incrementing their positions
* accordingly
*
* @function complete
* @param {string[]} tokens - Input tokens being parsed
* @param {state[]} states - Set of lists of earley items
* @param {number} i - Index of the earley state to be processed
* @param {number} j - Index of the earley item to be processed within the state
*/
function complete(tokens, states, i, j) {
let newItem
, curr = states[i][j];
Expand All @@ -109,12 +207,37 @@ function complete(tokens, states, i, j) {
}
}


/**
 * Builds a new chart that keeps only the earley items that parsed all the way
 * through their rule, i.e. items whose position has reached (or passed) the
 * length of their rule. The input states are not modified; the surviving
 * items are the same object references as in the input
 *
 * @function removeUnfinishedItems
 * @param {state[]} states - Set of lists of earley items
 * @return {state[]} Set of lists of completed earley items
 */
function removeUnfinishedItems(states) {
  // An item is finished once its dot has advanced past every RHS symbol
  const isFinished = ({ position, rule }) => position >= rule.length;

  return states.map((items) => items.filter(isFinished));
}


/**
* Places earley items in the states in which they originated, as opposed to the
* states in which they finished parsing, and set their <code>origin</code>
* properties to the state in which they finished.
*
* This allows a depth-first search of the chart to move forwards through the
* graph, which is more intuitive than having to move backwards
*
* @function swap
* @param {state[]} states - Set of lists of earley items
* @return {state[]} Set of lists of earley items, but each item now exists in
* the state at which it originated, and the <code>origin</code> property of
* each item points to the state at which the parse completed
*/
function swap(states) {
let newStates = Array.apply(null, Array(states.length)).map(() => []);

Expand All @@ -127,6 +250,38 @@ function swap(states) {
return newStates;
}


/**
* Performs a depth-first search on the chart generated by
* {@link module:lib/parser~earley|earley()} in order to construct a parse tree,
* an example of which is shown below
*
* @example
* {
* item: <Rule sum -> [factor, '+', factor]>,
* children: [
* { // first symbol - 'factor'
* item: <Rule factor -> [/\d+/]>,
* children: [
* '2'
* ]
* },
* '+', // second symbol
* { // third symbol - another 'factor'
* item: <Rule factor -> [/\d+/]>,
* children: [
* '3'
* ]
* }
* ]
* }
*
* @function dfs
* @memberof module:lib/parser
* @param {state[]} states - Set of lists of earley items
* @param {string[]} tokens - Input tokens to be parsed
* @return {object} Root node of the parse tree
*/
function dfs(states, tokens) {
let root = states[0].reduce((best, curr) => {
if (best == null || curr.origin > best.origin) {
Expand All @@ -149,6 +304,23 @@ function dfs(states, tokens) {
}


/**
* Recursive function that explores a specific earley item, constructs the parse
* tree for it, then sends it up the chimney!
*
* @function dfsHelper
* @param {state[]} states - Set of lists of earley items
* @param {earleyItem} root - Current earley item being explored, a tree for
* which is to be constructed
* @param {number} state - Current state/index of our current position in the
* list of tokens
* @param {number} depth - Index/position in the root's rule (RHS). In other
* words, index of the next symbol to match or explore
* @param {string[]} tokens - List of input tokens
* @return {null|object[]} Null if the search provided NO results for the
* given node, or a list of tree nodes, which are the respective parse trees
* of each of the root rule's RHS symbols
*/
function dfsHelper(states, root, state, depth, tokens) {
var edges;

Expand Down Expand Up @@ -209,17 +381,6 @@ function dfsHelper(states, root, state, depth, tokens) {
}


//let sentence = '23 + ( 32 * 46 )';
//let sentence = '( 23 + 32 ) * 46';
//let sentence = '23 + 32 * 46';
//let sentence = '( ( 12 ) )';
//let sentence = '1 * 2 + 3 * 4 + 5';
//let sentence = '1 + 2 + 3';
//let sentence = '1^3 + 2 * 3(3)';
//let tree = parse(sentence, rules.rules);

//console.log(JSON.stringify(tree, null, 2));

module.exports.parse = parse;
module.exports.earley = earley;
module.exports.dfs = dfs;
45 changes: 45 additions & 0 deletions lib/rules.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,31 @@
/**
* Defines Rule and Sym classes, which are used to create productions that comprise
* a grammar
*
* @module lib/rules
*/

'use strict';

/**
* Defines a production rule, with a sole symbol on the left-hand side and a
* list of symbols on the right-hand side. The constructor also accepts a third
* argument, a valuator function, which can be used to evaluate values that are
* obtained by matching this production
*
* @class Rule
* @extends Array
* @constructor
* @memberof module:lib/rules
* @public
* @param {module:lib/rules~Sym} lhs - {@link module:lib/rules~Sym|Sym}
* representing the left hand side of the production
* @param {Array.<module:lib/rules~Sym|string|RegExp>} rhs - Sequence of
* {@link module:lib/rules~Sym|Syms}, plain strings, or RegExp objects that
* represents the right hand side of the production
* @param {Function=} valuator - Function used to evaluate values obtained by
* matching this production
*/
function Rule(lhs, rhs, valuator) {
let arr = [];

Expand All @@ -21,6 +47,20 @@ function Rule(lhs, rhs, valuator) {
Object.setPrototypeOf(Rule.prototype, Array.prototype);


/**
* Constructor for the Sym class, which simply represents a non-terminal symbol
* in a grammar. While parsing, Syms are compared by reference, not by name. So,
* the name argument is optional as it serves no purpose for parsing. For
* debugging and evaluation of a parse tree, however, the name could be quite
* useful
*
* @class Sym
* @constructor
* @memberof module:lib/rules
* @param {string=} name - Name to give to the newly created symbol. Names do not
* need to be unique among Syms in a grammar, as they are not used to compare
* equality
*/
function Sym(name) {
let symbol = {};
symbol.__proto__ = Sym.prototype;
Expand Down Expand Up @@ -83,9 +123,14 @@ let rules = [
new Rule(factor , [/\d+/] , (n) => parseFloat(n))
];


module.exports = {
rules : rules,

/** @see {@link module:lib/rules.Rule|Rule} */
Rule : Rule,

/** @see {@link module:lib/rules.Sym|Sym} */
Sym : Sym
};

17 changes: 17 additions & 0 deletions lib/tokenizer.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,22 @@
/**
* Provides a function for tokenizing a sentence given some grammar
*
* @module lib/tokenizer
*/

'use strict';

/**
* Tokenizes a sentence given some grammar by finding all terminal symbols
* within the grammar and splitting the sentence by each of those symbols
*
* @function
* @param {string} sent - Sentence or string to be split/tokenized
* @param {Array.<module:lib/rules~Rule|Rule>} grammar - List of
* [Rules]{@link module:lib/rules~Rule} that define the grammar
* @return {string[]} Tokens/the sentence, split by each terminal character
* found within the grammar
*/
module.exports = (sent, grammar) => {
let terms = grammar.reduce(
(tokens, rule) => tokens.concat(
Expand Down
20 changes: 20 additions & 0 deletions parsey.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,28 @@
/**
* Export module for the parsey package
*
* @module parsey
*/

module.exports = {
/**
* @see module:lib/parser.parse
*/
parse : require('./lib/parser').parse,

/**
* @see module:lib/tokenizer
*/
tokenize : require('./lib/tokenizer'),

/**
* @see module:lib/rules~Rule
*/
Rule : require('./lib/rules').Rule,

/**
* @see module:lib/rules~Sym
*/
Sym : require('./lib/rules').Sym
};

0 comments on commit c0de0ed

Please sign in to comment.