Document everything, pretty much

patgrasso · Aug 3, 2016 · c0de0ed · c0de0ed
1 parent f142ca1
commit c0de0ed
Show file tree

Hide file tree

Showing 4 changed files with 254 additions and 11 deletions.
diff --git a/lib/parser.js b/lib/parser.js
@@ -1,17 +1,69 @@
+/**
+ * Provides functions for parsing sentences with a pre-constructed grammar.
+ * The parsing algorithm is an implementation of the earley top-down chart
+ * parser. The parse tree is constructed from the chart using depth-first search
+ *
+ * @module lib/parser
+ */
 /*eslint no-console:0*/
+
 'use strict';
 
 const rules = require('./rules');
 const tokenize = require('./tokenizer');
 
 
+/**
+ * Tokenizes, then parses, the input string with the given grammar. The result
+ * is a parse tree represented with plain objects. See
+ * {@link module:lib/parser.dfs|dfs()} for an example of the structure of a
+ * parse tree
+ *
+ * @function parse
+ * @memberof module:lib/parser
+ * @param {string} sent - Input string to parse
+ * @param {module:lib/rules~Rule[]} grammar - Set of
+ * {@link module:lib/rules~Rule|Rules} that define a language
+ * @param {Function} [tokenizer={@link module:lib/tokenizer|tokenize}] -
+ * Function that accepts a string and a grammar (optionally) and splits
+ * the input string into tokens, each representing a symbol in the
+ * language
+ * @return {object} Root node of the parse tree
+ */
 function parse(sent, grammar, tokenizer) {
  // Caller-supplied tokenizer and grammar win; otherwise use the bundled ones
  const splitter = tokenizer || tokenize;
  const activeGrammar = grammar || rules.rules;

  // Tokenize, build the earley chart, then walk the chart into a parse tree
  const tokens = splitter(sent, activeGrammar);
  const chart = earley(tokens, activeGrammar);

  return dfs(chart, tokens);
 }
 
 
+/**
+ * Parses the input tokens using the earley top-down chart parsing algorithm
+ * to produce a set of states, each containing a list of earley items
+ *
+ * @function earley
+ * @memberof module:lib/parser
+ * @param {string[]} tokens - Sequence of symbols to be parsed
+ * @param {module:lib/rules~Rule[]} grammar - Set of rules that define a
+ * language
+ * @return {state[]} Set of 'states', each of which contains a list of earley
+ * items. Each earley item looks something like this:
+ * <pre><code>
+ * {
+ * name: [string],
+ * rule: [Rule],
+ * position: [number],
+ * origin: [number]
+ * }
+ * </code></pre>
+ * An earley item represents a completed parse of some individual rule. The
+ * position should be equivalent to rule.length, and the origin, despite its
+ * name, describes the state at which parse finished.
+ *
+ * This means that an earley item <i>should</i> exist in state 0 with an
+ * origin equivalent to the number of tokens passed in to indicate that the
+ * entire input was parsed successfully for some rule
+ */
 function earley(tokens, grammar) {
  let states = Array.apply(null, Array(tokens.length + 1)).map(() => []);
  var i, j;
@@ -37,6 +89,22 @@ function earley(tokens, grammar) {
 }
 
 
+/**
+ * Prediction stage in the earley algorithm. Given an earley item, determine
+ * if the next symbol to be processed is a non-terminal, and if so add all
+ * rules whose LHS equals that symbol to the current earley state.
+ *
+ * This also avoids adding duplicate rules to a state, a pitfall caused by
+ * left-recursive grammars
+ *
+ * @function predict
+ * @param {string[]} tokens - Input tokens being parsed
+ * @param {state[]} states - Set of lists of earley items
+ * @param {number} i - Index of the earley state to be processed
+ * @param {number} j - Index of the earley item to be processed within the state
+ * @param {module:lib/rules~Rule[]} grammar - Set of rules that define the
+ * language
+ */
 function predict(tokens, states, i, j, grammar) {
  let curr = states[i][j];
 
@@ -61,6 +129,19 @@ function predict(tokens, states, i, j, grammar) {
 }
 
 
+/**
+ * Scanning stage in the earley algorithm. Given an earley item, determine if
+ * the next symbol to be processed is a terminal, and if so see if it matches
+ * the tokens at the state/index described by `i`. If the token matches, add
+ * an earley item to the next state that is a duplicate of this one, except
+ * whose position is one greater
+ *
+ * @function scan
+ * @param {string[]} tokens - Input tokens being parsed
+ * @param {state[]} states - Set of lists of earley items
+ * @param {number} i - Index of the earley state to be processed
+ * @param {number} j - Index of the earley item to be processed within the state
+ */
 function scan(tokens, states, i, j) {
  let newItem
  , curr = states[i][j];
@@ -84,6 +165,23 @@ function scan(tokens, states, i, j) {
  }
 }
 
+
+/**
+ * Completion stage in the earley algorithm. If the current earley item's
+ * position is greater than or equal to the length of its rule, it has
+ * successfully parsed the rule it represents!
+ *
+ * Once this has occurred, go back to the state in which the earley item
+ * originated and find all earley items whose next symbol to match matches our
+ * earley item, and add them to the current state, incrementing their positions
+ * accordingly
+ *
+ * @function complete
+ * @param {string[]} tokens - Input tokens being parsed
+ * @param {state[]} states - Set of lists of earley items
+ * @param {number} i - Index of the earley state to be processed
+ * @param {number} j - Index of the earley item to be processed within the state
+ */
 function complete(tokens, states, i, j) {
  let newItem
  , curr = states[i][j];
@@ -109,12 +207,37 @@ function complete(tokens, states, i, j) {
  }
 }
 
+
+/**
+ * Removes earley items from each state that failed to completely parse through.
+ * In other words, removes earley items whose positions are less than the
+ * lengths of their rules
+ *
+ * @function removeUnfinishedItems
+ * @param {state[]} states - Set of lists of earley items
+ * @return {state[]} Set of lists of completed earley items
+ */
 function removeUnfinishedItems(states) {
  // An item is finished once its position has reached the end of its rule
  const isFinished = (item) => {
   const consumed = item.position;
   const needed = item.rule.length;
   return consumed >= needed;
  };

  // Build a new list of states; the input states are left untouched
  return states.map((items) => items.filter(isFinished));
 }
 
+
+/**
+ * Places earley items in the states in which they originated, as opposed to the
+ * states in which they finished parsing, and sets their <code>origin</code>
+ * properties to the state in which they finished.
+ *
+ * This allows a depth-first search of the chart to move forwards through the
+ * graph, which is more intuitive than having to move backwards
+ *
+ * @function swap
+ * @param {state[]} states - Set of lists of earley items
+ * @return {state[]} Set of lists of earley items, but each item now exists in
+ * the state at which it originated, and the <code>origin</code> property of
+ * each item points to the state at which the parse completed
+ */
 function swap(states) {
  let newStates = Array.apply(null, Array(states.length)).map(() => []);
 
@@ -127,6 +250,38 @@ function swap(states) {
  return newStates;
 }
 
+
+/**
+ * Performs a depth-first search on the chart generated by
+ * {@link module:lib/parser~earley|earley()} in order to construct a parse tree,
+ * an example of which is shown below
+ *
+ * @example
+ * {
+ * item: <Rule sum -> [factor, '+', factor]>,
+ * children: [
+ * { // first symbol - 'factor'
+ * item: <Rule factor -> [/\d+/]>,
+ * children: [
+ * '2'
+ * ]
+ * },
+ * '+', // second symbol
+ * { // third symbol - another 'factor'
+ * item: <Rule factor -> [/\d+/]>,
+ * children: [
+ * '3'
+ * ]
+ * }
+ * ]
+ * }
+ *
+ * @function dfs
+ * @memberof module:lib/parser
+ * @param {state[]} states - Set of lists of earley items
+ * @param {string[]} tokens - Input tokens to be parsed
+ * @return {object} Root node of the parse tree
+ */
 function dfs(states, tokens) {
  let root = states[0].reduce((best, curr) => {
  if (best == null || curr.origin > best.origin) {
@@ -149,6 +304,23 @@ function dfs(states, tokens) {
 }
 
 
+/**
+ * Recursive function that explores a specific earley item, constructs the parse
+ * tree for it, then sends it up the chimney!
+ *
+ * @function dfsHelper
+ * @param {state[]} states - Set of lists of earley items
+ * @param {earleyItem} root - Current earley item being explored, a tree for
+ * which is to be constructed
+ * @param {number} state - Current state/index of our current position in the
+ * list of tokens
+ * @param {number} depth - Index/position in the root's rule (RHS). In other
+ * words, index of the next symbol to match or explore
+ * @param {string[]} tokens - List of input tokens
+ * @return {null|object[]} Null if the search provided NO results for the
+ * given node, or a list of tree nodes, which are the respective parse trees
+ * of each of the root rule's RHS symbols
+ */
 function dfsHelper(states, root, state, depth, tokens) {
  var edges;
 
@@ -209,17 +381,6 @@ function dfsHelper(states, root, state, depth, tokens) {
 }
 
 
-//let sentence = '23 + ( 32 * 46 )';
-//let sentence = '( 23 + 32 ) * 46';
-//let sentence = '23 + 32 * 46';
-//let sentence = '( ( 12 ) )';
-//let sentence = '1 * 2 + 3 * 4 + 5';
-//let sentence = '1 + 2 + 3';
-//let sentence = '1^3 + 2 * 3(3)';
-//let tree = parse(sentence, rules.rules);
-
-//console.log(JSON.stringify(tree, null, 2));
-
 module.exports.parse = parse;
 module.exports.earley = earley;
 module.exports.dfs = dfs;
diff --git a/lib/rules.js b/lib/rules.js
@@ -1,5 +1,31 @@
+/**
+ * Defines Rule and Sym classes, which are used to create productions that
+ * comprise a grammar
+ *
+ * @module lib/rules
+ */
+
 'use strict';
 
+/**
+ * Defines a production rule, with a sole symbol on the left-hand side and a
+ * list of symbols on the right-hand side. The constructor also accepts a third
+ * argument, a valuator function, which can be used to evaluate values that are
+ * obtained by matching this production
+ *
+ * @class Rule
+ * @extends Array
+ * @constructor
+ * @memberof module:lib/rules
+ * @public
+ * @param {module:lib/rules~Sym} lhs - {@link module:lib/rules~Sym|Sym}
+ * representing the left hand side of the production
+ * @param {Array.<module:lib/rules~Sym|string|RegExp>} rhs - Sequence of
+ * {@link module:lib/rules~Sym|Syms}, plain strings, or RegExp objects that
+ * represents the right hand side of the production
+ * @param {Function=} valuator - Function used to evaluate values obtained by
+ * matching this production
+ */
 function Rule(lhs, rhs, valuator) {
  let arr = [];
 
@@ -21,6 +47,20 @@ function Rule(lhs, rhs, valuator) {
 Object.setPrototypeOf(Rule.prototype, Array.prototype);
 
 
+/**
+ * Constructor for the Sym class, which simply represents a non-terminal symbol
+ * in a grammar. While parsing, Syms are compared by reference, not by name. So,
+ * the name argument is optional as it serves no purpose for parsing. For
+ * debugging and evaluation of a parse tree, however, the name could be quite
+ * useful
+ *
+ * @class Sym
+ * @constructor
+ * @memberof module:lib/rules
+ * @param {string=} name - Name to give to the newly created symbol. Names do not
+ * need to be unique among Syms in a grammar, as they are not used to compare
+ * equality
+ */
 function Sym(name) {
  let symbol = {};
  symbol.__proto__ = Sym.prototype;
@@ -83,9 +123,14 @@ let rules = [
  new Rule(factor , [/\d+/] , (n) => parseFloat(n))
 ];
 
+
 module.exports = {
  rules : rules,
+
+ /** @see {@link module:lib/rules.Rule|Rule} */
  Rule : Rule,
+
+ /** @see {@link module:lib/rules.Sym|Sym} */
  Sym : Sym
 };
 
diff --git a/lib/tokenizer.js b/lib/tokenizer.js
@@ -1,5 +1,22 @@
+/**
+ * Provides a function for tokenizing a sentence given some grammar
+ *
+ * @module lib/tokenizer
+ */
+
 'use strict';
 
+/**
+ * Tokenizes a sentence given some grammar by finding all terminal symbols
+ * within the grammar and splitting the sentence by each of those symbols
+ *
+ * @function
+ * @param {string} sent - Sentence or string to be split/tokenized
+ * @param {Array.<module:lib/rules~Rule|Rule>} grammar - List of
+ * [Rules]{@link module:lib/rules~Rule} that define the grammar
+ * @return {string[]} Tokens/the sentence, split by each terminal character
+ * found within the grammar
+ */
 module.exports = (sent, grammar) => {
  let terms = grammar.reduce(
  (tokens, rule) => tokens.concat(

diff --git a/parsey.js b/parsey.js
@@ -1,8 +1,28 @@
+/**
+ * Export module for the parsey package
+ *
+ * @module parsey
+ */
 
 module.exports = {
+ /**
+ * @see module:lib/parser.parse
+ */
  parse : require('./lib/parser').parse,
+
+ /**
+ * @see module:lib/tokenizer
+ */
  tokenize : require('./lib/tokenizer'),
+
+ /**
+ * @see module:lib/rules~Rule
+ */
  Rule : require('./lib/rules').Rule,
+
+ /**
+ * @see module:lib/rules~Sym
+ */
  Sym : require('./lib/rules').Sym
 };