Add wordforms/search_suggest

Results of lexemes/search_suggest are wrapped in 'lexeme' for consistency.
MLRS · Jul 4, 2020 · 5fb002f · 5fb002f
1 parent 1c31cf8
commit 5fb002f
Show file tree

Hide file tree

Showing 8 changed files with 222 additions and 40 deletions.
diff --git a/README.md b/README.md
@@ -39,6 +39,16 @@ Run all tests with `npm test`.
 Run an individual testsuite with `npx mocha --exit test/schema.js` or use the `--grep` flag.
 To stop on first failure, use `--bail`
 
+### Using test data
+
+1. Set DB URL in `server-config.js` to `...gabra-test` (or something else)
+2. ```
+node scripts/node/populate.js test/data/*.json
+node scripts/node/resolve-lexeme-ids.js
+node scripts/node/create-indexes.js
+(cd scripts/node && ./run.js update-glosses-collection.js)
+```
+
 ## Repository
 
 - `master` branch is used for development.

diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "gabra-api",
- "version": "2.12.0",
+ "version": "2.13.0",
  "description": "Ġabra: an open lexicon for Maltese",
  "author": "John J. Camilleri <[email protected]> (https://johnjcamilleri.com/)",
  "license": "MIT",

diff --git a/public/markdown/api.md b/public/markdown/api.md
@@ -96,12 +96,14 @@ The results are sorted by part of speech and derived form, and will not include
 |:-----------------|:------------|:---------------------------|
 | `:id` (required) | Lexeme ID | `5200a366e36f237975000f26` |
 
-### Search suggest
+### Search suggest <small>Changed in v2.13</small>
 
-List variations in spelling (diacritics, character case) of a search term, from lemmas:
+Find words that match the search term apart from variations in spelling (diacritics, character case), from lemmas or wordforms:
 
 > [/lexemes/search_suggest?s=Hareg](#{baseURL}/lexemes/search_suggest?s=Hareg)
 
+> [/wordforms/search_suggest?s=ohorgu](#{baseURL}/wordforms/search_suggest?s=ohorgu)
+
 | Argument | Description | Example |
 |:---------------|:-------------|:--------|
 | `s` (required) | Search query | `Hareg` |

diff --git a/routes/lexemes.js b/routes/lexemes.js
@@ -178,24 +178,24 @@ router.get('/search', function (req, res) {
  collection.find(conds_l, opts),
  collection.count(conds_l)
  ])
- .then(values => {
- var docs = values[0]
- var count = values[1]
- queryObj.result_count = count
- res.json({
- 'results': docs.map(doc => {
- return {
- 'lexeme': doc
- }
- }),
- 'query': queryObj
- })
- })
- .catch(err => {
- console.error(err)
- res.status(500).end()
+ .then(values => {
+ var docs = values[0]
+ var count = values[1]
+ queryObj.result_count = count
+ res.json({
+ 'results': docs.map(doc => {
+ return {
+ 'lexeme': doc
+ }
+ }),
+ 'query': queryObj
+ })
+ })
+ .catch(err => {
+ console.error(err)
+ res.status(500).end()
+ })
  })
- })
 })
 
 /*
@@ -312,17 +312,16 @@ router.get('/search_suggest', function (req, res) {
  // s = s.replace(/^([^\[])/, function (m,c,o,s) { return '[' + c.toUpperCase() + ']'})
 
  // Handle diacritics
- s = s.replace(/^\^/, '')
- s = s.replace(/\$$/, '')
  s = s.replace(/c/g, 'ċ')
  s = s.replace(/g/g, '[gġ]')
  s = s.replace(/h/g, '[hħ]')
  s = s.replace(/z/g, '[zż]')
 
  // No substrings
+ s = s.replace(/^\^/, '')
+ s = s.replace(/\$$/, '')
  s = '^' + s + '$'
 
- var collection = db.get('lexemes')
  var query = {
  '$or': [
  {
@@ -334,23 +333,24 @@ router.get('/search_suggest', function (req, res) {
  ],
  'pending': {'$ne': true}
  }
+
  var opts = {
  'projection': {'lemma': true}
  }
- collection.find(query, opts, function (err, docs) {
- if (err) {
+ db.get('lexemes').find(query, opts)
+ .catch(function (err) {
  console.error(err)
  res.status(500).end()
- return
- }
- res.json({
- 'results': docs,
- 'query': {
- 'term': orig,
- 'result_count': docs.length
- }
  })
- })
+ .then(function (data) {
+ res.json({
+ 'results': data.map((l) => { return {'lexeme': l} }),
+ 'query': {
+ 'term': orig,
+ 'result_count': data.length
+ }
+ })
+ })
 })
 
 /*

diff --git a/routes/wordforms.js b/routes/wordforms.js
@@ -2,6 +2,7 @@ var express = require('express')
 var router = express.Router()
 var passport = require('passport')
 var async = require('async')
+var regexquote = require('regexp-quote')
 var monk = require('monk')
 
 var log = require('./helpers/logger').makeLogger('wordforms')
@@ -154,6 +155,57 @@ router.post('/replace/:lexeme_id',
  })
  })
 
+/*
+ * GET search suggest
+ */
+router.get('/search_suggest', function (req, res) {
+ var db = req.db
+
+ var orig = req.query.s
+ var s = regexquote(orig)
+
+ // Handle capitalisation
+ s = s.toLowerCase()
+ // s = s.replace(/^\[(.+?)\]/, function (m,c,o,s) { return '[' + c.toLowerCase() + c.toUpperCase() + ']'})
+ // s = s.replace(/^([^\[])/, function (m,c,o,s) { return '[' + c.toUpperCase() + ']'})
+
+ // Handle diacritics
+ s = s.replace(/c/g, 'ċ')
+ s = s.replace(/g/g, '[gġ]')
+ s = s.replace(/h/g, '[hħ]')
+ s = s.replace(/z/g, '[zż]')
+
+ // No substrings
+ s = s.replace(/^\^/, '')
+ s = s.replace(/\$$/, '')
+ s = '^' + s + '$'
+
+ var query = {
+ 'surface_form': {'$regex': s, '$ne': orig},
+ 'pending': {'$ne': true}
+ }
+ var opts = {
+ 'projection': {
+ 'surface_form': true,
+ 'lexeme_id': true
+ }
+ }
+ db.get('wordforms').find(query, opts)
+ .catch(function (err) {
+ console.error(err)
+ res.status(500).end()
+ })
+ .then(function (data) {
+ res.json({
+ 'results': data.map((l) => { return {'wordform': l} }),
+ 'query': {
+ 'term': orig,
+ 'result_count': data.length
+ }
+ })
+ })
+})
+
 /*
  * GET count
  */

diff --git a/test/data/wordforms.json b/test/data/wordforms.json
@@ -123,5 +123,79 @@
  "Camilleri2013"
  ],
  "pending" : true
+},
+{
+ "aspect": "perf",
+ "dir_obj": null,
+ "generated": true,
+ "ind_obj": null,
+ "lexeme": {
+ "lemma": "ħareġ",
+ "pos": "VERB"
+ },
+ "phonetic": "hrɪʧt",
+ "polarity": "pos",
+ "sources": ["Camilleri2013"],
+ "subject": {
+ "person": "p1",
+ "number": "sg"
+ },
+ "surface_form": "ħriġt"
+},
+{
+ "aspect": "perf",
+ "dir_obj": null,
+ "generated": true,
+ "ind_obj": null,
+ "lexeme": {
+ "lemma": "ħareġ",
+ "pos": "VERB"
+ },
+ "phonetic": "hrɪʧt",
+ "polarity": "pos",
+ "sources": ["Camilleri2013"],
+ "subject": {
+ "person": "p2",
+ "number": "sg"
+ },
+ "surface_form": "ħriġt"
+},
+{
+ "aspect": "perf",
+ "dir_obj": null,
+ "generated": true,
+ "ind_obj": null,
+ "lexeme": {
+ "lemma": "ħareġ",
+ "pos": "VERB"
+ },
+ "phonetic": "hɐrɛʧ",
+ "polarity": "pos",
+ "sources": ["Camilleri2013"],
+ "subject": {
+ "person": "p3",
+ "number": "sg",
+ "gender": "m"
+ },
+ "surface_form": "ħareġ"
+},
+{
+ "aspect": "perf",
+ "dir_obj": null,
+ "generated": true,
+ "ind_obj": null,
+ "lexeme": {
+ "lemma": "ħareġ",
+ "pos": "VERB"
+ },
+ "phonetic": "hɐrʤɛt",
+ "polarity": "pos",
+ "sources": ["Camilleri2013"],
+ "subject": {
+ "person": "p3",
+ "number": "sg",
+ "gender": "f"
+ },
+ "surface_form": "ħarġet"
 }
 ]
diff --git a/test/search.js b/test/search.js
@@ -22,13 +22,23 @@ describe('Search', function () {
  res.body.query.result_count.should.be.greaterThanOrEqual(opts.result_count)
  }
 
- // Results should contain these lemmas (in any order)
+ // Lexeme results should contain these lemmas (in any order)
  if (opts.lemmas) {
  for (let i in opts.lemmas) {
  let lemma = opts.lemmas[i]
  res.body.results.should.matchAny(function (value) {
  value.lexeme.lemma.should.equal(lemma)
- }, 'lemma "' + lemma + '" not found in results')
+ }, `lemma "${lemma}" not found in results`)
+ }
+ }
+
+ // Wordform results should contain these surface forms (in any order)
+ if (opts.surface_forms) {
+ for (let i in opts.surface_forms) {
+ let sf = opts.surface_forms[i]
+ res.body.results.should.matchAny(function (value) {
+ value.wordform.surface_form.should.equal(sf)
+ }, `surface form "${sf}" not found in results`)
  }
  }
 
@@ -93,6 +103,24 @@ describe('Search', function () {
 
  // -------------------------------------------------------------------------
 
+ describe('Search suggest', function () {
+ it('suggest lexeme', function (done) {
+ request(server)
+ .get('/lexemes/search_suggest?s=Hareg')
+ .expect(200)
+ .end(checkResponse({lemmas: ['ħareġ']}, done))
+ })
+
+ it('suggest wordform', function (done) {
+ request(server)
+ .get('/wordforms/search_suggest?s=harget')
+ .expect(200)
+ .end(checkResponse({surface_forms: ['ħarġet']}, done))
+ })
+ })
+
+ // -------------------------------------------------------------------------
+
  describe('Load stuff', function () {
  var lexeme_id