Added a discardDuplicateTokens property

This option is off (NO) by default. It is an optional optimization: when enabled, the same token is not indexed more than once for the same string in the same object. This cuts indexing time by a factor of 3-4 for long-form texts.
matehat · Jan 17, 2014 · 7b2df22 · 7b2df22
1 parent 178d405
commit 7b2df22
Show file tree

Hide file tree

Showing 8 changed files with 733 additions and 13 deletions.
diff --git a/Classes/MHTextIndex.h b/Classes/MHTextIndex.h
@@ -31,6 +31,7 @@ typedef struct {
 @property NSSortOptions sortOptions;
 @property NSUInteger minimalTokenLength;
 @property BOOL skipStopWords;
+@property BOOL discardDuplicateTokens;
 
 @property (strong, readonly) NSString * path;
 @property (strong, readonly) NSString * name;

diff --git a/Classes/MHTextIndex.m b/Classes/MHTextIndex.m
@@ -8,6 +8,8 @@
 
 #import "MHTextIndex.h"
 #import "MHSearchResultItem.h"
+#import "bloom-filter.h"
+#import "hash-string.h"
 #import <Objective-LevelDB/LDBWritebatch.h>
 #import <Objective-LevelDB/LDBSnapshot.h>
 
@@ -162,7 +164,7 @@ void removeIndexForStringInObject(NSData *ident, NSUInteger stringIdx, LDBWriteb
  });
  }];
 }
-void indexWordInObjectTextFragment(NSData *ident, NSStringEncoding encoding,
+void indexWordInObjectTextFragment(NSData *ident, NSStringEncoding encoding, bloom_filter_s *bloomFilter,
  NSUInteger minimalTokenLength, BOOL skipStopWords,
  NSString *wordSubstring, NSRange wordSubstringRange, NSUInteger stringIdx,
  LDBWritebatch *wb) {
@@ -219,6 +221,14 @@ void indexWordInObjectTextFragment(NSData *ident, NSStringEncoding encoding,
  options:NSStringEncodingConversionAllowLossy
  range:subRange
  remainingRange:NULL];
+
+ if (bloomFilter != NULL) {
+ if (bloom_filter_query(bloomFilter, keyPtr, usedLength) == 1)
+ continue;
+ else
+ bloom_filter_insert(bloomFilter, keyPtr, usedLength);
+ }
+
  keyPtr += usedLength;
 
  // We insert a separator with value 0 to separate the suffix from the object id
@@ -305,6 +315,7 @@ - (instancetype)initWithName:(NSString *)name path:(NSString *)path options:(Lev
 
  _minimalTokenLength = 2;
  _skipStopWords = YES;
+ _discardDuplicateTokens = NO;
 
  _path = path;
  _name = name;
@@ -373,12 +384,26 @@ - (NSOperation *)indexObject:(id)object {
  [indexedObj.strings enumerateObjectsUsingBlock:^(NSString *obj, NSUInteger idx, BOOL *stop) {
  NSStringEncoding encoding = [obj fastestEncoding];
  NSParameterAssert([obj isKindOfClass:[NSString class]]);
+
+ bloom_filter_s *bloomFilter = NULL;
+ if (_sself->_discardDuplicateTokens) {
+ size_t est_token_count = [obj maximumLengthOfBytesUsingEncoding:encoding];
+ size_t table_size = ceil((est_token_count * log(0.0001)) / log(1.0 / (pow(2.0, log(2.0)))));
+ size_t num_funcs = round(log(2.0) * table_size / est_token_count);
+
+ bloomFilter = bloom_filter_new(table_size, jenkins_nocase_hash, num_funcs);
+ }
+
  [obj enumerateSubstringsInRange:(NSRange){0, obj.length}
  options:NSStringEnumerationByWords|NSStringEnumerationLocalized
  usingBlock:^(NSString *substring, NSRange substringRange, NSRange enclosingRange, BOOL *stop) {
- indexWordInObjectTextFragment(ident, encoding, _sself->_minimalTokenLength, _sself->_skipStopWords,
+ indexWordInObjectTextFragment(ident, encoding, bloomFilter,
+ _sself->_minimalTokenLength, _sself->_skipStopWords,
  substring, substringRange, idx, wb);
  }];
+
+ if (_sself->_discardDuplicateTokens)
+ bloom_filter_free(bloomFilter);
  }];
 
  [wb setObject:newIndexedObjectStrings forKey:indexKeyForIndexedObject(ident, IndexedObjectKeyTypeStrings)];
@@ -435,12 +460,26 @@ - (NSOperation *)updateIndexForObject:(id)object {
 
  NSStringEncoding encoding = [obj fastestEncoding];
  NSParameterAssert([obj isKindOfClass:[NSString class]]);
+
+ bloom_filter_s *bloomFilter = NULL;
+ if (_sself->_discardDuplicateTokens) {
+ size_t est_token_count = [obj maximumLengthOfBytesUsingEncoding:encoding];
+ size_t table_size = ceil((est_token_count * log(0.0001)) / log(1.0 / (pow(2.0, log(2.0)))));
+ size_t num_funcs = round(log(2.0) * table_size / est_token_count);
+
+ bloomFilter = bloom_filter_new(table_size, jenkins_nocase_hash, num_funcs);
+ }
+
  [obj enumerateSubstringsInRange:(NSRange){0, obj.length}
  options:NSStringEnumerationByWords|NSStringEnumerationLocalized
  usingBlock:^(NSString *substring, NSRange substringRange, NSRange enclosingRange, BOOL *stop) {
- indexWordInObjectTextFragment(ident, encoding, _sself->_minimalTokenLength, _sself->_skipStopWords,
+ indexWordInObjectTextFragment(ident, encoding, bloomFilter,
+ _sself->_minimalTokenLength, _sself->_skipStopWords,
  substring, substringRange, idx, wb);
  }];
+
+ if (_sself->_discardDuplicateTokens)
+ bloom_filter_free(bloomFilter);
  }];
 
  if (previousStrings.count > indexedObj.strings.count) {

diff --git a/Classes/bloom-filter.c b/Classes/bloom-filter.c
@@ -0,0 +1,290 @@
+/*
+
+Copyright (c) 2005-2008, Simon Howard
+
+Permission to use, copy, modify, and/or distribute this software 
+for any purpose with or without fee is hereby granted, provided 
+that the above copyright notice and this permission notice appear 
+in all copies. 
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL 
+WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE 
+AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR 
+CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 
+LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, 
+NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "bloom-filter.h"
+
+/* malloc() / free() testing */
+
+#ifdef ALLOC_TESTING
+#include "alloc-testing.h"
+#endif
+
+/* Internal state of a bloom filter. */
+struct _bloom_filter_s {
+ /* Hash function applied to every value that is inserted or queried. */
+ bloom_filter_hash_func hash_func;
+ /* Bit table, packed eight bits per byte. */
+ unsigned char *table;
+ /* Number of bits (not bytes) in the table. */
+ unsigned int table_size;
+ /* Number of salted hashes derived for each value. */
+ unsigned int num_functions;
+};
+
+/* Salt values. Each salt is XORed with the single output of the hash
+ * function to derive a distinct sub-hash, so the number of entries here
+ * (64) is the upper bound on the number of hash functions that
+ * bloom_filter_new accepts. */
+
+static const unsigned int salts[] = {
+ 0x5cee4612, 0xb5587b1c, 0xa250f2b0, 0xa3bf6d2a, 
+ 0x7a81bd1a, 0x92888d7f, 0x1dc977c7, 0xedc96624, 
+ 0x920c85d9, 0xf16066b3, 0xc6f0d4b3, 0x2b76eb86, 
+ 0xcacb3893, 0x493d81c5, 0xf5a133ac, 0x039740bf, 
+ 0x162b8224, 0xf841de90, 0xc3e5090d, 0x3bce93a7, 
+ 0xf1860334, 0xe832b5f1, 0xf5b6535b, 0xe4cf4fa6, 
+ 0x8357b769, 0x1442b07a, 0x21c5863d, 0xabc0d846, 
+ 0x6dc0d77a, 0x23a3992c, 0xe12179ba, 0xd81d1e23, 
+ 0xcff4727b, 0xe957ecfb, 0xee8f391a, 0x426efa23, 
+ 0x3a34ff2c, 0x8b875d94, 0x34fd0f63, 0xf159daae, 
+ 0xaabab8b3, 0xa83a07ba, 0x4e54fb33, 0xfb82fab8, 
+ 0x2ae2888f, 0xd1a307a8, 0xbe33322d, 0x87c73f86, 
+ 0x7270fa7e, 0x68673c55, 0x2c8026d0, 0xead8e422, 
+ 0xa3ee5132, 0xecb67767, 0x1c3b1ae5, 0x47adf5b6, 
+ 0xf4518d30, 0x46e62797, 0x9889aa76, 0x1405aadf, 
+ 0xf62f9124, 0x5c435ac5, 0x35b8dfe3, 0x651c08c5, 
+};
+
+/*
+ * Create a new, empty bloom filter.
+ *
+ * table_size     Number of bits in the filter's table. Callers should pass
+ *                a non-zero size.
+ * hash_func      Hash function applied to inserted/queried values.
+ * num_functions  Number of salted hashes derived per value; must not
+ *                exceed the number of entries in the salts table.
+ *
+ * Returns the new filter, or NULL if num_functions is too large or memory
+ * could not be allocated. Release the result with bloom_filter_free.
+ */
+bloom_filter_s *bloom_filter_new(unsigned int table_size,
+                                 bloom_filter_hash_func hash_func,
+                                 unsigned int num_functions)
+{
+    bloom_filter_s *filter;
+    size_t table_bytes;
+
+    /* Each hash function needs its own salt, so the salts table caps
+     * how many functions can be applied. */
+
+    if (num_functions > sizeof(salts) / sizeof(*salts)) {
+        return NULL;
+    }
+
+    /* Allocate bloom filter structure */
+
+    filter = malloc(sizeof(bloom_filter_s));
+
+    if (filter == NULL) {
+        return NULL;
+    }
+
+    /* Each table entry is one bit, packed into bytes, so round the
+     * length up to the nearest byte. Computed as quotient plus
+     * remainder rather than (table_size + 7) / 8: the latter wraps
+     * around for sizes within 7 of UINT_MAX and would under-allocate
+     * the table. */
+
+    table_bytes = (size_t) (table_size / 8) + (table_size % 8 != 0 ? 1 : 0);
+
+    filter->table = calloc(table_bytes, 1);
+
+    if (filter->table == NULL) {
+        free(filter);
+        return NULL;
+    }
+
+    filter->hash_func = hash_func;
+    filter->num_functions = num_functions;
+    filter->table_size = table_size;
+
+    return filter;
+}
+
+/*
+ * Destroy a bloom filter, releasing its bit table and the filter itself.
+ *
+ * Passing NULL is a no-op, mirroring free(). bloom_filter_new returns
+ * NULL on failure, so callers that free unconditionally must not crash.
+ */
+void bloom_filter_free(bloom_filter_s *bloomfilter)
+{
+    if (bloomfilter == NULL) {
+        return;
+    }
+
+    free(bloomfilter->table);
+    free(bloomfilter);
+}
+
+/*
+ * Insert a value into the bloom filter.
+ *
+ * bloomfilter  The filter to modify.
+ * value        The value to insert; passed to the filter's hash function.
+ * length       Length of the value, passed to the hash function.
+ *
+ * A filter whose table has zero bits cannot record anything, so the call
+ * is a no-op in that case (instead of taking a modulo by zero).
+ */
+void bloom_filter_insert(bloom_filter_s *bloomfilter, bloom_filter_value value, unsigned long length)
+{
+    unsigned long hash;
+    unsigned long subhash;
+    unsigned int index;
+    unsigned int i;
+
+    /* An empty table has no bit to set; bail out before the modulo
+     * below would divide by zero. */
+
+    if (bloomfilter->table_size == 0) {
+        return;
+    }
+
+    /* Generate hash of the value to insert */
+
+    hash = bloomfilter->hash_func(value, length);
+
+    /* Generate multiple unique hashes by XORing with values in the
+     * salt table. */
+
+    for (i = 0; i < bloomfilter->num_functions; ++i) {
+
+        /* Generate a unique hash */
+
+        subhash = hash ^ salts[i];
+
+        /* Find the index into the table */
+
+        index = subhash % bloomfilter->table_size;
+
+        /* Insert into the table.
+         * index / 8 finds the byte index of the table,
+         * index % 8 gives the bit index within that byte to set. */
+
+        bloomfilter->table[index / 8] |= (unsigned char) (1u << (index % 8));
+    }
+}
+
+/*
+ * Query the bloom filter for a value.
+ *
+ * bloomfilter  The filter to consult.
+ * value        The value to look up; passed to the filter's hash function.
+ * length       Length of the value, passed to the hash function.
+ *
+ * Returns 0 if the value has definitely not been inserted, and 1 if every
+ * bit for the value is set. A result of 1 may be a false positive, since
+ * the same bits can have been set by other insertions. A filter whose
+ * table has zero bits holds nothing, so it always returns 0.
+ */
+int bloom_filter_query(bloom_filter_s *bloomfilter, bloom_filter_value value, unsigned long length)
+{
+    unsigned long hash;
+    unsigned long subhash;
+    unsigned int index;
+    unsigned int i;
+    unsigned char b;
+    unsigned int bit;
+
+    /* An empty table cannot contain anything; answering here also
+     * avoids the modulo by zero below. */
+
+    if (bloomfilter->table_size == 0) {
+        return 0;
+    }
+
+    /* Generate hash of the value to lookup */
+
+    hash = bloomfilter->hash_func(value, length);
+
+    /* Generate multiple unique hashes by XORing with values in the
+     * salt table. */
+
+    for (i = 0; i < bloomfilter->num_functions; ++i) {
+
+        /* Generate a unique hash */
+
+        subhash = hash ^ salts[i];
+
+        /* Find the index into the table to test */
+
+        index = subhash % bloomfilter->table_size;
+
+        /* The byte at index / 8 holds the value to test */
+
+        b = bloomfilter->table[index / 8];
+        bit = 1u << (index % 8);
+
+        /* Test if the particular bit is set; if it is not set,
+         * this value can not have been inserted. */
+
+        if ((b & bit) == 0) {
+            return 0;
+        }
+    }
+
+    /* All necessary bits were set. This may indicate that the value
+     * was inserted, or the bits could have been set through other
+     * insertions. */
+
+    return 1;
+}
+
+/*
+ * Copy the filter's packed bit table out into a caller-supplied buffer.
+ *
+ * array must have room for (table_size + 7) / 8 bytes.
+ */
+void bloom_filter_read(bloom_filter_s *bloomfilter, unsigned char *array)
+{
+    /* Bits are packed eight to a byte; round the bit count up to
+     * whole bytes to get the number of bytes to copy. */
+    const unsigned int byte_count = (bloomfilter->table_size + 7) / 8;
+
+    memcpy(array, bloomfilter->table, byte_count);
+}
+
+/*
+ * Overwrite the filter's packed bit table with the contents of a
+ * caller-supplied buffer.
+ *
+ * array must provide (table_size + 7) / 8 bytes.
+ */
+void bloom_filter_load(bloom_filter_s *bloomfilter, unsigned char *array)
+{
+    /* Bits are packed eight to a byte; round the bit count up to
+     * whole bytes to get the number of bytes to copy. */
+    const unsigned int byte_count = (bloomfilter->table_size + 7) / 8;
+
+    memcpy(bloomfilter->table, array, byte_count);
+}
+
+/*
+ * Build a new filter whose table is the bitwise OR of two filters.
+ *
+ * Both inputs must share the same table size, number of hash functions
+ * and hash function. Returns NULL if they differ or if the result filter
+ * cannot be created. The inputs are left untouched.
+ */
+bloom_filter_s *bloom_filter_union(bloom_filter_s *filter1, bloom_filter_s *filter2)
+{
+    bloom_filter_s *merged;
+    unsigned int byte_count;
+    unsigned int pos;
+
+    /* Filters built with different parameters are not comparable. */
+    if (filter1->table_size != filter2->table_size) {
+        return NULL;
+    }
+    if (filter1->num_functions != filter2->num_functions) {
+        return NULL;
+    }
+    if (filter1->hash_func != filter2->hash_func) {
+        return NULL;
+    }
+
+    merged = bloom_filter_new(filter1->table_size,
+                              filter1->hash_func,
+                              filter1->num_functions);
+    if (merged == NULL) {
+        return NULL;
+    }
+
+    /* The table packs bits into bytes; round up to whole bytes. */
+    byte_count = (filter1->table_size + 7) / 8;
+
+    /* OR the two tables together byte by byte. */
+    pos = 0;
+    while (pos < byte_count) {
+        merged->table[pos] = filter1->table[pos] | filter2->table[pos];
+        ++pos;
+    }
+
+    return merged;
+}
+
+/*
+ * Build a new filter whose table is the bitwise AND of two filters.
+ *
+ * Both inputs must share the same table size, number of hash functions
+ * and hash function. Returns NULL if they differ or if the result filter
+ * cannot be created. The inputs are left untouched.
+ */
+bloom_filter_s *bloom_filter_intersection(bloom_filter_s *filter1,
+                                          bloom_filter_s *filter2)
+{
+    bloom_filter_s *common;
+    unsigned int byte_count;
+    unsigned int pos;
+
+    /* Filters built with different parameters are not comparable. */
+    if (filter1->table_size != filter2->table_size) {
+        return NULL;
+    }
+    if (filter1->num_functions != filter2->num_functions) {
+        return NULL;
+    }
+    if (filter1->hash_func != filter2->hash_func) {
+        return NULL;
+    }
+
+    common = bloom_filter_new(filter1->table_size,
+                              filter1->hash_func,
+                              filter1->num_functions);
+    if (common == NULL) {
+        return NULL;
+    }
+
+    /* The table packs bits into bytes; round up to whole bytes. */
+    byte_count = (filter1->table_size + 7) / 8;
+
+    /* AND the two tables together byte by byte. */
+    pos = 0;
+    while (pos < byte_count) {
+        common->table[pos] = filter1->table[pos] & filter2->table[pos];
+        ++pos;
+    }
+
+    return common;
+}
+