Skip to content

Commit

Permalink
Added a discardDuplicateTokens property
Browse files Browse the repository at this point in the history
This option is off (NO) by default. It is an optimization that, when turned on, avoids indexing the same token more than once for the same string in the same object. This cuts indexing time by a factor of 3-4 for long-form texts.
  • Loading branch information
matehat committed Jan 17, 2014
1 parent 178d405 commit 7b2df22
Show file tree
Hide file tree
Showing 8 changed files with 733 additions and 13 deletions.
1 change: 1 addition & 0 deletions Classes/MHTextIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ typedef struct {
@property NSSortOptions sortOptions;
@property NSUInteger minimalTokenLength;
@property BOOL skipStopWords;
@property BOOL discardDuplicateTokens;

@property (strong, readonly) NSString * path;
@property (strong, readonly) NSString * name;
Expand Down
45 changes: 42 additions & 3 deletions Classes/MHTextIndex.m
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

#import "MHTextIndex.h"
#import "MHSearchResultItem.h"
#import "bloom-filter.h"
#import "hash-string.h"
#import <Objective-LevelDB/LDBWritebatch.h>
#import <Objective-LevelDB/LDBSnapshot.h>

Expand Down Expand Up @@ -162,7 +164,7 @@ void removeIndexForStringInObject(NSData *ident, NSUInteger stringIdx, LDBWriteb
});
}];
}
void indexWordInObjectTextFragment(NSData *ident, NSStringEncoding encoding,
void indexWordInObjectTextFragment(NSData *ident, NSStringEncoding encoding, bloom_filter_s *bloomFilter,
NSUInteger minimalTokenLength, BOOL skipStopWords,
NSString *wordSubstring, NSRange wordSubstringRange, NSUInteger stringIdx,
LDBWritebatch *wb) {
Expand Down Expand Up @@ -219,6 +221,14 @@ void indexWordInObjectTextFragment(NSData *ident, NSStringEncoding encoding,
options:NSStringEncodingConversionAllowLossy
range:subRange
remainingRange:NULL];

if (bloomFilter != NULL) {
if (bloom_filter_query(bloomFilter, keyPtr, usedLength) == 1)
continue;
else
bloom_filter_insert(bloomFilter, keyPtr, usedLength);
}

keyPtr += usedLength;

// We insert a separator with value 0 to separate the suffix from the object id
Expand Down Expand Up @@ -305,6 +315,7 @@ - (instancetype)initWithName:(NSString *)name path:(NSString *)path options:(Lev

_minimalTokenLength = 2;
_skipStopWords = YES;
_discardDuplicateTokens = NO;

_path = path;
_name = name;
Expand Down Expand Up @@ -373,12 +384,26 @@ - (NSOperation *)indexObject:(id)object {
[indexedObj.strings enumerateObjectsUsingBlock:^(NSString *obj, NSUInteger idx, BOOL *stop) {
NSStringEncoding encoding = [obj fastestEncoding];
NSParameterAssert([obj isKindOfClass:[NSString class]]);

bloom_filter_s *bloomFilter = NULL;
if (_sself->_discardDuplicateTokens) {
size_t est_token_count = [obj maximumLengthOfBytesUsingEncoding:encoding];
size_t table_size = ceil((est_token_count * log(0.0001)) / log(1.0 / (pow(2.0, log(2.0)))));
size_t num_funcs = round(log(2.0) * table_size / est_token_count);

bloomFilter = bloom_filter_new(table_size, jenkins_nocase_hash, num_funcs);
}

[obj enumerateSubstringsInRange:(NSRange){0, obj.length}
options:NSStringEnumerationByWords|NSStringEnumerationLocalized
usingBlock:^(NSString *substring, NSRange substringRange, NSRange enclosingRange, BOOL *stop) {
indexWordInObjectTextFragment(ident, encoding, _sself->_minimalTokenLength, _sself->_skipStopWords,
indexWordInObjectTextFragment(ident, encoding, bloomFilter,
_sself->_minimalTokenLength, _sself->_skipStopWords,
substring, substringRange, idx, wb);
}];

if (_sself->_discardDuplicateTokens)
bloom_filter_free(bloomFilter);
}];

[wb setObject:newIndexedObjectStrings forKey:indexKeyForIndexedObject(ident, IndexedObjectKeyTypeStrings)];
Expand Down Expand Up @@ -435,12 +460,26 @@ - (NSOperation *)updateIndexForObject:(id)object {

NSStringEncoding encoding = [obj fastestEncoding];
NSParameterAssert([obj isKindOfClass:[NSString class]]);

bloom_filter_s *bloomFilter = NULL;
if (_sself->_discardDuplicateTokens) {
size_t est_token_count = [obj maximumLengthOfBytesUsingEncoding:encoding];
size_t table_size = ceil((est_token_count * log(0.0001)) / log(1.0 / (pow(2.0, log(2.0)))));
size_t num_funcs = round(log(2.0) * table_size / est_token_count);

bloomFilter = bloom_filter_new(table_size, jenkins_nocase_hash, num_funcs);
}

[obj enumerateSubstringsInRange:(NSRange){0, obj.length}
options:NSStringEnumerationByWords|NSStringEnumerationLocalized
usingBlock:^(NSString *substring, NSRange substringRange, NSRange enclosingRange, BOOL *stop) {
indexWordInObjectTextFragment(ident, encoding, _sself->_minimalTokenLength, _sself->_skipStopWords,
indexWordInObjectTextFragment(ident, encoding, bloomFilter,
_sself->_minimalTokenLength, _sself->_skipStopWords,
substring, substringRange, idx, wb);
}];

if (_sself->_discardDuplicateTokens)
bloom_filter_free(bloomFilter);
}];

if (previousStrings.count > indexedObj.strings.count) {
Expand Down
290 changes: 290 additions & 0 deletions Classes/bloom-filter.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
/*
Copyright (c) 2005-2008, Simon Howard
Permission to use, copy, modify, and/or distribute this software
for any purpose with or without fee is hereby granted, provided
that the above copyright notice and this permission notice appear
in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/

#include <stdlib.h>
#include <string.h>

#include "bloom-filter.h"

/* malloc() / free() testing */

#ifdef ALLOC_TESTING
#include "alloc-testing.h"
#endif

/* Internal representation of a bloom filter. The layout is private to
 * this file; the routines below are the only code here that touches it. */
struct _bloom_filter_s {
/* Hash function invoked on every inserted or queried value. */
bloom_filter_hash_func hash_func;
/* Bit table, packed eight entries per byte; allocated by
 * bloom_filter_new() as (table_size + 7) / 8 zeroed bytes. */
unsigned char *table;
/* Number of bits (not bytes) in the table. */
unsigned int table_size;
/* Number of salted hashes derived from each value. */
unsigned int num_functions;
};

/* Salt values. Each salt is XORed with the output of the hash
 * function, so that a single hash computation yields several
 * different table indices. The table holds 64 entries, and
 * bloom_filter_new() refuses a num_functions larger than that. */

static const unsigned int salts[] = {
0x5cee4612, 0xb5587b1c, 0xa250f2b0, 0xa3bf6d2a,
0x7a81bd1a, 0x92888d7f, 0x1dc977c7, 0xedc96624,
0x920c85d9, 0xf16066b3, 0xc6f0d4b3, 0x2b76eb86,
0xcacb3893, 0x493d81c5, 0xf5a133ac, 0x039740bf,
0x162b8224, 0xf841de90, 0xc3e5090d, 0x3bce93a7,
0xf1860334, 0xe832b5f1, 0xf5b6535b, 0xe4cf4fa6,
0x8357b769, 0x1442b07a, 0x21c5863d, 0xabc0d846,
0x6dc0d77a, 0x23a3992c, 0xe12179ba, 0xd81d1e23,
0xcff4727b, 0xe957ecfb, 0xee8f391a, 0x426efa23,
0x3a34ff2c, 0x8b875d94, 0x34fd0f63, 0xf159daae,
0xaabab8b3, 0xa83a07ba, 0x4e54fb33, 0xfb82fab8,
0x2ae2888f, 0xd1a307a8, 0xbe33322d, 0x87c73f86,
0x7270fa7e, 0x68673c55, 0x2c8026d0, 0xead8e422,
0xa3ee5132, 0xecb67767, 0x1c3b1ae5, 0x47adf5b6,
0xf4518d30, 0x46e62797, 0x9889aa76, 0x1405aadf,
0xf62f9124, 0x5c435ac5, 0x35b8dfe3, 0x651c08c5,
};

/* Create a new, empty bloom filter.
 *
 * table_size     Number of bits in the filter's table.
 * hash_func      Hash function applied to every inserted or queried value.
 * num_functions  Number of salted hashes derived per value.
 *
 * Returns the new filter, or NULL if num_functions exceeds the number of
 * available salts, if table_size is so large that its byte length cannot
 * be computed without wrapping, or if memory allocation fails. */
bloom_filter_s *bloom_filter_new(unsigned int table_size,
                                 bloom_filter_hash_func hash_func,
                                 unsigned int num_functions)
{
    bloom_filter_s *filter;

    /* Every hash function needs its own salt, so the number of
     * functions is limited by the size of the salts table. */

    if (num_functions > sizeof(salts) / sizeof(*salts)) {
        return NULL;
    }

    /* The table's byte length is computed as (table_size + 7) / 8, here
     * and in the other routines of this file. For a table_size within 7
     * of the largest unsigned int the addition wraps around, which would
     * yield a table far smaller than the indices later written to it.
     * Refuse such sizes instead of allocating an undersized table. */

    if (table_size + 7 < table_size) {
        return NULL;
    }

    /* Allocate bloom filter structure */

    filter = malloc(sizeof(bloom_filter_s));

    if (filter == NULL) {
        return NULL;
    }

    /* Allocate table, each entry is one bit; these are packed into
     * bytes. When allocating we must round the length up to the nearest
     * byte. calloc() hands the table back zeroed, i.e. with no bit set. */

    filter->table = calloc((table_size + 7) / 8, 1);

    if (filter->table == NULL) {
        free(filter);
        return NULL;
    }

    filter->hash_func = hash_func;
    filter->num_functions = num_functions;
    filter->table_size = table_size;

    return filter;
}

/* Release a bloom filter and its bit table.
 *
 * bloomfilter  The filter to destroy; may be NULL, in which case the
 *              call does nothing (mirroring free()). bloom_filter_new()
 *              returns NULL on failure, so callers that free
 *              unconditionally must not crash here. */
void bloom_filter_free(bloom_filter_s *bloomfilter)
{
    if (bloomfilter == NULL) {
        return;
    }

    free(bloomfilter->table);
    free(bloomfilter);
}

/* Record a value in the bloom filter.
 *
 * bloomfilter  The filter to update.
 * value        The value to insert; passed through to the hash function.
 * length       Length of the value; passed through to the hash function.
 *
 * A filter whose table has zero bits cannot record anything, so the call
 * is a no-op in that case. */
void bloom_filter_insert(bloom_filter_s *bloomfilter, bloom_filter_value value, unsigned long length)
{
    unsigned long hash;
    unsigned long subhash;
    unsigned int index;
    unsigned int i;

    /* Reducing a hash modulo zero is undefined; an empty table has no
     * bit to set anyway. */

    if (bloomfilter->table_size == 0) {
        return;
    }

    /* Generate hash of the value to insert */

    hash = bloomfilter->hash_func(value, length);

    /* Generate multiple unique hashes by XORing with values in the
     * salt table. */

    for (i = 0; i < bloomfilter->num_functions; ++i) {

        /* Generate a unique hash */

        subhash = hash ^ salts[i];

        /* Find the index into the table */

        index = subhash % bloomfilter->table_size;

        /* Insert into the table.
         * index / 8 finds the byte index of the table,
         * index % 8 gives the bit index within that byte to set. */

        bloomfilter->table[index / 8] |= (unsigned char) (1u << (index % 8));
    }
}

/* Test whether a value may have been inserted into the bloom filter.
 *
 * bloomfilter  The filter to consult.
 * value        The value to look up; passed through to the hash function.
 * length       Length of the value; passed through to the hash function.
 *
 * Returns 0 if the value has definitely not been inserted, and 1 if
 * every bit the value maps to is set. A result of 1 can be a false
 * positive, because those bits may have been set by other insertions.
 * A filter whose table has zero bits always yields 0. */
int bloom_filter_query(bloom_filter_s *bloomfilter, bloom_filter_value value, unsigned long length)
{
    unsigned long hash;
    unsigned long subhash;
    unsigned int index;
    unsigned int i;
    unsigned char b;
    unsigned int bit;

    /* Reducing a hash modulo zero is undefined; nothing can have been
     * recorded in a table without bits. */

    if (bloomfilter->table_size == 0) {
        return 0;
    }

    /* Generate hash of the value to lookup */

    hash = bloomfilter->hash_func(value, length);

    /* Generate multiple unique hashes by XORing with values in the
     * salt table. */

    for (i = 0; i < bloomfilter->num_functions; ++i) {

        /* Generate a unique hash */

        subhash = hash ^ salts[i];

        /* Find the index into the table to test */

        index = subhash % bloomfilter->table_size;

        /* The byte at index / 8 holds the value to test */

        b = bloomfilter->table[index / 8];
        bit = 1u << (index % 8);

        /* Test if the particular bit is set; if it is not set,
         * this value can not have been inserted. */

        if ((b & bit) == 0) {
            return 0;
        }
    }

    /* All necessary bits were set. This may indicate that the value
     * was inserted, or the values could have been set through other
     * insertions. */

    return 1;
}

/* Copy the filter's packed bit table out into a caller-supplied buffer.
 *
 * bloomfilter  The filter whose table is exported.
 * array        Destination buffer; receives (table_size + 7) / 8 bytes. */
void bloom_filter_read(bloom_filter_s *bloomfilter, unsigned char *array)
{
    /* Eight table entries share one byte; a trailing partial byte
     * still occupies a whole byte. */
    const unsigned int byte_count = (bloomfilter->table_size + 7) / 8;

    memcpy(array, bloomfilter->table, byte_count);
}

/* Overwrite the filter's packed bit table with the contents of a
 * caller-supplied buffer.
 *
 * bloomfilter  The filter whose table is replaced.
 * array        Source buffer; (table_size + 7) / 8 bytes are read from it. */
void bloom_filter_load(bloom_filter_s *bloomfilter, unsigned char *array)
{
    /* Eight table entries share one byte; a trailing partial byte
     * still occupies a whole byte. */
    const unsigned int byte_count = (bloomfilter->table_size + 7) / 8;

    memcpy(bloomfilter->table, array, byte_count);
}

/* Build a new filter whose table is the bitwise OR of two filters' tables.
 *
 * filter1, filter2  The filters to combine. They must share the same
 *                   table size, number of functions and hash function.
 *
 * Returns a newly created filter, or NULL if the two filters were not
 * created with the same values or the new filter could not be created. */
bloom_filter_s *bloom_filter_union(bloom_filter_s *filter1, bloom_filter_s *filter2)
{
    bloom_filter_s *merged;
    unsigned int byte_count;
    unsigned int pos;
    int compatible;

    /* Tables can only be combined bit by bit when both filters map
     * values to bits in exactly the same way. */
    compatible = filter1->table_size == filter2->table_size
              && filter1->num_functions == filter2->num_functions
              && filter1->hash_func == filter2->hash_func;

    if (!compatible) {
        return NULL;
    }

    merged = bloom_filter_new(filter1->table_size,
                              filter1->hash_func,
                              filter1->num_functions);

    if (merged == NULL) {
        return NULL;
    }

    /* Bits are packed into bytes, so walk the tables bytewise; the
     * length is the bit count rounded up to a whole byte. */
    byte_count = (filter1->table_size + 7) / 8;

    pos = 0;
    while (pos < byte_count) {
        merged->table[pos] = filter1->table[pos] | filter2->table[pos];
        ++pos;
    }

    return merged;
}

/* Build a new filter whose table is the bitwise AND of two filters' tables.
 *
 * filter1, filter2  The filters to combine. They must share the same
 *                   table size, number of functions and hash function.
 *
 * Returns a newly created filter, or NULL if the two filters were not
 * created with the same values or the new filter could not be created. */
bloom_filter_s *bloom_filter_intersection(bloom_filter_s *filter1,
                                          bloom_filter_s *filter2)
{
    bloom_filter_s *common;
    unsigned int byte_count;
    unsigned int pos;
    int compatible;

    /* Tables can only be combined bit by bit when both filters map
     * values to bits in exactly the same way. */
    compatible = filter1->table_size == filter2->table_size
              && filter1->num_functions == filter2->num_functions
              && filter1->hash_func == filter2->hash_func;

    if (!compatible) {
        return NULL;
    }

    common = bloom_filter_new(filter1->table_size,
                              filter1->hash_func,
                              filter1->num_functions);

    if (common == NULL) {
        return NULL;
    }

    /* Bits are packed into bytes, so walk the tables bytewise; the
     * length is the bit count rounded up to a whole byte. */
    byte_count = (filter1->table_size + 7) / 8;

    pos = 0;
    while (pos < byte_count) {
        common->table[pos] = filter1->table[pos] & filter2->table[pos];
        ++pos;
    }

    return common;
}

Loading

0 comments on commit 7b2df22

Please sign in to comment.