-
Notifications
You must be signed in to change notification settings - Fork 5
/
lsh.c
112 lines (90 loc) · 3.27 KB
/
lsh.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#include <stdlib.h>
#include "lsh.h"
void free_signature_list(struct signature_list* list) {
struct signature_list* tmp;
while (list != NULL) {
tmp = list->next;
free(list);
list = tmp;
}
}
void free_hash_tables(struct lsh* tables) {
for (unsigned int i = 0 ; i < N_BUCKETS ; i++) {
for (unsigned int j = 0 ; j < tables->size ; j++) {
free_signature_list(tables->buckets[i][j]);
}
free(tables->buckets[i]);
}
free(tables);
}
static unsigned int count_signatures(struct index* database) {
unsigned int n = 0;
for (unsigned int i = 0 ; i < database->n_entries ; i++) {
n += database->entries[i]->signatures->n_signatures;
}
return n;
}
struct signature_list* new_signature_list(unsigned int entry_index, unsigned int signature_index, struct signature_list* next) {
struct signature_list* l = (struct signature_list*)malloc(sizeof(struct signature_list));
if (l == NULL) {
return NULL;
}
l->entry_index = entry_index;
l->signature_index = signature_index;
l->next = next;
return l;
}
static uint32_t get_minhash(uint8_t* hash, int index) {
int base = index * BYTES_PER_BUCKET_HASH;
return (hash[base] << 24) | (hash[base + 1] << 16) | (hash[base + 2] << 8) | hash[base + 3];
}
struct lsh* create_hash_tables(struct index* database) {
struct lsh* tables = (struct lsh*)calloc(1, sizeof(struct lsh));
if (tables == NULL) {
return NULL;
}
unsigned int total_signatures = count_signatures(database);
tables->size = total_signatures / 2;
for (unsigned int i = 0 ; i < N_BUCKETS ; i++) {
tables->buckets[i] = (struct signature_list**)calloc(tables->size, sizeof(struct signature_list*));
if (tables->buckets[i] == NULL) {
free_hash_tables(tables);
return NULL;
}
}
for (unsigned int i = 0 ; i < database->n_entries ; i++) {
for (unsigned int j = 0 ; j < database->entries[i]->signatures->n_signatures ; j++) {
uint8_t* hash = database->entries[i]->signatures->signatures[j].minhash;
for (unsigned int k = 0 ; k < N_BUCKETS ; k++) {
uint32_t index = get_minhash(hash, k) % tables->size;
struct signature_list* tmp = new_signature_list(i, j, tables->buckets[k][index]);
if (tmp == NULL) {
free_hash_tables(tables);
return NULL;
}
tables->buckets[k][index] = tmp;
}
}
}
return tables;
}
int get_matches(struct lsh* tables, uint8_t* hash, struct signature_list* *list) {
(*list) = NULL;
int n = 0;
for (unsigned int i = 0 ; i < N_BUCKETS ; i++) {
uint32_t index = get_minhash(hash, i) % tables->size;
struct signature_list* tmp = tables->buckets[i][index];
// Let's add all these matches to our list
while (tmp != NULL) {
struct signature_list* new_item = new_signature_list(tmp->entry_index, tmp->signature_index, *list);
if (new_item == NULL) {
free_signature_list(*list);
return MEMORY_ERROR;
}
(*list) = new_item;
n++;
tmp = tmp->next;
}
}
return n;
}