-
Notifications
You must be signed in to change notification settings - Fork 1
/
check.go
258 lines (215 loc) · 6.54 KB
/
check.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
package hllcheck
import (
"encoding/binary"
"fmt"
"io"
"math"
"math/rand"
"text/tabwriter"
"github.com/e-dard/godist"
)
// Seed sets the seed used by the `math/rand` package.
var Seed = int64(1)
var params = []struct {
n uint64
p float64
}{
// Low cardinality with varying levels of duplication (density).
{n: 500, p: 0.0}, {n: 500, p: 0.25}, {n: 500, p: 0.8},
// Still small cardinality with varying levels of duplication.
{n: 1000, p: 0.0}, {n: 1000, p: 0.25}, {n: 1000, p: 0.8},
{n: 5000, p: 0.0}, {n: 5000, p: 0.25}, {n: 5000, p: 0.8},
{n: 10000, p: 0.0}, {n: 10000, p: 0.25}, {n: 10000, p: 0.8},
// Medium cardinality with varying levels of duplication.
{n: 100000, p: 0.0}, {n: 100000, p: 0.25}, {n: 100000, p: 0.8},
{n: 250000, p: 0.0}, {n: 250000, p: 0.25}, {n: 250000, p: 0.8},
{n: 500000, p: 0.0}, {n: 500000, p: 0.25}, {n: 500000, p: 0.8},
// Higher cardinality
{n: 1000000, p: 0.0}, {n: 1000000, p: 0.25}, {n: 1000000, p: 0.8},
{n: 5000000, p: 0.0}, {n: 5000000, p: 0.25}, {n: 5000000, p: 0.8},
{n: 25000000, p: 0.0}, {n: 25000000, p: 0.25}, {n: 25000000, p: 0.8},
// Very high
{n: 100000000, p: 0.0}, {n: 100000000, p: 0.25},
{n: 500000000, p: 0.0}, {n: 500000000, p: 0.25},
}
// RunData describes a set of data used for a single test run. RunData is shaped
// via the n and p parameters. n represents the size of the set, and p determines
// the probability of a duplicate value being inserted.
type RunData struct {
n uint64
p float64
i uint64 // Current progress of iterator.
x uint64 // Current value in itereator.
cardinality uint64
}
// NewRunData initialises a new RunData for evaluating the performance of a HLL++
// implementation.
func NewRunData(n uint64, p float64) *RunData {
if p < 0.0 || p > 1.0 {
panic(fmt.Sprintf("invalid p value %v", p))
}
return &RunData{n: n, p: p}
}
// Next returns the next value in the RunData. The second returned value
// determines if the iterator is complete.
func (r *RunData) Next() (uint64, bool) {
if r.i == r.n {
return 0, false
}
r.i++
if rand.Float64() >= r.p { // Move to next value and increase cardinality
r.x++
r.cardinality++
}
return r.x, true
}
// Cardinality returns the cardinality of the RunData dataset. The true
// cardinality is not known until the RunData's iterator has been drained.
// Cardinality panics if it's called prior to that.
func (r *RunData) Cardinality() uint64 {
if r.i < r.n {
panic("cardinality not available until all values have been generated")
}
return r.cardinality
}
// Size returns the size of the RunData dataset.
func (r *RunData) Size() uint64 {
return r.n
}
// Results contains a collection of results.
type Results [][]Result
// Result describes the result of an implementation on a dataset.
type Result struct {
ActualCardinality uint64
EstimatedCardinality uint64
Size uint64
}
// ErrorPercent returns the marginal error as a percentage between the estimated
// cardinality and the actual cardinaltiy. A result of -0.76 for example implies
// and error of -0.76% (the estimated cardinality was less than one percent the
// actual under the actual cardinality value).
func (r Result) ErrorPercent() float64 {
actual := float64(r.ActualCardinality)
estimated := float64(r.EstimatedCardinality)
return 100.0 * (1 - (actual / estimated))
}
// Duplication calculates how many values (as a proportion of all added values)
// were duplicates.
func (r Result) Duplication() float64 {
return 100.0 * (float64(r.Size-r.ActualCardinality) / float64(r.Size))
}
// HLL is the interface that a HLL/HLL++ algorithm must implement.
type HLL interface {
Add(v []byte)
Count() uint64
}
// HLLFactory is the interface that generates new instantiations of HLL implementations.
type HLLFactory interface {
New() HLL
}
type hllFactory struct {
f func() HLL
}
func (h hllFactory) New() HLL {
return h.f()
}
// ToHLLFatory is a helper for converting a function that returns an HLL
// implementation to an HLL Factory type.
func ToHLLFatory(f func() HLL) HLLFactory {
return hllFactory{f: f}
}
// Run runs a suite of tests against one or more HLL implementations, focussing
// on the accuracy of the algorithms. If w is non-nil, then the results will be
// written there as well as returned.
func Run(h1, h2 HLLFactory, w io.Writer) Results {
rand.Seed(Seed) // Set the seed.
if h1 == nil {
panic("must provide at least one implementation")
}
var tw *tabwriter.Writer
if w != nil {
tw = tabwriter.NewWriter(w, 24, 8, 4, '\t', 0)
}
var (
factories = []HLLFactory{h1, h2}
results = make(Results, 2)
buf = make([]byte, 8)
)
if tw != nil {
fmt.Fprint(tw, "Size\tActual Cardinality\tEstimation\tError (%)\tDuplication (%)\n")
}
for _, param := range params {
rd := NewRunData(param.n, param.p)
h1 := factories[0].New()
var h2 HLL
if factories[1] != nil {
h2 = factories[1].New()
}
for {
v, ok := rd.Next()
if !ok {
break
}
binary.BigEndian.PutUint64(buf, v)
h1.Add(buf)
if h2 != nil {
h2.Add(buf)
}
}
result1 := Result{
ActualCardinality: rd.Cardinality(),
EstimatedCardinality: h1.Count(),
Size: rd.Size(),
}
results[0] = append(results[0], result1)
var result2 Result
if h2 != nil {
result2 = Result{
ActualCardinality: rd.Cardinality(),
EstimatedCardinality: h2.Count(),
Size: rd.Size(),
}
results[1] = append(results[1], result2)
}
if tw != nil {
fmt.Fprintf(tw, "%d\t%d\t%d\t%0.4f%%\t%0.2f%%\n", result1.Size, result1.ActualCardinality, result1.EstimatedCardinality, result1.ErrorPercent(), result1.Duplication())
if h2 != nil {
fmt.Fprintf(tw, "%d\t%d\t%d\t%0.4f%%\t%0.2f%%\n", result2.Size, result2.ActualCardinality, result2.EstimatedCardinality, result2.ErrorPercent(), result2.Duplication())
}
}
if tw != nil {
fmt.Fprint(tw, "\n")
tw.Flush()
}
}
if tw == nil {
return results
}
fmt.Fprint(tw, "\n\nMean Error\tMedian Error\tError Variance\tMax Error\n")
// Analyse results.
for i, factory := range factories {
if factory == nil {
continue
}
var (
maxError float64
meanError float64
medianError float64
variance float64
)
dist := godist.Empirical{}
for _, result := range results[i] {
err := result.ErrorPercent()
if math.Abs(err) > math.Abs(maxError) {
maxError = err
}
dist.Add(err)
}
meanError, _ = dist.Mean()
medianError, _ = dist.Median()
variance, _ = dist.Variance()
fmt.Fprintf(tw, "%0.4f%%\t%0.4f%%\t%0.4f\t%0.4f%%\t\n", meanError, medianError, variance, maxError)
}
tw.Flush()
return results
}