-
Notifications
You must be signed in to change notification settings - Fork 202
/
Copy pathindex.html
474 lines (395 loc) · 19 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
<html>
<head>
<title>Bloom Filters by Example</title>
<style type="text/css">
table#bitvector {
border-width: 1px;
border-spacing: 0px;
border-style: none;
border-color: rgb(51, 51, 51);
border-collapse: collapse;
background-color: white;
margin: auto;
}
table#bitvector th {
border-width: 1px;
padding: 1px;
border-style: solid;
border-color: rgb(221, 221, 221);
background-color: white;
-moz-border-radius: ;
}
table#bitvector td {
border-width: 1px;
padding: 1px;
border-style: solid;
border-color: rgb(221, 221, 221);
background-color: white;
-moz-border-radius: ;
}
.active {
background-color: green !important;
}
.set {
background-color: black !important;
}
#content {
width: 650px;
margin: auto;
background-color: #fff;
padding: 20px;
-webkit-border-radius: 10px;
-moz-border-radius: 10px;
border-radius: 10px;
}
body {
background-color: #d3ddb4;
color: black;
font-family: Palatino, Georgia, "Times New Roman", Times, serif;
font-size: 17px;
}
p {
line-height: 1.2em;
margin: 1em 0px;
}
sup {
font-size: 0.75em;
line-height: 0.5em;
}
.insetbox {
width: 400px;
margin: auto;
background-color: #f1f1ce;
padding: 10px;
-webkit-border-radius: 10px;
-moz-border-radius: 10px;
border-radius: 10px;
}
#addstring {}
#testmembership {}
#ismember {}
.github-corner-svg {
color: #d3ddb4;
position: absolute;
top: 0;
border: 0;
left: 0;
transform: scale(-1, 1);
}
</style>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.5.2/jquery.min.js"></script>
<script src="./murmurhash.js"></script>
<script>
// The Jenkins 96 bit mix function:
// https://web.archive.org/web/20061030103559/http://www.concentric.net/~Ttwang/tech/inthash.htm
// stolen from Google Chromium's bloom filter
// https://chromium.googlesource.com/chromium/chromium/+/refs/heads/main/chrome/browser/safe_browsing/bloom_filter.cc
// thanks dudes!
var seed1 = Math.floor(Math.random() * 2e32);
var seed2 = Math.floor(Math.random() * 2e32);
function hashMix(hash_key) {
var a = seed1;
var b = seed2;
var c = hash_key;
console.log(a, b, c);
a -= (b + c); a ^= (c >> 13);
b -= (c + a); b ^= (a << 8);
c -= (a + b); c ^= (b >> 13);
a -= (b + c); a ^= (c >> 12);
b -= (c + a); b ^= (a << 16);
c -= (a + b); c ^= (b >> 5);
a -= (b + c); a ^= (c >> 3);
b -= (c + a); b ^= (a << 10);
c -= (a + b); c ^= (b >> 15);
//XXX: can this even be negative? It was designed to run with uints. Javascript is dumb.
return Math.abs(c);
}
// thanks Borgar!
// http://stackoverflow.com/questions/1240408/reading-bytes-from-a-javascript-string/1242596#1242596
function stringToBytes(str) {
var ch, st, re = [];
for (var i = 0; i < str.length; i++) {
ch = str.charCodeAt(i); // get char
st = []; // set up "stack"
do {
st.push(ch & 0xFF); // push byte to stack
ch = ch >> 8; // shift value down by 1 byte
}
while (ch);
// add stack contents to result
// done because chars have "wrong" endianness
re = re.concat(st.reverse());
}
// return an array of bytes
return re;
}
var FNVPRIME = 0x01000193;
var FNVINIT = 0x811c9dc5;
function fnv1s(str) {
var bytes = stringToBytes(str);
var hash = FNVINIT;
for (var i = 0; i < bytes.length; i++) {
hash *= FNVPRIME;
hash ^= bytes[i];
}
return Math.abs(hash);
}
nboxes = 15;
strings = [];
function bloom(s) {
$(".active").attr("class", "set");
strings.push(s);
$("#yourset").html(strings.join(", "));
//clear the text input box
$("#addtoset").val("");
var a = fnv1s(s) % nboxes;
var b = murmur(s) % nboxes;
$("#bit[i='" + a + "']").attr("class", "active");
$("#bit[i='" + b + "']").attr("class", "active");
//the probability of a false positive is the number of 1s in the bit vector
//divided by the number of bits in the vector to the power of k, the number
//of index functions you're using
var p = Math.round((Math.pow($(".set, .active").length / nboxes, 2)) * 100);
$("#false_pos_prob").html(p + "%");
}
function testMembership(evt) {
//clear out "active" cells
$(".active").attr("class", "set");
var s = $("#membership").val();
var a = fnv1s(s) % nboxes;
var b = murmur(s) % nboxes;
if ($("#bit[i='" + a + "']").attr("class") == "set" &&
$("#bit[i='" + b + "']").attr("class") == "set") {
$("#ismember").html("maybe!");
} else {
$("#ismember").html("no");
}
$("#memfnv").html(a)
$("#memmurmur").html(b)
}
$(function () {
//add the table cells which represent our bloom filter bit array
for (var i = 0; i < nboxes; i++) {
$("#bits").append('<td id="bit" i="' + i + '" width=20> </td>');
$("#labels").append('<td id="label" i="' + i + '" align="center">' + i + '</td>');
}
//handle a click on the "add to bloom filter" button
$("#hash").click(function () {
var s = $("#addtoset").val();
$("#fnv").html(fnv1s(s) % nboxes)
$("#murmur").html(murmur(s) % nboxes)
bloom(s);
});
// handle enter key on "add to bloom filter" form
$('#addtoset').keydown(function (event) {
if (event.keyCode == '13') {
event.preventDefault();
$('#hash').click();
}
});
$("#membership").keyup(testMembership);
});
</script>
</head>
<body>
<a aria-label="View source on GitHub" class="github-corner" href="https://github.com/llimllib/bloomfilter-tutorial"
rel="noopener">
<svg aria-hidden="true" class="github-corner-svg" height="110" viewBox="0 0 250 250" width="110">
<path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z" />
<path
d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2"
fill="currentColor" class="octo-arm" />
<path
d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z"
fill="currentColor" class="octo-body" />
</svg>
</a>
<div id="content">
<span style="float: right"><a href="./zh_CN/">简体中文</a></span>
<h1>Bloom Filters by Example</h1>
<p>A Bloom filter is a data structure designed to tell you, rapidly and memory-efficiently, whether an element is
present in a set.
<p>The price paid for this efficiency is that a Bloom filter is a <strong>probabilistic data structure</strong>: it
tells us that the element either <em>definitely is not</em> in the set or <em>may be</em> in the set.
<p>The base data structure of a Bloom filter is a <strong>Bit Vector</strong>. Here's a small one we'll use to
demonstrate:
<div class="insetbox">
<table id="bitvector" border=1 cellpadding=0 cellspacing=0>
<tr id="bits"></tr>
<tr id="labels"></tr>
</table>
</div>
<p>Each empty cell in that table represents a bit, and the number below it its index. To add an element to the Bloom
filter, we simply hash it a few times and set the bits in the bit vector at the index of those hashes to 1.
<p>It's easier to see what that means than explain it, so enter some strings and see how the bit vector changes. Fnv
and Murmur are two simple hash functions:
<div id="addstring" class="insetbox">
<p>Enter a string: <input id="addtoset"><input type="submit" id="hash" value="add to bloom filter">
<div id="hashes">
fnv: <span id="fnv"></span><br>
murmur: <span id="murmur"></span>
</div>
<p>Your set: [<span id="yourset"></span>]
</div>
<p>When you add a string, you can see that the bits at the index given by the hashes are set to 1. I've used the
color green to show the newly added ones, but any colored cell is simply a 1.
<p>To test for membership, you simply hash the string with the same hash functions, then see if those values are set
in the bit vector. If they aren't, you know that the element isn't in the set. If they are, you only know that it
<em>might</em> be, because another element or some combination of other elements could have set the same bits.
Again, let's demonstrate:
<div id="testmembership" class="insetbox">
<p>Test an element for membership: <input id="membership">
<div id="memhashes">
fnv: <span id="memfnv"></span><br>
murmur: <span id="memmurmur"></span>
</div>
<p>Is the element in the set? <span id="ismember">no</span>
<p>Probability of a false positive: <span id="false_pos_prob">0%</span>
</div>
<p>And that's the basics of a bloom filter!
<h2>Advanced Topics</h2>
<p>Before I write a bit more about Bloom filters, a disclaimer: I've never used them in production. Don't take my
word for it. All I intend to do is give you general ideas and pointers to where you can find out more.
<p>In the following text, we will refer to a Bloom filter with <em>k</em> hashes, <em>m</em> bits in the filter, and
<em>n</em> elements that have been inserted.
<h3>Hash Functions</h3>
<p>The hash functions used in a Bloom filter should be <strong><a
href="http://en.wiktionary.org/wiki/independent_function">independent</a></strong> and <strong><a
href="http://en.wikipedia.org/wiki/Uniform_distribution_(discrete)">uniformly distributed</a></strong>. They
should also be as fast as possible (cryptographic hashes such as sha1, though widely used therefore are not very
good choices).
<p>Examples of fast, simple hashes that are independent enough<sup><a href="#footnote3">3</a></sup> include <a
href="https://sites.google.com/site/murmurhash/">murmur</a>, <a
href="https://github.com/Cyan4973/xxHash">xxHash</a>, the <a
href="http://isthe.com/chongo/tech/comp/fnv/">fnv</a> series of hashes, and <a
href="https://web.archive.org/web/20061030103559/http://www.concentric.net/~Ttwang/tech/inthash.htm">HashMix</a>.
<p>To see the difference that a faster-than-cryptographic hash function can make, <a
href="https://github.com/bitly/dablooms/pull/19">check out this story</a> of a ~800% speedup when switching a
bloom filter implementation from md5 to murmur.
<p>In a short survey of bloom filter implementations:
<ul>
<li>
<a
href="https://github.com/chromium/chromium/blob/faf8581c2f9cdcb590d3544530c88a00c043461b/components/optimization_guide/core/bloom_filter.cc">Chromium</a>
uses <a
href="https://github.com/chromium/chromium/blob/faf8581c2f9cdcb590d3544530c88a00c043461b/components/optimization_guide/core/bloom_filter.cc#L17-L26">murmur</a>.
(also, <a
href="https://web.archive.org/web/20160306232658/http://blog.alexyakunin.com/2010/03/nice-bloom-filter-application.html">here's</a>
a short description of how they use bloom filters)
</li>
<li>
<a href="https://plan9.io/sources/plan9/sys/src/cmd/venti/srv/bloom.c">Plan9</a> uses a simple hash as proposed
in <a href="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.152.579&rank=1">Mitzenmacher 2005</a>
</li>
<li>
<a href="https://github.com/sdroege/snippets/blob/master/snippets/bloomfilter.c">Sdroege Bloom filter</a> uses
fnv1a (included just because I wanted to show one that uses fnv.)
</li>
<li>
<a href="https://github.com/squid-cache/squid/blob/master/src/store_key_md5.cc">Squid</a> uses MD5
</li>
<li>
<a
href="https://github.com/RedisBloom/RedisBloom/blob/be2ef438ddd4343f12d4689eeaed9be21ff491f1/src/cuckoo.h#L38">RedisBloom</a>
uses murmur
</li>
<li>
<a
href="https://github.com/apache/spark/blob/93251ed77ea1c5d037c64d2292b8760b03c8e181/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilterImpl.java#L87">Apache
Spark</a> uses murmur
</li>
<li>
<a
href="https://github.com/influxdata/influxdb/blob/e20b5e99a6f92614b1b97bda0807ef6c2ab1f7e9/pkg/bloom/bloom.go#L3">influxdb</a>
uses xxhash
</li>
<li>
<a
href="https://github.com/armon/bloomd/blob/23c19a7f5cbb35d7c3d970bef13bc2bfbe3625f6/src/libbloom/bloom.c#L306">bloomd</a>
(a neat project that uses a redis-ish protocol) uses murmur for the first two hashes, <a
href="https://burtleburtle.net/bob/hash/spooky.html">SpookyHash</a> for the second two hashes, and a
combination of the two for further hashes, as described in <sup><a href="#footnote3">3</a></sup>
</li>
<li>
<a href="https://github.com/hashlookup/fleur">fleur (C)</a>, <a href="https://github.com/DCSO/flor">flor
(python)</a>, and <a href="https://github.com/DCSO/bloom">bloom (go)</a> all use fnv
</li>
<li>
<a
href="https://github.com/sqlite/sqlite/blob/aa07b36dd52cfb4d7690d4e8917ea0187d44b405/src/vdbe.c#L674">Sqlite</a>
added a bloom filter for analytic queries, but I do not understand the hash algorithm. Dr. Hipp <a
href="https://sqlite.org/forum/forumpost/bd59962986a5668b">explains the purpose</a> of the filters on the
sqlite forum.
</li>
<li>
<a
href="https://github.com/facebook/rocksdb/blob/88bc91f3cc2b492b8a45ba2c49650f527df97ad8/util/bloom_impl.h">RocksDB</a>
is configurable, but claims <a
href="https://github.com/facebook/rocksdb/blob/88bc91f3cc2b492b8a45ba2c49650f527df97ad8/util/hash.cc#L67-L83">in
the source</a> that xxh3, a member of the <a href="https://github.com/Cyan4973/xxHash#benchmarks">xxhash
family</a> performed best for them
<ul>
<li>They also link <a
href="https://www.khoury.northeastern.edu/home/pete/pub/bloom-filters-verification.pdf">"Bloom Filters in
Probabilistic Verification"</a> by Dillinger and Maniolios, but it's pretty far over my head.</li>
</ul>
</li>
<li>
<a
href="https://github.com/scylladb/scylladb/blob/10a11c2886cffb99f1be06fd5bf1920d5b0953b6/utils/bloom_filter.cc#L75-L84">ScyllaDB</a>
uses murmur
</li>
</ul>
<h3>How big should I make my Bloom filter?</h3>
<p>It's a nice property of Bloom filters that you can modify the false positive rate of your filter. A larger filter
will have less false positives, and a smaller one more.
<p>Your false positive rate will be approximately <em>(1-e<sup>-kn/m</sup>)<sup>k</sup></em>, so you can just plug
the number <em>n</em> of elements you expect to insert, and try various values of <em>k</em> and <em>m</em> to
configure your filter for your application.<sup><a href="#footnote2">2</a></sup>
<p>This leads to an obvious question:
<h3>How many hash functions should I use?</h3>
<p>The more hash functions you have, the slower your bloom filter, and the quicker it fills up. If you have too few,
however, you may suffer too many false positives.
<p>Since you have to pick <em>k</em> when you create the filter, you'll have to ballpark what range you expect
<em>n</em> to be in. Once you have that, you still have to choose a potential <em>m</em> (the number of bits) and
<em>k</em> (the number of hash functions).
<p>It seems a difficult optimization problem, but fortunately, given an <em>m</em> and an <em>n</em>, we have a
function to choose the optimal value of <em>k</em>: <em>(m/n)ln(2)</em> <sup><a href="#footnote2">2</a>, <a
href="#footnote3">3</a></sup>
<p>So, to choose the size of a bloom filter, we:
<p>
<ol>
<li>Choose a ballpark value for <em>n</em>
<li>Choose a value for <em>m</em>
<li>Calculate the optimal value of <em>k</em>
<li>Calculate the error rate for our chosen values of <em>n</em>, <em>m</em>, and <em>k</em>. If it's
unacceptable, return to step 2 and change m; otherwise we're done.
</ol>
<h3>How fast and space efficient is a Bloom filter?</h3>
<p>Given a Bloom filter with <em>m</em> bits and <em>k</em> hashing functions, both insertion and membership testing
are <em>O(k)</em>. That is, each time you want to add an element to the set or check set membership, you just need
to run the element through the <em>k</em> hash functions and add it to the set or check those bits.
<p>The space advantages are more difficult to sum up; again it depends on the error rate you're willing to tolerate.
It also depends on the potential range of the elements to be inserted; if it is very limited, a deterministic bit
vector can do better. If you can't even ballpark estimate the number of elements to be inserted, you may be better
off with a hash table or a scalable Bloom filter<sup><a href="#footnote4">4</a></sup>.
<h3>What can I use them for?</h3>
<p>I'll link you to <a href="http://en.wikipedia.org/wiki/Bloom_filter#Examples">wiki</a> instead of copying what
they say. <a
href="https://archive.org/details/pyvideo_402___handling-ridiculous-amounts-of-data-with-probabilistic-data-structures">C.
Titus Brown</a> also has an excellent talk on an application of Bloom filters to bioinformatics.
<h3>References</h3>
<p><a name="footnote1">1:</span> <a
href="http://citeseer.ist.psu.edu/viewdoc/download;jsessionid=6CA79DD1A90B3EFD3D62ACE5523B99E7?doi=10.1.1.127.9672&rep=rep1&type=pdf">Network
Applications of Bloom Filters: A Survey</a>, Broder and Mitzenmacher. An excellent overview.
<p><a name="footnote2">2:</span> <a
href="http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives">Wikipedia</a>, which has
an excellent and comprehensive page on Bloom filters
<p><a name="footnote3">3:</span> <a
href="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.152.579&rank=1">Less Hashing, Same
Performance</a>, Kirsch and Mitzenmacher
<p><a name="footnote4">4:</span> <a href="http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf">Scalable
Bloom Filters</a>, Almeida et al
</div>
</body>
</html>