bloom.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2018 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "platform.h"
  14. #include "jlib.hpp"
  15. #include "jset.hpp"
  16. #include "bloom.hpp"
  17. #include "math.h"
  18. #include "eclhelper.hpp"
  19. #include "rtlrecord.hpp"
  20. BloomFilter::BloomFilter(unsigned _cardinality, double _probability)
  21. {
  22. unsigned cardinality = _cardinality ? _cardinality : 1;
  23. double probability = _probability >= 0.3 ? 0.3 : (_probability < 0.01 ? 0.01 : _probability);
  24. numBits = rtlRoundUp(-(cardinality*log(probability))/pow(log(2),2));
  25. unsigned tableSize = (numBits + 7) / 8;
  26. numBits = tableSize * 8;
  27. numHashes = round((numBits * log(2))/cardinality);
  28. table = (byte *) calloc(tableSize, 1);
  29. }
  30. BloomFilter::BloomFilter(unsigned _numHashes, unsigned _tableSize, byte *_table)
  31. {
  32. numBits = _tableSize * 8;
  33. numHashes = _numHashes;
  34. table = _table; // Note - takes ownership
  35. }
  36. BloomFilter::~BloomFilter()
  37. {
  38. free(table);
  39. }
  40. void BloomFilter::add(hash64_t hash)
  41. {
  42. uint32_t hash1 = hash >> 32;
  43. uint32_t hash2 = hash & 0xffffffff;
  44. for (unsigned i=0; i < numHashes; i++)
  45. {
  46. // Kirsch and Mitzenmacher technique (Harvard U)
  47. uint64_t bit = (hash1 + (i * hash2)) % numBits;
  48. uint64_t slot = bit / 8;
  49. unsigned shift = bit % 8;
  50. unsigned mask = 1 << shift;
  51. table[slot] |= mask;
  52. }
  53. }
  54. bool BloomFilter::test(hash64_t hash) const
  55. {
  56. uint32_t hash1 = hash >> 32;
  57. uint32_t hash2 = hash & 0xffffffff;
  58. for (unsigned i=0; i < numHashes; i++)
  59. {
  60. // Kirsch and Mitzenmacher technique (Harvard U)
  61. uint64_t bit = (hash1 + (i * hash2)) % numBits;
  62. uint64_t slot = bit / 8;
  63. unsigned shift = bit % 8;
  64. unsigned mask = 1 << shift;
  65. if (!(table[slot] & mask))
  66. return false;
  67. }
  68. return true;
  69. }
  70. IndexBloomFilter::IndexBloomFilter(unsigned _numHashes, unsigned _tableSize, byte *_table, __uint64 _fields)
  71. : BloomFilter(_numHashes, _tableSize, _table), fields(_fields)
  72. {}
  73. int IndexBloomFilter::compare(CInterface *const *_a, CInterface *const *_b)
  74. {
  75. const IndexBloomFilter *a = static_cast<IndexBloomFilter *>(*_a);
  76. const IndexBloomFilter *b = static_cast<IndexBloomFilter *>(*_b);
  77. return a->fields - b->fields;
  78. }
  79. bool IndexBloomFilter::reject(const IIndexFilterList &filters) const
  80. {
  81. hash64_t hashval = HASH64_INIT;
  82. return getBloomHash(fields, filters, hashval) && !test(hashval);
  83. }
  84. extern bool getBloomHash(__int64 fields, const IIndexFilterList &filters, hash64_t &hashval)
  85. {
  86. while (fields)
  87. {
  88. unsigned f = ffsll(fields)-1; // extract lowest 1 bit
  89. fields &= ~ (((__uint64) 1)<<f); // and clear it
  90. const IIndexFilter *filter = filters.item(f);
  91. if (filter)
  92. {
  93. assertex(filter->queryFieldIndex() == f);
  94. if (!filter->getBloomHash(hashval))
  95. return false;
  96. }
  97. }
  98. return true;
  99. }
  100. class RowHasher : public CInterfaceOf<IRowHasher>
  101. {
  102. public:
  103. RowHasher(const RtlRecord &_recInfo, __uint64 _fields);
  104. virtual hash64_t hash(const byte *row) const override;
  105. virtual __uint64 queryFields() const override { return fields; }
  106. private:
  107. const RtlRecord &recInfo;
  108. const __uint64 fields;
  109. };
  110. class SimpleRowHasher : public RowHasher
  111. {
  112. public:
  113. SimpleRowHasher(const RtlRecord &_recInfo, __uint64 _fields, unsigned _offset, unsigned _length);
  114. virtual hash64_t hash(const byte *row) const override;
  115. private:
  116. const unsigned offset;
  117. const unsigned length;
  118. };
  119. RowHasher::RowHasher(const RtlRecord &_recInfo, __uint64 _fields) : recInfo(_recInfo), fields(_fields)
  120. {
  121. }
  122. hash64_t RowHasher::hash(const byte *row) const
  123. {
  124. auto lfields = fields;
  125. hash64_t hashval = HASH64_INIT;
  126. // Assumes fixed size fields for now. Could probably optimize a bit
  127. while (lfields)
  128. {
  129. unsigned f = ffsll(lfields)-1; // extract lowest 1 bit
  130. lfields &= ~ (((__uint64) 1)<<f); // and clear it
  131. hashval = rtlHash64Data(recInfo.queryType(f)->getMinSize(), row + recInfo.getFixedOffset(f), hashval);
  132. }
  133. return hashval;
  134. }
  135. SimpleRowHasher::SimpleRowHasher(const RtlRecord &_recInfo, __uint64 _fields, unsigned _offset, unsigned _length)
  136. : RowHasher(_recInfo, _fields), offset(_offset), length(_length)
  137. {
  138. }
  139. hash64_t SimpleRowHasher::hash(const byte *row) const
  140. {
  141. return rtlHash64Data(length, row + offset, HASH64_INIT);
  142. }
  143. // For cases where we know data is sorted
  144. class jhtree_decl SortedBloomBuilder : public CInterfaceOf<IBloomBuilder>
  145. {
  146. public:
  147. SortedBloomBuilder(const IBloomBuilderInfo &_helper);
  148. SortedBloomBuilder(unsigned _maxHashes, double _probability);
  149. virtual const BloomFilter * build() const override;
  150. virtual bool add(hash64_t val) override;
  151. virtual unsigned queryCount() const override;
  152. virtual bool valid() const override;
  153. protected:
  154. ArrayOf<hash64_t> hashes;
  155. const unsigned maxHashes;
  156. hash64_t lastHash = 0;
  157. const double probability = 0.0;
  158. bool isValid = true;
  159. };
  160. SortedBloomBuilder::SortedBloomBuilder(const IBloomBuilderInfo &helper)
  161. : maxHashes(helper.getBloomLimit()),
  162. probability(helper.getBloomProbability())
  163. {
  164. if (maxHashes==0 || !helper.getBloomEnabled())
  165. isValid = false;
  166. }
  167. SortedBloomBuilder::SortedBloomBuilder(unsigned _maxHashes, double _probability)
  168. : maxHashes(_maxHashes),
  169. probability(_probability)
  170. {
  171. if (maxHashes==0)
  172. isValid = false;
  173. }
  174. unsigned SortedBloomBuilder::queryCount() const
  175. {
  176. return hashes.length();
  177. }
  178. bool SortedBloomBuilder::add(hash64_t hash)
  179. {
  180. if (isValid && (hash != lastHash || !hashes.length()))
  181. {
  182. if (hashes.length()==maxHashes)
  183. {
  184. isValid = false;
  185. hashes.kill();
  186. }
  187. else
  188. hashes.append(hash);
  189. lastHash = hash;
  190. }
  191. return isValid;
  192. }
  193. bool SortedBloomBuilder::valid() const
  194. {
  195. return isValid && hashes.length();
  196. }
  197. const BloomFilter * SortedBloomBuilder::build() const
  198. {
  199. if (!valid())
  200. return nullptr;
  201. BloomFilter *b = new BloomFilter(hashes.length(), probability);
  202. ForEachItemIn(idx, hashes)
  203. {
  204. b->add(hashes.item(idx));
  205. }
  206. return b;
  207. }
  208. // For cases where we do not know data is sorted - use a hash table to store the hashes
  209. class jhtree_decl UnsortedBloomBuilder : public CInterfaceOf<IBloomBuilder>
  210. {
  211. public:
  212. UnsortedBloomBuilder(const IBloomBuilderInfo &_helper);
  213. UnsortedBloomBuilder(unsigned _maxHashes, double _probability);
  214. ~UnsortedBloomBuilder();
  215. virtual const BloomFilter * build() const override;
  216. virtual bool add(hash64_t val) override;
  217. virtual unsigned queryCount() const override;
  218. virtual bool valid() const override;
  219. protected:
  220. hash64_t *hashes = nullptr;
  221. const unsigned maxHashes;
  222. const unsigned tableSize;
  223. unsigned tableCount = 0;
  224. const double probability = 0.0;
  225. };
  226. UnsortedBloomBuilder::UnsortedBloomBuilder(const IBloomBuilderInfo &helper)
  227. : maxHashes(helper.getBloomLimit()),
  228. probability(helper.getBloomProbability()),
  229. tableSize(((helper.getBloomLimit()*4)/3)+1)
  230. {
  231. if (tableSize && helper.getBloomEnabled())
  232. {
  233. hashes = (hash64_t *) calloc(sizeof(hash64_t), tableSize);
  234. }
  235. }
  236. UnsortedBloomBuilder::UnsortedBloomBuilder(unsigned _maxHashes, double _probability)
  237. : maxHashes(_maxHashes),
  238. probability(_probability),
  239. tableSize(((_maxHashes*4)/3)+1)
  240. {
  241. if (tableSize)
  242. hashes = (hash64_t *) calloc(sizeof(hash64_t), tableSize);
  243. }
  244. UnsortedBloomBuilder::~UnsortedBloomBuilder()
  245. {
  246. free(hashes);
  247. }
  248. unsigned UnsortedBloomBuilder::queryCount() const
  249. {
  250. return tableCount;
  251. }
  252. bool UnsortedBloomBuilder::add(hash64_t hash)
  253. {
  254. if (hashes)
  255. {
  256. if (!hash)
  257. {
  258. // Something that genuinely hashes to zero cannot be handled - so just mark the builder as invalid
  259. free(hashes);
  260. hashes = nullptr;
  261. tableCount = 0;
  262. }
  263. else
  264. {
  265. unsigned pos = hash % tableSize;
  266. for (;;)
  267. {
  268. hash64_t val = hashes[pos];
  269. if (!val)
  270. break;
  271. if (val== hash)
  272. return true;
  273. pos++;
  274. if (pos == tableSize)
  275. pos = 0;
  276. }
  277. if (tableCount==maxHashes)
  278. {
  279. free(hashes);
  280. hashes = nullptr;
  281. tableCount = 0;
  282. }
  283. else
  284. {
  285. hashes[pos] = hash;
  286. tableCount++;
  287. }
  288. }
  289. }
  290. return hashes != nullptr;
  291. }
  292. bool UnsortedBloomBuilder::valid() const
  293. {
  294. return tableCount != 0;
  295. }
  296. const BloomFilter * UnsortedBloomBuilder::build() const
  297. {
  298. if (!valid())
  299. return nullptr;
  300. BloomFilter *b = new BloomFilter(tableCount, probability);
  301. for (unsigned idx = 0; idx < tableSize; idx++)
  302. {
  303. hash64_t val = hashes[idx];
  304. if (val)
  305. b->add(val);
  306. }
  307. return b;
  308. }
  309. extern jhtree_decl IBloomBuilder *createBloomBuilder(const IBloomBuilderInfo &helper)
  310. {
  311. __uint64 fields = helper.getBloomFields();
  312. if (!(fields & (fields+1))) // only true if all the ones are at the lsb end...
  313. return new SortedBloomBuilder(helper);
  314. else
  315. return new UnsortedBloomBuilder(helper);
  316. }
  317. extern jhtree_decl IRowHasher *createRowHasher(const RtlRecord &recInfo, __uint64 fields)
  318. {
  319. if (!(fields & (fields-1))) // Only one bit set
  320. {
  321. unsigned field = ffsll(fields)-1;
  322. if (recInfo.isFixedOffset(field) && recInfo.queryType(field)->isFixedSize())
  323. return new SimpleRowHasher(recInfo, fields, recInfo.getFixedOffset(field), recInfo.queryType(field)->getMinSize()); // Specialize to speed up most common case
  324. }
  325. else if (!(fields & (fields+1))) // only true if all the ones are at the lsb end...
  326. {
  327. unsigned lastField = ffsll(fields+1)-2;
  328. if (recInfo.isFixedOffset(lastField) && recInfo.queryType(lastField)->isFixedSize())
  329. return new SimpleRowHasher(recInfo, fields, 0, recInfo.queryType(lastField)->getMinSize()); // Specialize to speed up another common case - fixed-size block at start
  330. }
  331. return new RowHasher(recInfo, fields);
  332. }
  333. #ifdef _USE_CPPUNIT
  334. #include "unittests.hpp"
  335. class BloomTest : public CppUnit::TestFixture
  336. {
  337. CPPUNIT_TEST_SUITE(BloomTest);
  338. CPPUNIT_TEST(testSortedBloom);
  339. CPPUNIT_TEST(testUnsortedBloom);
  340. CPPUNIT_TEST(testFailedSortedBloomBuilder);
  341. CPPUNIT_TEST(testFailedUnsortedBloomBuilder);
  342. CPPUNIT_TEST_SUITE_END();
  343. const unsigned count = 1000000;
  344. void testSortedBloom()
  345. {
  346. SortedBloomBuilder b(count, 0.01);
  347. for (unsigned val = 0; val < count; val++)
  348. {
  349. b.add(rtlHash64Data(sizeof(val), &val, HASH64_INIT));
  350. b.add(rtlHash64Data(sizeof(val), &val, HASH64_INIT));
  351. }
  352. Owned<const BloomFilter> f = b.build();
  353. unsigned falsePositives = 0;
  354. unsigned falseNegatives = 0;
  355. unsigned start = usTick();
  356. for (unsigned val = 0; val < count; val++)
  357. {
  358. if (!f->test(rtlHash64Data(sizeof(val), &val, HASH64_INIT)))
  359. falseNegatives++;
  360. if (f->test(rtlHash64Data(sizeof(val), &val, HASH64_INIT+1)))
  361. falsePositives++;
  362. }
  363. unsigned end = usTick();
  364. ASSERT(falseNegatives==0);
  365. DBGLOG("Bloom filter (%d, %d) gave %d false positives (%.02f %%) in %d uSec", f->queryNumHashes(), f->queryTableSize(), falsePositives, (falsePositives * 100.0)/count, end-start);
  366. }
  367. void testUnsortedBloom()
  368. {
  369. UnsortedBloomBuilder b(count, 0.01);
  370. for (unsigned val = 0; val < count; val++)
  371. b.add(rtlHash64Data(sizeof(val), &val, HASH64_INIT));
  372. for (unsigned val = 0; val < count; val++)
  373. b.add(rtlHash64Data(sizeof(val), &val, HASH64_INIT));
  374. Owned<const BloomFilter> f = b.build();
  375. unsigned falsePositives = 0;
  376. unsigned falseNegatives = 0;
  377. unsigned start = usTick();
  378. for (unsigned val = 0; val < count; val++)
  379. {
  380. if (!f->test(rtlHash64Data(sizeof(val), &val, HASH64_INIT)))
  381. falseNegatives++;
  382. if (f->test(rtlHash64Data(sizeof(val), &val, HASH64_INIT+1)))
  383. falsePositives++;
  384. }
  385. unsigned end = usTick();
  386. ASSERT(falseNegatives==0);
  387. DBGLOG("Bloom filter (%d, %d) gave %d false positives (%.02f %%) in %d uSec", f->queryNumHashes(), f->queryTableSize(), falsePositives, (falsePositives * 100.0)/count, end-start);
  388. }
  389. void testFailedSortedBloomBuilder()
  390. {
  391. SortedBloomBuilder b1(0, 0.01);
  392. ASSERT(!b1.valid())
  393. ASSERT(!b1.add(0))
  394. SortedBloomBuilder b2(1, 0.01);
  395. ASSERT(b2.add(1))
  396. ASSERT(!b2.add(2))
  397. SortedBloomBuilder b3(10, 0.01);
  398. ASSERT(b3.add(1))
  399. ASSERT(b3.add(0)) // ok to add 0 to sorted bloom tables
  400. ASSERT(b3.add(2))
  401. }
  402. void testFailedUnsortedBloomBuilder()
  403. {
  404. UnsortedBloomBuilder b1(0, 0.01);
  405. ASSERT(!b1.valid())
  406. ASSERT(!b1.add(0))
  407. UnsortedBloomBuilder b2(1, 0.01);
  408. ASSERT(b2.add(1))
  409. ASSERT(!b2.add(2))
  410. UnsortedBloomBuilder b3(10, 0.01);
  411. ASSERT(b3.add(1))
  412. ASSERT(!b3.add(0)) // Not ok to add hash value 0 to unsorted
  413. ASSERT(!b3.add(2))
  414. }
  415. };
  416. CPPUNIT_TEST_SUITE_REGISTRATION( BloomTest );
  417. CPPUNIT_TEST_SUITE_NAMED_REGISTRATION( BloomTest, "BloomTest" );
  418. #endif