bloom.hpp 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2018 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef _BLOOM_INCL
  14. #define _BLOOM_INCL
  15. #include "jhtree.hpp"
  16. /**
  17. * A BloomFilter object is used to create or test a Bloom filter - this can be used to quickly determine whether a value has been added to the filter,
  18. * giving some false positives but no false negatives.
  19. */
  20. class jhtree_decl BloomFilter : public CInterface
  21. {
  22. public:
  23. /*
  24. * Create an empty bloom filter
  25. *
  26. * @param cardinality Expected number of values to be added. This will be used to determine the appropriate size and hash count
  27. * @param probability Desired probability of false positives. This will be used to determine the appropriate size and hash count
  28. */
  29. BloomFilter(unsigned cardinality, double probability=0.1);
  30. /*
  31. * Create a bloom filter from a previously-generated table. Parameters must batch those used when building the table.
  32. *
  33. * @param numHashes Number of hashes to use for each lookup.
  34. * @param tableSize Size (in bytes) of the table
  35. * @param table Bloom table. Note that the BloomFilter object will take ownership of this memory, so it must be allocated on the heap.
  36. */
  37. BloomFilter(unsigned numHashes, unsigned tableSize, byte *table);
  38. /*
  39. * BloomFilter destructor
  40. */
  41. virtual ~BloomFilter();
  42. /*
  43. * Add a value to the filter
  44. *
  45. * @param hash The hash of the value to be added
  46. */
  47. void add(hash64_t hash);
  48. /*
  49. * Test if a value has been added to the filter (with some potential for false-positives)
  50. *
  51. * @param hash The hash of the value to be tested.
  52. * @return False if the value is definitely not present, otherwise true.
  53. */
  54. bool test(hash64_t hash) const;
  55. /*
  56. * Add a value to the filter, by key
  57. *
  58. * @param len The length of the key
  59. * @param val The key data
  60. */
  61. inline void add(size32_t len, const void *val) { add(rtlHash64Data(len, val, HASH64_INIT)); }
  62. /*
  63. * Test if a value has been added to the filter (with some potential for false-positives), by key
  64. *
  65. * @param len The length of the key
  66. * @param val The key data
  67. * @return False if the value is definitely not present, otherwise true.
  68. */
  69. inline bool test(size32_t len, const void *val) { return test(rtlHash64Data(len, val, HASH64_INIT)); }
  70. /*
  71. * Retrieve bloom table size
  72. *
  73. * @return Size, in bytes.
  74. */
  75. inline unsigned queryTableSize() const { return numBits / 8; }
  76. /*
  77. * Retrieve bloom table hash count
  78. *
  79. * @return Hash count.
  80. */
  81. inline unsigned queryNumHashes() const { return numHashes; }
  82. /*
  83. * Retrieve bloom table data
  84. *
  85. * @return Table data.
  86. */
  87. inline const byte *queryTable() const { return table; }
  88. protected:
  89. unsigned numBits;
  90. unsigned numHashes;
  91. byte *table;
  92. };
  93. class jhtree_decl BloomBuilder
  94. {
  95. public:
  96. BloomBuilder(unsigned _maxHashes = 1000000);
  97. const BloomFilter * build(double probability=0.1) const;
  98. bool add(hash64_t hash);
  99. inline bool add(size32_t len, const void *val) { return add(rtlHash64Data(len, val, HASH64_INIT)); }
  100. bool valid() const;
  101. protected:
  102. ArrayOf<hash64_t> hashes;
  103. const unsigned maxHashes;
  104. bool isValid;
  105. };
  106. #endif