rtldistr.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "platform.h"
  14. #include <math.h>
  15. #include <stdio.h>
  16. #include "jexcept.hpp"
  17. #include "jmisc.hpp"
  18. #include "jutil.hpp"
  19. #include "jlib.hpp"
  20. #include "rtldistr.hpp"
  21. #define DISTRIBUTION_THRESHOLD 10000
  22. //---------------------------------------------------------------------------
  23. class CDistributionTable : public CInterface
  24. {
  25. protected:
  26. StringAttr fieldname;
  27. public:
  28. CDistributionTable(const char *_fieldname) : fieldname(_fieldname) {}
  29. virtual unsigned __int64 distinct() = 0;
  30. virtual bool exact() = 0;
  31. virtual void reportValues(StringBuffer &out) = 0;
  32. virtual void report(StringBuffer &out)
  33. {
  34. unsigned __int64 d = distinct();
  35. out.append("<Field name=\"").append(fieldname).append("\"");
  36. if (exact())
  37. {
  38. out.append(" distinct=\"").append(d).append("\">\n");
  39. reportValues(out);
  40. out.append("</Field>\n");
  41. }
  42. else
  43. out.append(" estimate=\"").append(d).append("\"/>\n");
  44. }
  45. };
  46. class CBoolDistributionTable : public CDistributionTable, implements IBoolDistributionTable
  47. {
  48. unsigned __int64 counts[2];
  49. public:
  50. IMPLEMENT_IINTERFACE;
  51. CBoolDistributionTable(const char *_fieldname) : CDistributionTable(_fieldname)
  52. {
  53. counts[0] = counts[1] = 0;
  54. }
  55. virtual void report(StringBuffer &out)
  56. {
  57. CDistributionTable::report(out);
  58. }
  59. virtual void merge(MemoryBuffer &in)
  60. {
  61. unsigned __int64 c[2];
  62. in.read(sizeof(c), &c);
  63. counts[false] += c[false];
  64. counts[true] += c[true];
  65. }
  66. virtual void serialize(MemoryBuffer &out) { out.append(sizeof(counts), &counts); }
  67. virtual void reportValues(StringBuffer &out)
  68. {
  69. if (counts[0])
  70. out.appendf(" <Value count=\"%" I64F "d\">false</Value>\n", counts[0]);
  71. if (counts[1])
  72. out.appendf(" <Value count=\"%" I64F "d\">true</Value>\n", counts[1]);
  73. }
  74. virtual void noteValue(bool val)
  75. {
  76. counts[val]++;
  77. }
  78. virtual unsigned __int64 distinct()
  79. {
  80. return (counts[0] != 0) + (counts[1] != 0);
  81. }
  82. virtual bool exact()
  83. {
  84. return true;
  85. }
  86. };
  87. class CByteDistributionTable : public CDistributionTable
  88. {
  89. unsigned __int64 counts[256];
  90. public:
  91. CByteDistributionTable(const char *_fieldname) : CDistributionTable(_fieldname)
  92. {
  93. memset(counts, 0, sizeof(counts));
  94. }
  95. virtual unsigned __int64 distinct()
  96. {
  97. unsigned __int64 ret = 0;
  98. for (unsigned i = 0; i < 256; i++)
  99. if (counts[i])
  100. ret++;
  101. return ret;
  102. }
  103. virtual bool exact()
  104. {
  105. return true;
  106. }
  107. virtual void merge(MemoryBuffer &in)
  108. {
  109. unsigned __int64 _counts[256];
  110. in.read(sizeof(_counts), &_counts);
  111. for (unsigned i = 0; i < _elements_in(counts); i++)
  112. counts[i] += _counts[i];
  113. }
  114. virtual void serialize(MemoryBuffer &out) { out.append(sizeof(counts), &counts); }
  115. virtual void reportValues(StringBuffer &out)
  116. {
  117. for (unsigned i = 0; i < 256; i++)
  118. if (counts[i])
  119. {
  120. out.appendf(" <Value count=\"%" I64F "d\">", counts[i]);
  121. reportValue(out, i);
  122. out.append("</Value>\n");
  123. }
  124. }
  125. virtual void reportValue(StringBuffer &out, unsigned val)
  126. {
  127. out.append(val);
  128. }
  129. void doNoteValue(unsigned int val)
  130. {
  131. counts[val]++;
  132. }
  133. };
  134. class CCharDistributionTable : public CByteDistributionTable, implements IStringDistributionTable
  135. {
  136. public:
  137. IMPLEMENT_IINTERFACE;
  138. CCharDistributionTable(const char *_fieldname) : CByteDistributionTable(_fieldname) {}
  139. virtual void reportValue(StringBuffer &out, unsigned val)
  140. {
  141. unsigned char v = val;
  142. encodeXML((const char *) &v, out, ENCODE_WHITESPACE, 1);
  143. }
  144. virtual void report(StringBuffer &out)
  145. {
  146. CByteDistributionTable::report(out);
  147. }
  148. virtual void merge(MemoryBuffer &in) { CByteDistributionTable::merge(in); }
  149. virtual void serialize(MemoryBuffer &out) { CByteDistributionTable::serialize(out); }
  150. virtual void noteValue(unsigned len, const char *val)
  151. {
  152. assertex(len==1);
  153. doNoteValue((unsigned char) *val);
  154. }
  155. };
  156. class FixedMapper : public Mapping
  157. {
  158. public:
  159. FixedMapper(const void *k, int ksize);
  160. unsigned __int64 count;
  161. };
  162. FixedMapper::FixedMapper(const void *_key, int _ksize) : Mapping(_key, _ksize)
  163. {
  164. count = 0;
  165. }
  166. class CFixedDistributionTable : public CDistributionTable
  167. {
  168. public:
  169. CFixedDistributionTable(const char *_fieldname, unsigned _ksize, unsigned _threshold)
  170. : CDistributionTable(_fieldname), threshold(_threshold), table(_ksize, false), ksize(_ksize)
  171. {
  172. estimated = false;
  173. cardinality = 0;
  174. }
  175. virtual unsigned __int64 distinct() { return estimated ? cardinality : table.count(); }
  176. virtual bool exact() { return !estimated; }
  177. virtual void merge(MemoryBuffer &in)
  178. {
  179. bool inEstimated;
  180. unsigned inNum, inCardinality;
  181. in.read(inCardinality).read(inEstimated).read(inNum);
  182. if (inEstimated) estimated = true;
  183. cardinality += inCardinality;
  184. for (unsigned idx=0; idx < inNum; idx++)
  185. {
  186. const void * key;
  187. unsigned __int64 count;
  188. key = in.readDirect(ksize);
  189. in.read(count);
  190. FixedMapper * mapped = queryLookup(key);
  191. if (mapped)
  192. mapped->count += count;
  193. }
  194. }
  195. virtual void serialize(MemoryBuffer &out)
  196. {
  197. out.append(cardinality);
  198. out.append(estimated);
  199. out.append(table.count());
  200. HashIterator iter(table);
  201. ForEach(iter)
  202. {
  203. FixedMapper & cur = (FixedMapper &) iter.get();
  204. out.append(ksize, cur.getKey()).append(cur.count);
  205. }
  206. }
  207. void addValue(const void *buf)
  208. {
  209. FixedMapper *mapped = queryLookup(buf);
  210. if (mapped)
  211. mapped->count++;
  212. }
  213. virtual void reportValue(StringBuffer &out, FixedMapper &val) = 0;
  214. virtual void reportValues(StringBuffer &out)
  215. {
  216. HashIterator iter(table);
  217. ForEach(iter)
  218. {
  219. FixedMapper & cur = (FixedMapper &) iter.get();
  220. out.appendf(" <Value count=\"%" I64F "d\">", cur.count);
  221. reportValue(out, cur);
  222. out.append("</Value>\n");
  223. }
  224. }
  225. protected:
  226. FixedMapper * queryLookup(const void *buf)
  227. {
  228. if (estimated)
  229. return NULL;
  230. FixedMapper *mapped = (FixedMapper *) table.find(buf);
  231. if (!mapped)
  232. {
  233. mapped = new FixedMapper(buf, ksize);
  234. table.addOwn(*mapped);
  235. cardinality++;
  236. if (cardinality==threshold)
  237. estimated = true;
  238. }
  239. return mapped;
  240. }
  241. protected:
  242. unsigned ksize;
  243. unsigned cardinality;
  244. unsigned threshold;
  245. bool estimated;
  246. KeptHashTable table;
  247. };
  248. class CIntDistributionTable : public CFixedDistributionTable, implements IIntDistributionTable
  249. {
  250. public:
  251. IMPLEMENT_IINTERFACE;
  252. CIntDistributionTable(const char *_fieldname, unsigned threshold) : CFixedDistributionTable(_fieldname, sizeof(int), threshold)
  253. {
  254. }
  255. virtual void merge(MemoryBuffer &in) { CFixedDistributionTable::merge(in); }
  256. virtual void serialize(MemoryBuffer &out) { CFixedDistributionTable::serialize(out); }
  257. virtual void reportValue(StringBuffer &out, FixedMapper &val)
  258. {
  259. out.append(*(int *)val.getKey());
  260. }
  261. virtual void report(StringBuffer &out)
  262. {
  263. CFixedDistributionTable::report(out);
  264. }
  265. virtual void noteValue(int val)
  266. {
  267. addValue(&val);
  268. }
  269. };
  270. class CUIntDistributionTable : public CFixedDistributionTable, implements IUIntDistributionTable
  271. {
  272. public:
  273. IMPLEMENT_IINTERFACE;
  274. CUIntDistributionTable(const char *_fieldname, unsigned threshold) : CFixedDistributionTable(_fieldname, sizeof(unsigned int), threshold)
  275. {
  276. }
  277. virtual void merge(MemoryBuffer &in) { CFixedDistributionTable::merge(in); }
  278. virtual void serialize(MemoryBuffer &out) { CFixedDistributionTable::serialize(out); }
  279. virtual void reportValue(StringBuffer &out, FixedMapper &val)
  280. {
  281. out.append(*(unsigned int *)val.getKey());
  282. }
  283. virtual void report(StringBuffer &out)
  284. {
  285. CFixedDistributionTable::report(out);
  286. }
  287. virtual void noteValue(unsigned int val)
  288. {
  289. addValue(&val);
  290. }
  291. };
  292. class CInt64DistributionTable : public CFixedDistributionTable, implements IInt64DistributionTable
  293. {
  294. public:
  295. IMPLEMENT_IINTERFACE;
  296. CInt64DistributionTable(const char *_fieldname, unsigned threshold) : CFixedDistributionTable(_fieldname, sizeof(__int64), threshold)
  297. {
  298. }
  299. virtual void merge(MemoryBuffer &in) { CFixedDistributionTable::merge(in); }
  300. virtual void serialize(MemoryBuffer &out) { CFixedDistributionTable::serialize(out); }
  301. virtual void reportValue(StringBuffer &out, FixedMapper &val)
  302. {
  303. out.append(*(__int64 *)val.getKey());
  304. }
  305. virtual void report(StringBuffer &out)
  306. {
  307. CFixedDistributionTable::report(out);
  308. }
  309. virtual void noteValue(__int64 val)
  310. {
  311. addValue(&val);
  312. }
  313. };
  314. class CUInt64DistributionTable : public CFixedDistributionTable, implements IUInt64DistributionTable
  315. {
  316. public:
  317. IMPLEMENT_IINTERFACE;
  318. CUInt64DistributionTable(const char *_fieldname, unsigned threshold) : CFixedDistributionTable(_fieldname, sizeof(unsigned __int64), threshold)
  319. {
  320. }
  321. virtual void merge(MemoryBuffer &in) { CFixedDistributionTable::merge(in); }
  322. virtual void serialize(MemoryBuffer &out) { CFixedDistributionTable::serialize(out); }
  323. virtual void reportValue(StringBuffer &out, FixedMapper &val)
  324. {
  325. out.append(*(unsigned __int64 *)val.getKey());
  326. }
  327. virtual void report(StringBuffer &out)
  328. {
  329. CFixedDistributionTable::report(out);
  330. }
  331. virtual void noteValue(unsigned __int64 val)
  332. {
  333. addValue(&val);
  334. }
  335. };
  336. class CRealDistributionTable : public CFixedDistributionTable, implements IRealDistributionTable
  337. {
  338. public:
  339. IMPLEMENT_IINTERFACE;
  340. CRealDistributionTable(const char *_fieldname, unsigned threshold) : CFixedDistributionTable(_fieldname, sizeof(double), threshold)
  341. {
  342. }
  343. virtual void merge(MemoryBuffer &in) { CFixedDistributionTable::merge(in); }
  344. virtual void serialize(MemoryBuffer &out) { CFixedDistributionTable::serialize(out); }
  345. virtual void reportValue(StringBuffer &out, FixedMapper &val)
  346. {
  347. out.append(*(double *)val.getKey());
  348. }
  349. virtual void report(StringBuffer &out)
  350. {
  351. CFixedDistributionTable::report(out);
  352. }
  353. virtual void noteValue(double val)
  354. {
  355. addValue(&val);
  356. }
  357. };
  358. class CStringDistributionTable : public CFixedDistributionTable, implements IStringDistributionTable
  359. {
  360. public:
  361. IMPLEMENT_IINTERFACE;
  362. CStringDistributionTable(const char *_fieldname, unsigned _ksize, unsigned threshold) : CFixedDistributionTable(_fieldname, _ksize, threshold)
  363. {
  364. }
  365. virtual void merge(MemoryBuffer &in) { CFixedDistributionTable::merge(in); }
  366. virtual void serialize(MemoryBuffer &out) { CFixedDistributionTable::serialize(out); }
  367. virtual void reportValue(StringBuffer &out, FixedMapper &val)
  368. {
  369. encodeXML((const char *) val.getKey(), out, ENCODE_WHITESPACE, ksize);
  370. }
  371. virtual void report(StringBuffer &out)
  372. {
  373. CFixedDistributionTable::report(out);
  374. }
  375. virtual void noteValue(unsigned len, const char *val)
  376. {
  377. assertex(len==ksize);
  378. addValue(val);
  379. }
  380. };
  381. //--------------------------------------------------------------------------------------
  382. ECLRTL_API IStringDistributionTable *createIStringDistributionTable(const char *name, unsigned size)
  383. {
  384. switch (size)
  385. {
  386. case 0:
  387. // case UNKNOWN_LENGTH:
  388. assertex(false); // TBD
  389. case 1:
  390. return new CCharDistributionTable(name);
  391. default:
  392. return new CStringDistributionTable(name, size, DISTRIBUTION_THRESHOLD);
  393. }
  394. }
  395. ECLRTL_API IRealDistributionTable *createIRealDistributionTable(const char *name, unsigned size)
  396. {
  397. return new CRealDistributionTable(name, DISTRIBUTION_THRESHOLD);
  398. }
  399. ECLRTL_API IBoolDistributionTable *createIBoolDistributionTable(const char *name, unsigned size)
  400. {
  401. return new CBoolDistributionTable(name);
  402. }
  403. ECLRTL_API IIntDistributionTable *createIIntDistributionTable(const char *name, unsigned size)
  404. {
  405. // MORE - could optimize size 1
  406. return new CIntDistributionTable(name, DISTRIBUTION_THRESHOLD);
  407. }
  408. ECLRTL_API IInt64DistributionTable *createIInt64DistributionTable(const char *name, unsigned size)
  409. {
  410. return new CInt64DistributionTable(name, DISTRIBUTION_THRESHOLD);
  411. }
  412. ECLRTL_API IUIntDistributionTable *createIUIntDistributionTable(const char *name, unsigned size)
  413. {
  414. return new CUIntDistributionTable(name, DISTRIBUTION_THRESHOLD);
  415. }
  416. ECLRTL_API IUInt64DistributionTable *createIUInt64DistributionTable(const char *name, unsigned size)
  417. {
  418. return new CUInt64DistributionTable(name, DISTRIBUTION_THRESHOLD);
  419. }