123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451 |
- /*##############################################################################
- HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ############################################################################## */
- #include "platform.h"
- #include <math.h>
- #include <stdio.h>
- #include "jexcept.hpp"
- #include "jmisc.hpp"
- #include "jutil.hpp"
- #include "jlib.hpp"
- #include "rtldistr.hpp"
- #define DISTRIBUTION_THRESHOLD 10000 // Distinct-value limit passed to the hash-backed tables below; a table that reaches it stops tracking values and reports an estimate.
- //---------------------------------------------------------------------------
- class CDistributionTable : public CInterface
- {
- protected:
- StringAttr fieldname;
- public:
- CDistributionTable(const char *_fieldname) : fieldname(_fieldname) {}
- virtual unsigned __int64 distinct() = 0;
- virtual bool exact() = 0;
- virtual void reportValues(StringBuffer &out) = 0;
- virtual void report(StringBuffer &out)
- {
- unsigned __int64 d = distinct();
- out.append("<Field name=\"").append(fieldname).append("\"");
- if (exact())
- {
- out.append(" distinct=\"").append(d).append("\">\n");
- reportValues(out);
- out.append("</Field>\n");
- }
- else
- out.append(" estimate=\"").append(d).append("\"/>\n");
- }
- };
- class CBoolDistributionTable : public CDistributionTable, implements IBoolDistributionTable
- {
- unsigned __int64 counts[2];
- public:
- IMPLEMENT_IINTERFACE;
- CBoolDistributionTable(const char *_fieldname) : CDistributionTable(_fieldname)
- {
- counts[0] = counts[1] = 0;
- }
- virtual void report(StringBuffer &out)
- {
- CDistributionTable::report(out);
- }
- virtual void merge(MemoryBuffer &in)
- {
- unsigned __int64 c[2];
- in.read(sizeof(c), &c);
- counts[false] += c[false];
- counts[true] += c[true];
- }
- virtual void serialize(MemoryBuffer &out) { out.append(sizeof(counts), &counts); }
- virtual void reportValues(StringBuffer &out)
- {
- if (counts[0])
- out.appendf(" <Value count=\"%" I64F "d\">false</Value>\n", counts[0]);
- if (counts[1])
- out.appendf(" <Value count=\"%" I64F "d\">true</Value>\n", counts[1]);
- }
- virtual void noteValue(bool val)
- {
- counts[val]++;
- }
- virtual unsigned __int64 distinct()
- {
- return (counts[0] != 0) + (counts[1] != 0);
- }
- virtual bool exact()
- {
- return true;
- }
- };
- class CByteDistributionTable : public CDistributionTable
- {
- unsigned __int64 counts[256];
- public:
- CByteDistributionTable(const char *_fieldname) : CDistributionTable(_fieldname)
- {
- memset(counts, 0, sizeof(counts));
- }
- virtual unsigned __int64 distinct()
- {
- unsigned __int64 ret = 0;
- for (unsigned i = 0; i < 256; i++)
- if (counts[i])
- ret++;
- return ret;
- }
- virtual bool exact()
- {
- return true;
- }
- virtual void merge(MemoryBuffer &in)
- {
- unsigned __int64 _counts[256];
- in.read(sizeof(_counts), &_counts);
- for (unsigned i = 0; i < _elements_in(counts); i++)
- counts[i] += _counts[i];
- }
- virtual void serialize(MemoryBuffer &out) { out.append(sizeof(counts), &counts); }
- virtual void reportValues(StringBuffer &out)
- {
- for (unsigned i = 0; i < 256; i++)
- if (counts[i])
- {
- out.appendf(" <Value count=\"%" I64F "d\">", counts[i]);
- reportValue(out, i);
- out.append("</Value>\n");
- }
- }
- virtual void reportValue(StringBuffer &out, unsigned val)
- {
- out.append(val);
- }
- void doNoteValue(unsigned int val)
- {
- counts[val]++;
- }
- };
- class CCharDistributionTable : public CByteDistributionTable, implements IStringDistributionTable
- {
- public:
- IMPLEMENT_IINTERFACE;
- CCharDistributionTable(const char *_fieldname) : CByteDistributionTable(_fieldname) {}
- virtual void reportValue(StringBuffer &out, unsigned val)
- {
- unsigned char v = val;
- encodeXML((const char *) &v, out, ENCODE_WHITESPACE, 1);
- }
- virtual void report(StringBuffer &out)
- {
- CByteDistributionTable::report(out);
- }
- virtual void merge(MemoryBuffer &in) { CByteDistributionTable::merge(in); }
- virtual void serialize(MemoryBuffer &out) { CByteDistributionTable::serialize(out); }
- virtual void noteValue(unsigned len, const char *val)
- {
- assertex(len==1);
- doNoteValue((unsigned char) *val);
- }
- };
- class FixedMapper : public Mapping
- {
- public:
- FixedMapper(const void *k, int ksize);
- unsigned __int64 count;
- };
- FixedMapper::FixedMapper(const void *_key, int _ksize) : Mapping(_key, _ksize)
- {
- count = 0;
- }
- class CFixedDistributionTable : public CDistributionTable
- {
- public:
- CFixedDistributionTable(const char *_fieldname, unsigned _ksize, unsigned _threshold)
- : CDistributionTable(_fieldname), threshold(_threshold), table(_ksize, false), ksize(_ksize)
- {
- estimated = false;
- cardinality = 0;
- }
- virtual unsigned __int64 distinct() { return estimated ? cardinality : table.count(); }
- virtual bool exact() { return !estimated; }
- virtual void merge(MemoryBuffer &in)
- {
- bool inEstimated;
- unsigned inNum, inCardinality;
- in.read(inCardinality).read(inEstimated).read(inNum);
- if (inEstimated) estimated = true;
- cardinality += inCardinality;
- for (unsigned idx=0; idx < inNum; idx++)
- {
- const void * key;
- unsigned __int64 count;
- key = in.readDirect(ksize);
- in.read(count);
- FixedMapper * mapped = queryLookup(key);
- if (mapped)
- mapped->count += count;
- }
- }
- virtual void serialize(MemoryBuffer &out)
- {
- out.append(cardinality);
- out.append(estimated);
- out.append(table.count());
- HashIterator iter(table);
- ForEach(iter)
- {
- FixedMapper & cur = (FixedMapper &) iter.get();
- out.append(ksize, cur.getKey()).append(cur.count);
- }
- }
- void addValue(const void *buf)
- {
- FixedMapper *mapped = queryLookup(buf);
- if (mapped)
- mapped->count++;
- }
- virtual void reportValue(StringBuffer &out, FixedMapper &val) = 0;
- virtual void reportValues(StringBuffer &out)
- {
- HashIterator iter(table);
- ForEach(iter)
- {
- FixedMapper & cur = (FixedMapper &) iter.get();
- out.appendf(" <Value count=\"%" I64F "d\">", cur.count);
- reportValue(out, cur);
- out.append("</Value>\n");
- }
- }
- protected:
- FixedMapper * queryLookup(const void *buf)
- {
- if (estimated)
- return NULL;
- FixedMapper *mapped = (FixedMapper *) table.find(buf);
- if (!mapped)
- {
- mapped = new FixedMapper(buf, ksize);
- table.addOwn(*mapped);
- cardinality++;
- if (cardinality==threshold)
- estimated = true;
- }
- return mapped;
- }
- protected:
- unsigned ksize;
- unsigned cardinality;
- unsigned threshold;
- bool estimated;
- KeptHashTable table;
- };
- class CIntDistributionTable : public CFixedDistributionTable, implements IIntDistributionTable
- {
- public:
- IMPLEMENT_IINTERFACE;
- CIntDistributionTable(const char *_fieldname, unsigned threshold) : CFixedDistributionTable(_fieldname, sizeof(int), threshold)
- {
- }
- virtual void merge(MemoryBuffer &in) { CFixedDistributionTable::merge(in); }
- virtual void serialize(MemoryBuffer &out) { CFixedDistributionTable::serialize(out); }
- virtual void reportValue(StringBuffer &out, FixedMapper &val)
- {
- out.append(*(int *)val.getKey());
- }
- virtual void report(StringBuffer &out)
- {
- CFixedDistributionTable::report(out);
- }
- virtual void noteValue(int val)
- {
- addValue(&val);
- }
- };
- class CUIntDistributionTable : public CFixedDistributionTable, implements IUIntDistributionTable
- {
- public:
- IMPLEMENT_IINTERFACE;
- CUIntDistributionTable(const char *_fieldname, unsigned threshold) : CFixedDistributionTable(_fieldname, sizeof(unsigned int), threshold)
- {
- }
- virtual void merge(MemoryBuffer &in) { CFixedDistributionTable::merge(in); }
- virtual void serialize(MemoryBuffer &out) { CFixedDistributionTable::serialize(out); }
- virtual void reportValue(StringBuffer &out, FixedMapper &val)
- {
- out.append(*(unsigned int *)val.getKey());
- }
- virtual void report(StringBuffer &out)
- {
- CFixedDistributionTable::report(out);
- }
- virtual void noteValue(unsigned int val)
- {
- addValue(&val);
- }
- };
- class CInt64DistributionTable : public CFixedDistributionTable, implements IInt64DistributionTable
- {
- public:
- IMPLEMENT_IINTERFACE;
- CInt64DistributionTable(const char *_fieldname, unsigned threshold) : CFixedDistributionTable(_fieldname, sizeof(__int64), threshold)
- {
- }
- virtual void merge(MemoryBuffer &in) { CFixedDistributionTable::merge(in); }
- virtual void serialize(MemoryBuffer &out) { CFixedDistributionTable::serialize(out); }
- virtual void reportValue(StringBuffer &out, FixedMapper &val)
- {
- out.append(*(__int64 *)val.getKey());
- }
- virtual void report(StringBuffer &out)
- {
- CFixedDistributionTable::report(out);
- }
- virtual void noteValue(__int64 val)
- {
- addValue(&val);
- }
- };
- class CUInt64DistributionTable : public CFixedDistributionTable, implements IUInt64DistributionTable
- {
- public:
- IMPLEMENT_IINTERFACE;
- CUInt64DistributionTable(const char *_fieldname, unsigned threshold) : CFixedDistributionTable(_fieldname, sizeof(unsigned __int64), threshold)
- {
- }
- virtual void merge(MemoryBuffer &in) { CFixedDistributionTable::merge(in); }
- virtual void serialize(MemoryBuffer &out) { CFixedDistributionTable::serialize(out); }
- virtual void reportValue(StringBuffer &out, FixedMapper &val)
- {
- out.append(*(unsigned __int64 *)val.getKey());
- }
- virtual void report(StringBuffer &out)
- {
- CFixedDistributionTable::report(out);
- }
- virtual void noteValue(unsigned __int64 val)
- {
- addValue(&val);
- }
- };
- class CRealDistributionTable : public CFixedDistributionTable, implements IRealDistributionTable
- {
- public:
- IMPLEMENT_IINTERFACE;
- CRealDistributionTable(const char *_fieldname, unsigned threshold) : CFixedDistributionTable(_fieldname, sizeof(double), threshold)
- {
- }
- virtual void merge(MemoryBuffer &in) { CFixedDistributionTable::merge(in); }
- virtual void serialize(MemoryBuffer &out) { CFixedDistributionTable::serialize(out); }
- virtual void reportValue(StringBuffer &out, FixedMapper &val)
- {
- out.append(*(double *)val.getKey());
- }
- virtual void report(StringBuffer &out)
- {
- CFixedDistributionTable::report(out);
- }
- virtual void noteValue(double val)
- {
- addValue(&val);
- }
- };
- class CStringDistributionTable : public CFixedDistributionTable, implements IStringDistributionTable
- {
- public:
- IMPLEMENT_IINTERFACE;
- CStringDistributionTable(const char *_fieldname, unsigned _ksize, unsigned threshold) : CFixedDistributionTable(_fieldname, _ksize, threshold)
- {
- }
- virtual void merge(MemoryBuffer &in) { CFixedDistributionTable::merge(in); }
- virtual void serialize(MemoryBuffer &out) { CFixedDistributionTable::serialize(out); }
- virtual void reportValue(StringBuffer &out, FixedMapper &val)
- {
- encodeXML((const char *) val.getKey(), out, ENCODE_WHITESPACE, ksize);
- }
- virtual void report(StringBuffer &out)
- {
- CFixedDistributionTable::report(out);
- }
- virtual void noteValue(unsigned len, const char *val)
- {
- assertex(len==ksize);
- addValue(val);
- }
- };
- //--------------------------------------------------------------------------------------
- ECLRTL_API IStringDistributionTable *createIStringDistributionTable(const char *name, unsigned size)
- {
- switch (size)
- {
- case 0:
- // case UNKNOWN_LENGTH:
- assertex(false); // TBD
- case 1:
- return new CCharDistributionTable(name);
- default:
- return new CStringDistributionTable(name, size, DISTRIBUTION_THRESHOLD);
- }
- }
- ECLRTL_API IRealDistributionTable *createIRealDistributionTable(const char *name, unsigned size)
- {
- return new CRealDistributionTable(name, DISTRIBUTION_THRESHOLD);
- }
- ECLRTL_API IBoolDistributionTable *createIBoolDistributionTable(const char *name, unsigned size)
- {
- return new CBoolDistributionTable(name);
- }
- ECLRTL_API IIntDistributionTable *createIIntDistributionTable(const char *name, unsigned size)
- {
- // MORE - could optimize size 1
- return new CIntDistributionTable(name, DISTRIBUTION_THRESHOLD);
- }
- ECLRTL_API IInt64DistributionTable *createIInt64DistributionTable(const char *name, unsigned size)
- {
- return new CInt64DistributionTable(name, DISTRIBUTION_THRESHOLD);
- }
- ECLRTL_API IUIntDistributionTable *createIUIntDistributionTable(const char *name, unsigned size)
- {
- return new CUIntDistributionTable(name, DISTRIBUTION_THRESHOLD);
- }
- ECLRTL_API IUInt64DistributionTable *createIUInt64DistributionTable(const char *name, unsigned size)
- {
- return new CUInt64DistributionTable(name, DISTRIBUTION_THRESHOLD);
- }
|