浏览代码

HPCC-17625 JPTree memory usage and contention issues

Add a read-only atom table for further memory reduction. This is used on
attribute names only at present, though we could consider a similar usage for
element names?

Signed-off-by: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 8 年之前
父节点
当前提交
f379f74b42
共有 5 个文件被更改,包括 283 次插入94 次删除
  1. 17 0
      system/jlib/jhash.hpp
  2. 81 0
      system/jlib/jptree-attrs.hpp
  3. 39 0
      system/jlib/jptree.cpp
  4. 143 92
      system/jlib/jptree.ipp
  5. 3 2
      system/jlib/jsuperhash.hpp

+ 17 - 0
system/jlib/jhash.hpp

@@ -518,6 +518,23 @@ public:
         return c;
     }
 
+    unsigned findIndex(const char *key, unsigned h)
+    {
+        unsigned i=h%htn;
+        while (table[i]) {
+            if ((table[i]->hash==h)&&table[i]->eq(key))
+                return i;
+            if (++i==htn)
+                i = 0;
+        }
+        return (unsigned) -1;
+    }
+
+    C *getIndex(unsigned v) const
+    {
+        assert(v != (unsigned)-1 && v < htn);
+        return table[v];
+    }
 
     void remove(C *c)
     {

+ 81 - 0
system/jlib/jptree-attrs.hpp

@@ -0,0 +1,81 @@
+    "@accessed",
+    "@activity",
+    "@agentSession",
+    "@buildVersion",
+    "@checkSum",
+    "@cloneable",
+    "@cluster",
+    "@clusterName",
+    "@codeVersion",
+    "@columnMapping",
+    "@connected",
+    "@created",
+    "@creator",
+    "@csvSeparate",
+    "@csvTerminate",
+    "@defaultBaseDir",
+    "@defaultReplicateDir",
+    "@description",
+    "@directory",
+    "@dllname",
+    "@eclVersion",
+    "@enqueuedt",
+    "@expireDays",
+    "@fetchEntire",
+    "@fieldwidth",
+    "@fileCrc",
+    "@filename",
+    "@footerLength",
+    "@format",
+    "@formatCrc",
+    "@hasArchive",
+    "@headerLength",
+    "@interleaved",
+    "@isArchive",
+    "@isClone",
+    "@isLibrary",
+    "@isScalar",
+    "@jobName",
+    "@libCount",
+    "@localValue",
+    "@mapFlags",
+    "@maxActivity",
+    "@minActivity",
+    "@modified",
+    "@numclusters",
+    "@numparts",
+    "@numsubfiles",
+    "@offset",
+    "@partmask",
+    "@persistent",
+    "@prevenqueuedt",
+    "@prevpriority",
+    "@prevwuid",
+    "@priority",
+    "@publishedBy",
+    "@recordCount",
+    "@recordSize",
+    "@recordSizeEntry",
+    "@redundancy",
+    "@rowLimit",
+    "@rowTag",
+    "@sequence",
+    "@serverId",
+    "@session",
+    "@severity",
+    "@source",
+    "@stateEx",
+    "@status",
+    "@submitID",
+    "@table:date-value",
+    "@table:number-columns-repeated",
+    "@table:style-name",
+    "@table:value",
+    "@table:value-type",
+    "@target",
+    "@totalThorTime",
+    "@transformerEntry",
+    "@waiting",
+    "@workunit",
+    "@wuidVersion",
+    "@xmlns:xsi",

+ 39 - 0
system/jlib/jptree.cpp

@@ -70,17 +70,56 @@ IPropertyTreeIterator *createNullPTreeIterator() { return LINK(nullPTreeIterator
 
 //===================================================================
 
+#ifdef USE_READONLY_ATOMTABLE
+RONameTable *AttrStrUnionWithTable::roNameTable = nullptr;
+#endif
 static AtomRefTable *keyTable = nullptr;
 static AtomRefTable *keyTableNC = nullptr;
+
 static CriticalSection hashcrit;
 static CAttrValHashTable *attrHT = nullptr;
 static AttrValue **freelist = nullptr;
 static unsigned freelistmax = 0;
 static CLargeMemoryAllocator freeallocator((memsize_t)-1, 0x1000*sizeof(AttrValue), true);
 
+#ifdef USE_READONLY_ATOMTABLE
+static const char * roAttributes[] =
+{
+#include "jptree-attrs.hpp"    // potentially auto-generated
+    nullptr
+};
+
+void initializeRoTable()
+{
+    for (const char **attr = roAttributes; *attr; attr++)
+    {
+        AttrStrUnionWithTable::roNameTable->find(*attr, true);
+    }
+#ifdef TRACE_ATOM_SIZE
+    // If you are wanting an idea of the savings from use of the RO hash table, it may be useful to reset
+    // the counts here. But it's more correct to actually leave them in place.
+    //AttrStrAtom::totsize = 0;
+    //AttrStrAtom::maxsize = 0;
+#endif
+#ifdef _DEBUG
+    for (const char **a = roAttributes; *a; a++)
+    {
+        // sanity check
+        unsigned idx = AttrStrUnionWithTable::roNameTable->findIndex(*a,AttrStrC::getHash(*a));
+        AttrStrC *val = AttrStrUnionWithTable::roNameTable->getIndex(idx);
+        assert(val && val->eq(*a));
+    }
+#endif
+}
+#endif
+
 MODULE_INIT(INIT_PRIORITY_JPTREE)
 {
     nullPTreeIterator = new NullPTreeIterator;
+#ifdef USE_READONLY_ATOMTABLE
+    AttrStrUnionWithTable::roNameTable = new RONameTable;
+    initializeRoTable();
+#endif
 	keyTable = new AtomRefTable;
 	keyTableNC = new AtomRefTable(true);
 	attrHT = new CAttrValHashTable;

+ 143 - 92
system/jlib/jptree.ipp

@@ -275,6 +275,97 @@ struct AttrStr
 #endif
 };
 
+
+// In order to keep code common between atom and local versions of ptree, we store the atom-specific information BEFORE the this pointer of AttrStr
+// This requires some care - in particular must use the right method to destroy the objects, and must not add any virtual methods to either class
+// Note that memory usage is significant as we create literally millions of these objects
+
+typedef unsigned hashfunc( const unsigned char *k, unsigned length, unsigned initval);
+
+struct AttrStrAtom
+{
+    unsigned hash;
+    unsigned short linkcount;
+    char str_DO_NOT_USE_DIRECTLY[1];  // Actually N
+
+    static AttrStrAtom *create(const char *k, size32_t kl, hashfunc _hash)
+    {
+#ifdef TRACE_ALL_ATOM
+        DBGLOG("TRACE_ALL_ATOM: %s", k);
+#endif
+#ifdef TRACE_ATOM_SIZE
+        totsize += sizeof(AttrStrAtom)+kl+1;
+        if (totsize > maxsize)
+        {
+            maxsize.store(totsize);
+            DBGLOG("TRACE_ATOM_SIZE: total size now %" I64F "d", maxsize.load());
+        }
+#endif
+        AttrStrAtom *ret = (AttrStrAtom *) malloc(offsetof(AttrStrAtom, str_DO_NOT_USE_DIRECTLY)+kl+1);
+        memcpy(ret->str_DO_NOT_USE_DIRECTLY, k, kl);
+        ret->str_DO_NOT_USE_DIRECTLY[kl] = 0;
+        ret->hash = _hash((const unsigned char *) k, kl, 17);
+        ret->linkcount = 0;
+        return ret;
+    }
+    static void destroy(AttrStrAtom *a)
+    {
+#ifdef TRACE_ATOM_SIZE
+        totsize -= sizeof(AttrStrAtom)+strlen(a->str_DO_NOT_USE_DIRECTLY)+1;
+#endif
+        free(a);
+    }
+
+    AttrStr *toAttrStr()
+    {
+        return (AttrStr *) &str_DO_NOT_USE_DIRECTLY;
+    }
+    static AttrStrAtom *toAtom(AttrStr *a)
+    {
+        return (AttrStrAtom *)(&a->str_DO_NOT_USE_DIRECTLY - offsetof(AttrStrAtom, str_DO_NOT_USE_DIRECTLY));
+    }
+#ifdef TRACE_ATOM_SIZE
+    static std::atomic<__int64> totsize;
+    static std::atomic<__int64> maxsize;
+#endif
+ };
+
+struct AttrStrC : public AttrStrAtom
+{
+    static inline unsigned getHash(const char *k)
+    {
+        return hashc((const byte *)k, strlen(k), 17);
+    }
+    inline bool eq(const char *k)
+    {
+        return streq(k,str_DO_NOT_USE_DIRECTLY);
+    }
+    static AttrStrC *create(const char *k)
+    {
+        size32_t kl = k ? strlen(k) : 0;
+        return (AttrStrC *) AttrStrAtom::create(k, kl, hashc);
+    }
+};
+
+struct AttrStrNC : public AttrStrAtom
+{
+    static inline unsigned getHash(const char *k)
+    {
+        return hashnc((const byte *)k, strlen(k), 17);
+    }
+    inline bool eq(const char *k)
+    {
+        return strieq(k,str_DO_NOT_USE_DIRECTLY);
+    }
+    static AttrStrNC *create(const char *k)
+    {
+        size32_t kl = k ? strlen(k) : 0;
+        return (AttrStrNC *) AttrStrAtom::create(k, kl, hashnc);
+    }
+};
+
+typedef CMinHashTable<AttrStrC> RONameTable;
+
 // NOTE - hairy code alert!
 // To save on storage (and contention) we store short string values in same slot as the pointer to longer
 // ones would occupy. This relies on the assumption that the pointers you want to store are always AT LEAST
@@ -293,9 +384,18 @@ struct PtrStrUnion
         {
 #ifdef LITTLE_ENDIAN
             char flag;
-            char chars[sizeof(PTR *)-1];
-#else
-            char chars[sizeof(PTR *)-1];
+#endif
+            union
+            {
+                char chars[sizeof(PTR *)-1];
+                struct
+                {
+                    int8_t idx1;
+                    int16_t idx2;
+                    int32_t idx4;
+                };
+            };
+#ifndef LITTLE_ENDIAN
             char flag;
 #endif
         };
@@ -308,7 +408,10 @@ struct PtrStrUnion
     inline const char *get() const
     {
         if (!isPtr())
+        {
+            assert(flag==1);
             return chars;
+        }
         else if (ptr)
             return ptr->get();
         else
@@ -372,11 +475,47 @@ struct PtrStrUnion
     }
 };
 
+#ifdef USE_STRUNION
+#define USE_READONLY_ATOMTABLE
+#endif
+
 typedef PtrStrUnion<AttrStr> AttrStrUnion;
 
+#ifdef USE_READONLY_ATOMTABLE
+struct AttrStrUnionWithTable : public AttrStrUnion
+{
+    inline const char *get() const
+    {
+        if (!isPtr() && flag==3)
+            return roNameTable->getIndex(idx4)->str_DO_NOT_USE_DIRECTLY;  // Should probably rename this back now!
+        return AttrStrUnion::get();
+    }
+    bool set(const char *key)
+    {
+        if (AttrStrUnion::set(key))
+            return true;
+        if (key && key[0]=='@')
+        {
+            unsigned idx = roNameTable->findIndex(key, AttrStrC::getHash(key));
+            if (idx != (unsigned) -1)
+            {
+                flag = 3;
+                idx4 = idx;
+                return true;
+            }
+        }
+        return false;
+    }
+    static RONameTable *roNameTable;
+};
+#else
+typedef AttrStrUnion AttrStrUnionWithTable;
+
+#endif
+
 struct AttrValue
 {
-    AttrStrUnion key;
+    AttrStrUnionWithTable key;
     AttrStrUnion value;
 };
 
@@ -515,94 +654,6 @@ protected: // data
 };
 
 
-// In order to keep code common between atom and local versions of ptree, we store the atom-specific information BEFORE the this pointer of AttrStr
-// This requires some care - in particular must use the right method to destroy the objects, and must not add any virtual methods to either class
-// Note that memory usage is significant as we create literally millions of these objects
-
-typedef unsigned hashfunc( const unsigned char *k, unsigned length, unsigned initval);
-
-struct AttrStrAtom
-{
-    unsigned hash;
-    unsigned short linkcount;
-    char str_DO_NOT_USE_DIRECTLY[1];  // Actually N
-
-    static AttrStrAtom *create(const char *k, size32_t kl, hashfunc _hash)
-    {
-#ifdef TRACE_ALL_ATOM
-        DBGLOG("TRACE_ALL_ATOM: %s", k);
-#endif
-#ifdef TRACE_ATOM_SIZE
-        totsize += sizeof(AttrStrAtom)+kl+1;
-        if (totsize > maxsize)
-        {
-            maxsize.store(totsize);
-            DBGLOG("TRACE_ATOM_SIZE: total size now %" I64F "d", maxsize.load());
-        }
-#endif
-        AttrStrAtom *ret = (AttrStrAtom *) malloc(offsetof(AttrStrAtom, str_DO_NOT_USE_DIRECTLY)+kl+1);
-        memcpy(ret->str_DO_NOT_USE_DIRECTLY, k, kl);
-        ret->str_DO_NOT_USE_DIRECTLY[kl] = 0;
-        ret->hash = _hash((const unsigned char *) k, kl, 17);
-        ret->linkcount = 0;
-        return ret;
-    }
-    static void destroy(AttrStrAtom *a)
-    {
-#ifdef TRACE_ATOM_SIZE
-        totsize -= sizeof(AttrStrAtom)+strlen(a->str_DO_NOT_USE_DIRECTLY)+1;
-#endif
-        free(a);
-    }
-
-    AttrStr *toAttrStr()
-    {
-        return (AttrStr *) &str_DO_NOT_USE_DIRECTLY;
-    }
-    static AttrStrAtom *toAtom(AttrStr *a)
-    {
-        return (AttrStrAtom *)(&a->str_DO_NOT_USE_DIRECTLY - offsetof(AttrStrAtom, str_DO_NOT_USE_DIRECTLY));
-    }
-#ifdef TRACE_ATOM_SIZE
-    static std::atomic<__int64> totsize;
-    static std::atomic<__int64> maxsize;
-#endif
- };
-
-struct AttrStrC : public AttrStrAtom
-{
-    static inline unsigned getHash(const char *k)
-    {
-        return hashc((const byte *)k, strlen(k), 17);
-    }
-    inline bool eq(const char *k)
-    {
-        return streq(k,str_DO_NOT_USE_DIRECTLY);
-    }
-    static AttrStrC *create(const char *k)
-    {
-        size32_t kl = k ? strlen(k) : 0;
-        return (AttrStrC *) AttrStrAtom::create(k, kl, hashc);
-    }
-};
-
-struct AttrStrNC : public AttrStrAtom
-{
-    static inline unsigned getHash(const char *k)
-    {
-        return hashnc((const byte *)k, strlen(k), 17);
-    }
-    inline bool eq(const char *k)
-    {
-        return strieq(k,str_DO_NOT_USE_DIRECTLY);
-    }
-    static AttrStrNC *create(const char *k)
-    {
-        size32_t kl = k ? strlen(k) : 0;
-        return (AttrStrNC *) AttrStrAtom::create(k, kl, hashnc);
-    }
-};
-
 class CAttrValHashTable
 {
     CMinHashTable<AttrStrC>  htc;

+ 3 - 2
system/jlib/jsuperhash.hpp

@@ -72,11 +72,12 @@ protected:
     bool             removeExact(void * et);
     inline void      setCache(unsigned v) const { cache = v; }
 
+    inline unsigned  doFind(const void * findParam) const
+      { return doFind(getHashFromFindParam(findParam), findParam); }
+
 private:
     bool             doAdd(void *, bool);
     void             doDeleteElement(unsigned);
-    inline unsigned  doFind(const void * findParam) const
-      { return doFind(getHashFromFindParam(findParam), findParam); }
     unsigned         doFind(unsigned, const void *) const;
     unsigned         doFindElement(unsigned, const void *) const;
     unsigned         doFindNew(unsigned) const;