浏览代码

HPCC-12693 Improve the support for huge pages

This patch checks whether transparent huge pages are enabled and, if so,
uses madvise to request huge pages for the allocated heap.  It also avoids
fragmentation problems with transparent huge pages by ensuring that
memory returned to the system is always a multiple of the huge page size.

Signed-off-by: Gavin Halliday <gavin.halliday@lexisnexis.com>
Gavin Halliday 10 年之前
父节点
当前提交
936410356e
共有 3 个文件被更改,包括 122 次插入16 次删除
  1. 76 15
      roxie/roxiemem/roxiemem.cpp
  2. 44 0
      system/jlib/jdebug.cpp
  3. 2 1
      system/jlib/jdebug.hpp

+ 76 - 15
roxie/roxiemem/roxiemem.cpp

@@ -97,6 +97,8 @@ static unsigned heapLWM;
 static unsigned heapLargeBlocks;
 static unsigned heapLargeBlockGranularity;
 static ILargeMemCallback * heapLargeBlockCallback;
+static bool heapNotifyUnusedEachFree = true;
+static bool heapNotifyUnusedEachBlock = false;
 static unsigned __int64 lastStatsCycles;
 static unsigned __int64 statsCyclesInterval;
 
@@ -107,6 +109,7 @@ static atomic_t dataBuffersActive;
 const unsigned UNSIGNED_BITS = sizeof(unsigned) * 8;
 const unsigned UNSIGNED_ALLBITS = (unsigned) -1;
 const unsigned TOPBITMASK = 1<<(UNSIGNED_BITS-1);
+const memsize_t heapBlockSize = UNSIGNED_BITS*HEAP_ALIGNMENT_SIZE;
 
 template <typename VALUE_TYPE, typename ALIGN_TYPE>
 inline VALUE_TYPE align_pow2(VALUE_TYPE value, ALIGN_TYPE alignment)
@@ -116,6 +119,18 @@ inline VALUE_TYPE align_pow2(VALUE_TYPE value, ALIGN_TYPE alignment)
 
 #define PAGES(x, alignment)    (((x) + ((alignment)-1)) / (alignment))           // hope the compiler converts to a shift
 
+//Tell the operating system that the contents of [address, address+size) are no longer needed.
+//Compiles to nothing unless NOTIFY_UNUSED_PAGES_ON_FREE is defined.  The result of the
+//underlying system call is ignored.
+inline void notifyMemoryUnused(void * address, memsize_t size)
+{
+#if defined(NOTIFY_UNUSED_PAGES_ON_FREE) && defined(_WIN32)
+    //windows: reset the pages
+    VirtualAlloc(address, size, MEM_RESET, PAGE_READWRITE);
+#elif defined(NOTIFY_UNUSED_PAGES_ON_FREE)
+    //linux (and anything else): mark the range as unwanted
+    madvise(address, size, MADV_DONTNEED);
+#endif
+}
+
 //---------------------------------------------------------------------------------------------------------------------
 
 typedef MapBetween<unsigned, unsigned, memsize_t, memsize_t> MapActivityToMemsize;
@@ -157,6 +172,8 @@ static void initializeHeap(bool allowHugePages, unsigned pages, unsigned largeBl
         if (heapBase != MAP_FAILED)
         {
             heapUseHugePages = true;
+            //MORE: At the moment I'm not sure calling madvise() has any benefit, but needs testing before releasing
+            //heapNotifyUnusedEachFree = false;
             DBGLOG("Using Huge Pages for roxiemem");
         }
         else
@@ -172,8 +189,13 @@ static void initializeHeap(bool allowHugePages, unsigned pages, unsigned largeBl
 
     if (!heapBase)
     {
+        const memsize_t hugePageSize = getHugePageSize();
+        memsize_t heapAlignment = allowHugePages ? hugePageSize : HEAP_ALIGNMENT_SIZE;
+        if (heapAlignment < HEAP_ALIGNMENT_SIZE)
+            heapAlignment = HEAP_ALIGNMENT_SIZE;
+
         int ret;
-        if ((ret = posix_memalign((void **) &heapBase, HEAP_ALIGNMENT_SIZE, memsize)) != 0) {
+        if ((ret = posix_memalign((void **) &heapBase, heapAlignment, memsize)) != 0) {
 
         	switch (ret)
         	{
@@ -197,6 +219,29 @@ static void initializeHeap(bool allowHugePages, unsigned pages, unsigned largeBl
         	}
             HEAPERROR("RoxieMemMgr: Unable to create heap");
         }
+
+        //If we are allowed to use huge pages, then mark huge pages as beneficial
+        if (allowHugePages)
+        {
+            if (areTransparentHugePagesEnabled())
+            {
+                if (madvise(heapBase,memsize,MADV_HUGEPAGE) == 0)
+                {
+                    //Prevent the transparent huge page code from working hard trying to defragment memory when single heaplets are released
+                    heapNotifyUnusedEachFree = false;
+                    if ((heapBlockSize % hugePageSize) == 0)
+                    {
+                        //If we notify heapBlockSize items at a time it will always be a multiple of hugePageSize so shouldn't trigger defragmentation
+                        heapNotifyUnusedEachBlock = true;
+                        DBGLOG("Heap advised as worth using huge pages - memory released in blocks");
+                    }
+                    else
+                        DBGLOG("Heap advised as worth using huge pages - MEMORY WILL NOT BE RELEASED");
+                }
+            }
+            else
+                DBGLOG("Huge pages requested, but transparent huge pages currently disabled");
+        }
     }
 #endif
 
@@ -553,18 +598,15 @@ static void subfree_aligned(void *ptr, unsigned pages = 1)
         DBGLOG("RoxieMemMgr: Incorrect alignment of freed area (ptr=%p)", ptr);
         HEAPERROR("RoxieMemMgr: Incorrect alignment of freed area");
     }
-#ifdef NOTIFY_UNUSED_PAGES_ON_FREE
-#ifdef _WIN32
-    VirtualAlloc(ptr, pages*HEAP_ALIGNMENT_SIZE, MEM_RESET, PAGE_READWRITE);
-#else
-    // for linux mark as unwanted
-    madvise(ptr,pages*HEAP_ALIGNMENT_SIZE,MADV_DONTNEED);
-#endif
-#endif
+    if (heapNotifyUnusedEachFree)
+        notifyMemoryUnused(ptr, pages*HEAP_ALIGNMENT_SIZE);
+
     unsigned wordOffset = (unsigned) (pageOffset / UNSIGNED_BITS);
     unsigned bitOffset = (unsigned) (pageOffset % UNSIGNED_BITS);
     unsigned mask = 1<<bitOffset;
     unsigned nextPageOffset = (pageOffset+pages + (UNSIGNED_BITS-1)) / UNSIGNED_BITS;
+    char * firstReleaseBlock = NULL;
+    char * lastReleaseBlock = NULL;
     {
         CriticalBlock b(heapBitCrit);
         heapAllocated -= pages;
@@ -590,7 +632,17 @@ static void subfree_aligned(void *ptr, unsigned pages = 1)
         {
             unsigned prev = heapBitmap[wordOffset];
             if ((prev & mask) == 0)
-                heapBitmap[wordOffset] = (prev|mask);
+            {
+                unsigned next = prev | mask;
+                heapBitmap[wordOffset] = next;
+                if ((next == UNSIGNED_ALLBITS) && heapNotifyUnusedEachBlock)
+                {
+                    char * address = heapBase + wordOffset * heapBlockSize;
+                    if (!firstReleaseBlock)
+                        firstReleaseBlock = address;
+                    lastReleaseBlock = address;
+                }
+            }
             else
                 HEAPERROR("RoxieMemMgr: Page freed twice");
             if (!--pages)
@@ -604,6 +656,10 @@ static void subfree_aligned(void *ptr, unsigned pages = 1)
                 mask <<= 1;
         }
     }
+
+    if (firstReleaseBlock)
+        notifyMemoryUnused(firstReleaseBlock, (lastReleaseBlock - firstReleaseBlock) + heapBlockSize);
+
     if (memTraceLevel >= 2)
         DBGLOG("RoxieMemMgr: subfree_aligned() %u pages ok - addr=%p heapLWM=%u totalPages=%u", _pages, ptr, heapLWM, heapTotalPages);
 }
@@ -4529,6 +4585,8 @@ protected:
             _heapLWM = heapLWM;
             _heapAllocated = heapAllocated;
             _heapUseHugePages = heapUseHugePages;
+            _heapNotifyUnusedEachFree = heapNotifyUnusedEachFree;
+            _heapNotifyUnusedEachBlock = heapNotifyUnusedEachBlock;
         }
         ~HeapPreserver()
         {
@@ -4540,6 +4598,8 @@ protected:
             heapLWM = _heapLWM;
             heapAllocated = _heapAllocated;
             heapUseHugePages = _heapUseHugePages;
+            heapNotifyUnusedEachFree = _heapNotifyUnusedEachFree;
+            heapNotifyUnusedEachBlock = _heapNotifyUnusedEachBlock;
         }
         char *_heapBase;
         char *_heapEnd;
@@ -4549,6 +4609,8 @@ protected:
         unsigned _heapLWM;
         unsigned _heapAllocated;
         bool _heapUseHugePages;
+        bool _heapNotifyUnusedEachFree;
+        bool _heapNotifyUnusedEachBlock;
     };
     void initBitmap(unsigned size)
     {
@@ -4743,6 +4805,8 @@ protected:
         HeapPreserver preserver;
 
         initBitmap(maxBitmapSize);
+        heapNotifyUnusedEachFree = false; // prevent calls to map out random chunks of memory!
+        heapNotifyUnusedEachBlock = false;
 
         Semaphore sem;
         BitmapAllocatorThread * threads[numBitmapThreads];
@@ -4773,12 +4837,9 @@ protected:
     }
     void testBitmapThreading()
     {
-#ifndef NOTIFY_UNUSED_PAGES_ON_FREE
-        //Don't run this with NOTIFY_UNUSED_PAGES_ON_FREE enabled - I'm not sure what the calls to map out random memory are likely to do!
         testBitmapThreading(1);
         testBitmapThreading(3);
         testBitmapThreading(11);
-#endif
     }
 
     void testHuge()
@@ -5670,7 +5731,7 @@ public:
 protected:
     void testSetup()
     {
-        setTotalMemoryLimit(false, memorySize, 0, NULL, NULL);
+        setTotalMemoryLimit(true, memorySize, 0, NULL, NULL);
     }
 
     void testCleanup()
@@ -5866,7 +5927,7 @@ public:
 protected:
     void testSetup()
     {
-        setTotalMemoryLimit(false, hugeMemorySize, 0, NULL, NULL);
+        setTotalMemoryLimit(true, hugeMemorySize, 0, NULL, NULL);
     }
 
     void testCleanup()

+ 44 - 0
system/jlib/jdebug.cpp

@@ -2778,6 +2778,50 @@ void PrintMemoryReport(bool full)
 #endif
 
 
+//Report whether the kernel's transparent huge page support is switched on.
+//On linux this reads /sys/kernel/mm/transparent_hugepage/enabled and returns true unless the
+//"[never]" option is the selected one.  Returns false if the file cannot be loaded (loadFile()
+//is presumed to signal failure by throwing an IException), and on all other platforms.
+bool areTransparentHugePagesEnabled()
+{
+    bool enabled = false;
+#ifdef __linux__
+    StringBuffer setting;
+    try
+    {
+        setting.loadFile("/sys/kernel/mm/transparent_hugepage/enabled");
+        //The selected option is shown in square brackets, e.g. "always [madvise] never"
+        const char * disabledMarker = strstr(setting.str(), "[never]");
+        enabled = (disabledMarker == NULL);
+    }
+    catch (IException * e)
+    {
+        //Best effort only - treat an unreadable setting as "not enabled"
+        e->Release();
+    }
+#endif
+    return enabled;
+}
+
+//Return the system huge page size in bytes.
+//On linux the value is taken from the "Hugepagesize:   xxxx kB" entry in /proc/meminfo.
+//If the entry is missing, malformed, zero, or too large to express in bytes (or the file cannot
+//be loaded, or on other platforms) the x86 default of 2MB is returned instead.
+//The result is never 0, so it is safe for callers to use it as a divisor or as an alignment.
+memsize_t getHugePageSize()
+{
+#ifdef __linux__
+    StringBuffer contents;
+    try
+    {
+        //Search for an entry   Hugepagesize:      xxxx kB
+        const char * const tag = "Hugepagesize:";
+        contents.loadFile("/proc/meminfo");
+        const char * hugepage = strstr(contents.str(), tag);
+        if (hugepage)
+        {
+            const char * next = hugepage + strlen(tag);
+            char * end;
+            memsize_t size = strtoul(next, &end, 10);
+            const memsize_t maxSizeInKb = ((memsize_t)-1) / 0x400;
+            //Only trust the value if digits were actually consumed, it is non-zero (a zero size would
+            //cause a divide by zero in callers), it does not overflow when scaled, and the unit is kB.
+            if ((end != next) && (size != 0) && (size <= maxSizeInKb) && (strncmp(end, " kB", 3) == 0))
+                return size * 0x400;
+        }
+    }
+    catch (IException * e)
+    {
+        //Best effort only - fall through to the default
+        e->Release();
+    }
+#endif
+    return 0x200000; // Default for an x86 system
+}
+
 //===========================================================================
 
 #ifdef LEAK_CHECK

+ 2 - 1
system/jlib/jdebug.hpp

@@ -307,7 +307,8 @@ extern jlib_decl unsigned getAffinityCpus();
 extern jlib_decl void printProcMap(const char *fn, bool printbody, bool printsummary, StringBuffer *lnout, MemoryBuffer *mb, bool useprintf);
 extern jlib_decl void PrintMemoryReport(bool full=true);
 extern jlib_decl void printAllocationSummary();
-
+extern jlib_decl bool areTransparentHugePagesEnabled();
+extern jlib_decl memsize_t getHugePageSize();
 
 #endif