Преглед на файлове

Merge branch 'candidate-7.12.x'

Signed-off-by: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman преди 4 години
родител
ревизия
82bbdd787d
променени са 41 файла, в които са добавени 231 реда и са изтрити 70 реда
  1. 1 1
      .github/workflows/build-and-publish-debug.yml
  2. 15 1
      .github/workflows/build-and-publish.yml
  3. 15 5
      .github/workflows/build-test-eclwatch.yml
  4. 17 8
      .github/workflows/build-windows.yml
  5. 15 7
      .github/workflows/test-helm.yml
  6. 6 0
      dali/sasha/saserver.cpp
  7. 9 0
      dali/server/daserver.cpp
  8. 37 4
      docs/EN_US/ECLWatch/TheECLWatchMan.xml
  9. BIN
      docs/EN_US/images/ECLWaGage01.jpg
  10. BIN
      docs/EN_US/images/ECLWaGage02.jpg
  11. BIN
      docs/PT_BR/images/ECLWaGage01.jpg
  12. BIN
      docs/PT_BR/images/ECLWaGage02.jpg
  13. 18 2
      ecl/agentexec/agentexec.cpp
  14. 5 0
      ecl/eclccserver/eclccserver.cpp
  15. 2 1
      esp/platform/espcfg.cpp
  16. 2 2
      esp/services/ws_topology/ws_topologyService.cpp
  17. 1 1
      esp/src/eclwatch/PreflightDetailsWidget.js
  18. 1 1
      esp/src/eclwatch/TargetClustersQueryWidget.js
  19. 1 1
      esp/src/eclwatch/templates/MachineInformationWidget.html
  20. 7 0
      fs/dafilesrv/dafilesrv.cpp
  21. 1 0
      helm/hpcc/templates/eclagent.yaml
  22. 1 0
      helm/hpcc/templates/eclccserver.yaml
  23. 0 4
      roxie/ccd/ccdcontext.cpp
  24. 2 1
      roxie/ccd/ccddali.cpp
  25. 6 5
      roxie/ccd/ccdlistener.cpp
  26. 15 3
      roxie/ccd/ccdmain.cpp
  27. 6 5
      roxie/ccd/ccdprotocol.cpp
  28. 2 1
      roxie/ccd/ccdqueue.cpp
  29. 4 0
      roxie/ccd/ccdstate.cpp
  30. 1 1
      roxie/ccd/hpccprotocol.hpp
  31. 15 6
      roxie/roxiemem/roxiemem.cpp
  32. 1 1
      roxie/udplib/udpipmap.hpp
  33. 5 1
      roxie/udplib/udplib.hpp
  34. 0 1
      roxie/udplib/udptopo.cpp
  35. 1 3
      system/jlib/jsocket.cpp
  36. 3 1
      system/jlib/jthread.cpp
  37. 1 1
      testing/unittests/unittests.cpp
  38. 2 1
      thorlcr/activities/diskread/thdiskreadslave.cpp
  39. 5 0
      thorlcr/activities/hashdistrib/thhashdistribslave.cpp
  40. 4 1
      thorlcr/master/thmastermain.cpp
  41. 4 0
      thorlcr/slave/thslavemain.cpp

+ 1 - 1
.github/workflows/build-and-publish-debug.yml

@@ -14,7 +14,7 @@ on:
 
 jobs:
   build:
-    name: Build
+    name: "Build and publish debug container"
     runs-on: ubuntu-latest
     if: github.repository == 'hpcc-systems/HPCC-Platform'
     steps:

+ 15 - 1
.github/workflows/build-and-publish.yml

@@ -14,7 +14,7 @@ on:
 
 jobs:
   build:
-    name: Build
+    name: "Build and publish release container"
     runs-on: ubuntu-latest
     if: github.repository == 'hpcc-systems/HPCC-Platform'
     steps:
@@ -26,3 +26,17 @@ jobs:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
           latest: 1   # this should only be set on the current minor branch
+
+      - name: Notify on failure
+        uses: hpcc-systems/action-send-mail@v2
+        if: ${{ failure() }}
+        with:
+          server_address: smtp.gmail.com
+          server_port: 465
+          username: ${{secrets.GMAIL_USER}}
+          password: ${{secrets.GMAIL_PW}}
+          subject: Github Actions job result
+          body: Build job of ${{github.repository}} failed!
+          to: ${{secrets.GMAIL_RECIPIENTS}}
+          from: Github Actions
+

+ 15 - 5
.github/workflows/build-test-eclwatch.yml

@@ -10,8 +10,6 @@ on:
       - "!candidate-7.2.*"
       - "!candidate-7.0.*"
       - "!candidate-6.*"
-    paths:
-      - "esp/src/**/*"
   pull_request:
     branches:
       - "master"
@@ -21,12 +19,24 @@ on:
       - "!candidate-7.2.*"
       - "!candidate-7.0.*"
       - "!candidate-6.*"
-    paths:
-      - "esp/src/**/*"
 
 jobs:
+  pre_job:
+    runs-on: ubuntu-latest
+    # Map a step output to a job output
+    outputs:
+      should_skip: ${{ steps.skip_check.outputs.should_skip }}
+    steps:
+      - id: skip_check
+        uses: hpcc-systems/skip-duplicate-actions@master
+        with:
+          github_token: ${{ github.token }}
+          paths: '["esp/src/**/*" ]'
+
   build:
-    if: github.repository == 'hpcc-systems/HPCC-Platform'
+    name: "Check eclwatch and npm"
+    needs: pre_job
+    if: ${{ needs.pre_job.outputs.should_skip != 'true' && github.repository == 'hpcc-systems/HPCC-Platform' }}
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2

+ 17 - 8
.github/workflows/build-windows.yml

@@ -5,23 +5,32 @@ on:
     branches:
       - "master"
       - "candidate-*"
+      - "!candidate-7.10.*"
       - "!candidate-7.8.*"
       - "!candidate-7.6.*"
       - "!candidate-7.4.*"
       - "!candidate-7.2.*"
       - "!candidate-7.0.*"
       - "!candidate-6.*"
-    paths-ignore:
-      - docs/**
-      - esp/src/**/*
-      - dockerfiles/**
-      - helm/**
-      - ecllibrary/**
-      - testing/**
-      - initfiles/**
 
 jobs:
+  pre_job:
+    # continue-on-error: true # Uncomment once integration is finished
+    runs-on: ubuntu-latest
+    # Map a step output to a job output
+    outputs:
+      should_skip: ${{ steps.skip_check.outputs.should_skip }}
+    steps:
+      - id: skip_check
+        uses: hpcc-systems/skip-duplicate-actions@master
+        with:
+          github_token: ${{ github.token }}
+          paths_ignore: '["docs/**", "esp/src/**/*", "dockerfiles/**", "helm/**", "ecllibrary/**", "testing/**", "initfiles/**" ]'
+
   build:
+    name: "Check compiles on Win32"
+    needs: pre_job
+    if: ${{ needs.pre_job.outputs.should_skip != 'true' }}
     runs-on: windows-latest
     steps:
       - name: Display build summary

+ 15 - 7
.github/workflows/test-helm.yml

@@ -11,9 +11,6 @@ on:
       - "!candidate-7.2.*"
       - "!candidate-7.0.*"
       - "!candidate-6.*"
-    paths:
-      - "helm/**/*"
-      - "testing/helm/**/*"
   pull_request:
     branches:
       - "master"
@@ -24,14 +21,25 @@ on:
       - "!candidate-7.2.*"
       - "!candidate-7.0.*"
       - "!candidate-6.*"
-    paths:
-      - "helm/**/*"
-      - "testing/helm/**/*"
 
 jobs:
+  pre_job:
+     # continue-on-error: true # Uncomment once integration is finished
+     runs-on: ubuntu-latest
+     # Map a step output to a job output
+     outputs:
+       should_skip: ${{ steps.skip_check.outputs.should_skip }}
+     steps:
+       - id: skip_check
+         uses: hpcc-systems/skip-duplicate-actions@master
+         with:
+           github_token: ${{ github.token }}
+           paths: '["helm/**/*", "testing/helm/**/*" ]'
   build:
+    name: "Check helm chart lint"
     runs-on: ubuntu-latest
-    if: github.repository == 'hpcc-systems/HPCC-Platform'
+    needs: pre_job
+    if: ${{ github.repository == 'hpcc-systems/HPCC-Platform' && needs.pre_job.outputs.should_skip != 'true' }}
     steps:
       - uses: actions/checkout@v2
         with:

+ 6 - 0
dali/sasha/saserver.cpp

@@ -351,7 +351,9 @@ int main(int argc, const char* argv[])
                 coalesceDatastore(force);
             }
             else {
+#ifndef _CONTAINERIZED
                 startPerformanceMonitor(serverConfig->getPropInt("@perfReportDelay", DEFAULT_PERF_REPORT_DELAY)*1000);
+#endif
                 AddServers();
                 addAbortHandler(actionOnAbort);
 
@@ -397,7 +399,9 @@ int main(int argc, const char* argv[])
                 delete stopThread;
 
                 PROGLOG("SASERVER exiting");
+#ifndef _CONTAINERIZED
                 stopPerformanceMonitor();
+#endif
             }
             delete SashaServerStatus;
             SashaServerStatus = NULL;
@@ -405,7 +409,9 @@ int main(int argc, const char* argv[])
     }
     catch(IException *e){ 
         EXCLOG(e, "Sasha Server Exception: ");
+#ifndef _CONTAINERIZED
         stopPerformanceMonitor();
+#endif
         e->Release();
     }
     catch (const char *s) {

+ 9 - 0
dali/server/daserver.cpp

@@ -687,7 +687,10 @@ int main(int argc, const char* argv[])
             UseSysLogForOperatorMessages();
         AddServers(auditDir.str());
         addAbortHandler(actionOnAbort);
+
+#ifndef _CONTAINERIZED
         startPerformanceMonitor(serverConfig->getPropInt("Coven/@perfReportDelay", DEFAULT_PERF_REPORT_DELAY)*1000);
+#endif
         StringBuffer absPath;
         makeAbsolutePath(dataPath.str(), absPath);
         setPerformanceMonitorPrimaryFileSystem(absPath.str());
@@ -710,7 +713,9 @@ int main(int argc, const char* argv[])
         {
             EXCLOG(e, "Failed whilst starting servers");
             stopServer();
+#ifndef _CONTAINERIZED
             stopPerformanceMonitor();
+#endif
             throw;
         }
         try {
@@ -730,7 +735,9 @@ int main(int argc, const char* argv[])
         catch (IException *e) {
             EXCLOG(e, "LDAP initialization error");
             stopServer();
+#ifndef _CONTAINERIZED
             stopPerformanceMonitor();
+#endif
             throw;
         }
         PROGLOG("DASERVER[%d] starting - listening to port %d",myrank,queryMyNode()->endpoint().port);
@@ -754,7 +761,9 @@ int main(int argc, const char* argv[])
             removeAbortHandler(actionOnAbort);
         }
         stopServer();
+#ifndef _CONTAINERIZED
         stopPerformanceMonitor();
+#endif
     }
     catch (IException *e) {
         EXCLOG(e, "Exception");

+ 37 - 4
docs/EN_US/ECLWatch/TheECLWatchMan.xml

@@ -710,12 +710,45 @@
       page in ECL Watch click on the <emphasis role="bold">ECL
       Watch</emphasis> image at the top of any page, as shown above.</para>
 
-      <sect2>
-        <title>Cluster Disk Usage</title>
+      <sect2 id="ClusterDiskUsageGraphs">
+        <title>Cluster Disk Usage Graphs</title>
 
         <para>The Cluster Activity page displays graphs along the top showing
-        cluster disk usage. Clicking on each of the images displays more
-        information about the individual disk usage activity.</para>
+        cluster disk usage. These graphs can give you a quick glance at the
+        capacity of your clusters. Clicking on each of the images displays
+        more information about the individual cluster's disk usage
+        activity.</para>
+
+        <para><figure>
+            <title>Cluster Activity Page Graphs</title>
+
+            <mediaobject>
+              <imageobject>
+                <imagedata fileref="images/ECLWaGage01.jpg" />
+              </imageobject>
+            </mediaobject>
+          </figure>The graphs show the amount of available storage, the
+        average amount of storage in use, and the maximum amount of storage in
+        use across all nodes. The dark indicator line displays the average in
+        use across the disks in that cluster. The colored doughnut portion
+        shows the maximum amount of storage in use by any single node. If the
+        maximum differs greatly from the average, it could affect performance. </para>
+
+        <para>The color of the graphs changes as the storage capacity changes.
+        Green indicates low utilization, yellow indicates higher utilization,
+        and red indicates very high utilization.</para>
+
+        <para><figure>
+            <title>Cluster Graph Detail</title>
+
+            <mediaobject>
+              <imageobject>
+                <imagedata fileref="images/ECLWaGage02.jpg" />
+              </imageobject>
+            </mediaobject>
+          </figure>The graphs provide an indication of available capacity
+        across your nodes. This can help identify any potential disk space
+        issues. </para>
       </sect2>
 
       <sect2 id="ECLWatch_ClusterActivity">

BIN
docs/EN_US/images/ECLWaGage01.jpg


BIN
docs/EN_US/images/ECLWaGage02.jpg


BIN
docs/PT_BR/images/ECLWaGage01.jpg


BIN
docs/PT_BR/images/ECLWaGage02.jpg


+ 18 - 2
ecl/agentexec/agentexec.cpp

@@ -26,14 +26,17 @@
 #include "environment.hpp"
 #include "dafdesc.hpp"
 
-class CEclAgentExecutionServer : public CInterfaceOf<IThreadFactory>
+class CEclAgentExecutionServer : public CInterfaceOf<IThreadFactory>, implements IAbortHandler
 {
 public:
+    IMPLEMENT_IINTERFACE_USING(CInterfaceOf<IThreadFactory>);
+
     CEclAgentExecutionServer(IPropertyTree *config);
     ~CEclAgentExecutionServer();
 
     int run();
     virtual IPooledThread *createNew() override;
+    virtual bool onAbort() override;
 private:
     bool executeWorkunit(const char * wuid);
 
@@ -45,6 +48,7 @@ private:
 #ifdef _CONTAINERIZED
     Owned<IThreadPool> pool;
 #endif
+    std::atomic<bool> running = { false };
 };
 
 //---------------------------------------------------------------------------------
@@ -133,7 +137,9 @@ int CEclAgentExecutionServer::run()
 
     try 
     {
-        while (true)
+        running = true;
+        LocalIAbortHandler abortHandler(*this);
+        while (running)
         {
 #ifdef _CONTAINERIZED
             if (!pool->waitAvailable(10000))
@@ -170,6 +176,7 @@ int CEclAgentExecutionServer::run()
                 break;
             }
         }
+        DBGLOG("Closing down");
     }
 
     catch (IException *e) 
@@ -305,6 +312,15 @@ IPooledThread *CEclAgentExecutionServer::createNew()
 #endif
 }
 
+bool CEclAgentExecutionServer::onAbort()
+{
+    DBGLOG("Close down requested");
+    running = false;
+    if (queue)
+        queue->cancelAcceptConversation();
+    return false;
+}
+
 bool CEclAgentExecutionServer::executeWorkunit(const char * wuid)
 {
 #ifdef _CONTAINERIZED

+ 5 - 0
ecl/eclccserver/eclccserver.cpp

@@ -907,9 +907,12 @@ int main(int argc, const char *argv[])
         }
         else
         {
+#ifndef _CONTAINERIZED
             unsigned optMonitorInterval = globals->getPropInt("@monitorInterval", 60);
             if (optMonitorInterval)
                 startPerformanceMonitor(optMonitorInterval*1000, PerfMonStandard, nullptr);
+#endif
+
 #ifdef _CONTAINERIZED
             bool filtered = false;
             std::unordered_map<std::string, bool> listenQueues;
@@ -967,7 +970,9 @@ int main(int argc, const char *argv[])
     {
         IERRLOG("Terminating unexpectedly");
     }
+#ifndef _CONTAINERIZED
     stopPerformanceMonitor();
+#endif
     globals.clear();
     UseSysLogForOperatorMessages(false);
     ::closedownClientProcess(); // dali client closedown

+ 2 - 1
esp/platform/espcfg.cpp

@@ -364,10 +364,11 @@ CEspConfig::CEspConfig(IProperties* inputs, IPropertyTree* envpt, IPropertyTree*
         const unsigned dafilesrvConnectTimeout = m_cfg->getPropInt("@dafilesrvConnectTimeout", 10)*1000;
         const unsigned dafilesrvReadTimeout = m_cfg->getPropInt("@dafilesrvReadTimeout", 10)*1000;
         setRemoteFileTimeouts(dafilesrvConnectTimeout, dafilesrvReadTimeout);
-
+#ifndef _CONTAINERIZED
 #ifndef _DEBUG
         startPerformanceMonitor(m_cfg->getPropInt("@perfReportDelay", 60)*1000);
 #endif
+#endif
 
         IPropertyTreeIterator *pt_iter = NULL;
         StringBuffer xpath;

+ 2 - 2
esp/services/ws_topology/ws_topologyService.cpp

@@ -1032,7 +1032,7 @@ void CWsTopologyEx::readTpLogFileRequest(IEspContext &context, const char* fileN
             throw MakeStringException(ECLWATCH_INVALID_INPUT, "This log file has no timestamp.");
 
         if  (req.getLastHours_isNull())
-            throw MakeStringException(ECLWATCH_INVALID_INPUT, "Invlid 'Hours' field.");
+            throw MakeStringException(ECLWATCH_INVALID_INPUT, "Invalid 'Hours' field.");
 
         readLogReq.lastHours = req.getLastHours();
         unsigned hour, minute, second, nano;
@@ -1051,7 +1051,7 @@ void CWsTopologyEx::readTpLogFileRequest(IEspContext &context, const char* fileN
             throw MakeStringException(ECLWATCH_INVALID_INPUT, "This log file has no timestamp.");
 
         if ((readLogReq.startDate.length() < 8) && (readLogReq.endDate.length() < 8))
-            throw MakeStringException(ECLWATCH_INVALID_INPUT, "Invlid 'Time' field.");
+            throw MakeStringException(ECLWATCH_INVALID_INPUT, "Invalid 'Time' field.");
         break;
     }
     }

+ 1 - 1
esp/src/eclwatch/PreflightDetailsWidget.js

@@ -164,7 +164,7 @@ define([
                             CPULoad: {
                                 label: this.i18n.CPULoad,
                                 renderCell: function (object, value, node, options) {
-                                    switch (request < value) {
+                                    switch ( value > request) {
                                         case true:
                                             domClass.add(node, "ErrorCell");
                                             break;

+ 1 - 1
esp/src/eclwatch/TargetClustersQueryWidget.js

@@ -181,7 +181,7 @@ define([
             });
 
             retVal.on(".dgrid-row:dblclick", function (evt) {
-                event.preventDefault();
+                evt.preventDefault();
             });
 
             retVal.on("dgrid-select", function (event) {

+ 1 - 1
esp/src/eclwatch/templates/MachineInformationWidget.html

@@ -21,7 +21,7 @@
                         <option colspan="1" value="0" selected="true">%</option>
                         <option colspan="1" value="1" selected="true">MB</option>
                     </select>
-                    <input id="${id}DiskThreshold" title="${i18n.WarnIfAvailableDiskSpaceIsUnder}:" name="DiskThreshold" placeholder="${i18n.EnterAPercentageOrMB}" colspan="2" value="95" checked data-dojo-type="dijit.form.ValidationTextBox" />
+                    <input id="${id}DiskThreshold" title="${i18n.WarnIfAvailableDiskSpaceIsUnder}:" name="DiskThreshold" placeholder="${i18n.EnterAPercentageOrMB}" colspan="2" value="5" checked data-dojo-type="dijit.form.ValidationTextBox" />
                     <select data-dojo-type="dijit.form.Select" id="${id}DiskThresholdType" name="DiskThresholdType" class="miniSelect">
                         <option colspan="1" value="0" selected="true">%</option>
                         <option colspan="1" value="1" selected="true">MB</option>

+ 7 - 0
fs/dafilesrv/dafilesrv.cpp

@@ -834,6 +834,7 @@ int main(int argc,char **argv)
     server->setThrottle(ThrottleStd, parallelRequestLimit, throttleDelayMs, throttleCPULimit);
     server->setThrottle(ThrottleSlow, parallelSlowRequestLimit, throttleSlowDelayMs, throttleSlowCPULimit);
 
+#ifndef _CONTAINERIZED
     class CPerfHook : public CSimpleInterfaceOf<IPerfMonHook>
     {
     public:
@@ -850,6 +851,8 @@ int main(int argc,char **argv)
         }
     } perfHook;
     startPerformanceMonitor(10*60*1000, PerfMonStandard, &perfHook);
+#endif
+
     writeSentinelFile(sentinelFile);
     try
     {
@@ -869,7 +872,11 @@ int main(int argc,char **argv)
             removeSentinelFile(sentinelFile); // so init does not keep trying to start it ...
         e->Release();
     }
+
+#ifndef _CONTAINERIZED
     stopPerformanceMonitor();
+#endif
+
     if (server)
         server->stop();
     server.clear();

+ 1 - 0
helm/hpcc/templates/eclagent.yaml

@@ -29,6 +29,7 @@ spec:
                 {{ include "hpcc.configArg" . }},
                 {{ include "hpcc.daliArg" $ }}
               ]
+{{ include "hpcc.addSentinelProbes" . | indent 8 }}
 {{- include "hpcc.addSecurityContext" (dict "root" $ "me" .) | indent 8 }}
 {{- if .useChildProcesses }}
 {{- include "hpcc.addResources" (dict "me" .resources) | indent 8 }}

+ 1 - 0
helm/hpcc/templates/eclccserver.yaml

@@ -29,6 +29,7 @@ spec:
                 {{ include "hpcc.configArg" . }},
                 {{ include "hpcc.daliArg" $ }}
               ]
+{{ include "hpcc.addSentinelProbes" . | indent 8 }}
 {{- include "hpcc.addSecurityContext" (dict "root" $ "me" .) | indent 8 }}
 {{- if .useChildProcesses }}
 {{- include "hpcc.addResources" (dict "me" .resources) | indent 8 }}

+ 0 - 4
roxie/ccd/ccdcontext.cpp

@@ -1391,10 +1391,6 @@ public:
         if (linuxYield)
             sched_yield();
 #endif
-#ifdef _DEBUG
-        if (shuttingDown)
-            throw MakeStringException(ROXIE_FORCE_SHUTDOWN, "Roxie is shutting down");
-#endif
         if (aborted) // NOTE - don't bother getting lock before reading this (for speed) - a false read is very unlikely and not a problem
         {
             CriticalBlock b(abortLock);

+ 2 - 1
roxie/ccd/ccddali.cpp

@@ -197,7 +197,8 @@ private:
                 }
                 else if (owner->connect(ROXIE_DALI_CONNECT_TIMEOUT))
                 {
-                    DBGLOG("roxie: CRoxieDaliConnectWatcher reconnected");
+                    if (traceLevel)
+                        DBGLOG("CRoxieDaliConnectWatcher reconnected");
                     try
                     {
                         owner->disconnectSem.wait();

+ 6 - 5
roxie/ccd/ccdlistener.cpp

@@ -751,7 +751,7 @@ public:
         started.wait();
     }
 
-    virtual bool stop(unsigned timeout)
+    virtual bool stop()
     {
         if (running)
         {
@@ -759,7 +759,7 @@ public:
             join();
             Release();
         }
-        return pool->joinAll(false, timeout);
+        return pool->joinAll(false);
     }
 
     void reportBadQuery(const char *name, const IRoxieContextLogger &logctx)
@@ -973,14 +973,14 @@ public:
     {
         UNIMPLEMENTED;
     }
-    virtual bool stop(unsigned timeout)
+    virtual bool stop()
     {
         if (queue)
         {
             DBGLOG("RoxieWorkUnitListener::stop");
             queue->cancelAcceptConversation();
         }
-        return RoxieListener::stop(timeout);
+        return RoxieListener::stop();
     }
 
     virtual void stopListening()
@@ -1092,7 +1092,8 @@ public:
 
     virtual bool stop() override
     {
-        IERRLOG("RoxieQueryWorker stopped with queries active");
+        if (traceLevel)
+            DBGLOG("RoxieQueryWorker thread stop requested with query active - ignoring");
         return true;
     }
 

+ 15 - 3
roxie/ccd/ccdmain.cpp

@@ -209,7 +209,9 @@ unsigned leafCacheMB = 50;
 unsigned blobCacheMB = 0;
 
 unsigned roxiePort = 0;
+#ifndef _CONTAINERIZED
 Owned<IPerfMonHook> perfMonHook;
+#endif
 
 MODULE_INIT(INIT_PRIORITY_STANDARD)
 {
@@ -730,6 +732,7 @@ int CCD_API roxie_main(int argc, const char *argv[], const char * defaultYaml)
         if (standAloneDll || wuid)
         {
             oneShotRoxie = true;
+            DBGLOG("Starting roxie - wuid=%s", wuid ? wuid : "<none>");
             allFilesDynamic = true;
             if (topology->getPropBool("@server", false))
             {
@@ -1118,11 +1121,12 @@ int CCD_API roxie_main(int argc, const char *argv[], const char * defaultYaml)
             mtu_size = 1400;    // upper limit on outbound buffer size - allow some header room too
             roxiemem::setDataAlignmentSize(0x400);
         }
-        unsigned pinterval = topology->getPropInt("@systemMonitorInterval",1000*60);
+#ifndef _CONTAINERIZED
         perfMonHook.setown(roxiemem::createRoxieMemStatsPerfMonHook());  // Note - we create even if pinterval is 0, as can be enabled via control message
+        unsigned pinterval = topology->getPropInt("@systemMonitorInterval",1000*60);
         if (pinterval)
             startPerformanceMonitor(pinterval, PerfMonStandard, perfMonHook);
-
+#endif
 
         topology->getProp("@pluginDirectory", pluginDirectory);
         StringBuffer packageDirectory;
@@ -1393,13 +1397,14 @@ int CCD_API roxie_main(int argc, const char *argv[], const char * defaultYaml)
                 E->Release();
             }
         }
+        DBGLOG("Roxie closing down");
         shuttingDown = true;
         if (pingInterval)
             stopPingTimer();
         setSEHtoExceptionHandler(NULL);
         while (socketListeners.isItem(0))
         {
-            socketListeners.item(0).stop(1000);
+            socketListeners.item(0).stop();
             socketListeners.remove(0);
         }
         packetDiscarder->stop();
@@ -1419,7 +1424,9 @@ int CCD_API roxie_main(int argc, const char *argv[], const char * defaultYaml)
     }
 
     roxieMetrics.clear();
+#ifndef _CONTAINERIZED
     stopPerformanceMonitor();
+#endif
     ::Release(globalPackageSetManager);
     globalPackageSetManager = NULL;
     stopDelayedReleaser();
@@ -1430,7 +1437,9 @@ int CCD_API roxie_main(int argc, const char *argv[], const char * defaultYaml)
     releaseRoxieStateCache();
     setDaliServixSocketCaching(false);  // make sure it cleans up or you get bogus memleak reports
     setNodeCaching(false); // ditto
+#ifndef _CONTAINERIZED
     perfMonHook.clear();
+#endif
     stopAeronDriver();
     stopTopoThread();
 
@@ -1460,6 +1469,9 @@ int CCD_API roxie_main(int argc, const char *argv[], const char * defaultYaml)
     return 0;
 }
 
+// These defaults only apply when roxie is linked into a standalone executable
+// Note that the defaults for roxie executable (in whatever mode) are set in roxie.cpp
+
 static constexpr const char * standaloneDefaultYaml = R"!!(
 version: "1.0"
 roxie:

+ 6 - 5
roxie/ccd/ccdprotocol.cpp

@@ -128,7 +128,7 @@ public:
         started.wait();
     }
 
-    virtual bool stop(unsigned timeout)
+    virtual bool stop()
     {
         if (running)
         {
@@ -136,7 +136,7 @@ public:
             join();
             Release();
         }
-        return pool->joinAll(false, timeout);
+        return pool->joinAll(false);
     }
 
     void setThreadAffinity(int numCores)
@@ -244,11 +244,11 @@ public:
         return sink.get();
     }
 
-    virtual bool stop(unsigned timeout)
+    virtual bool stop()
     {
         if (socket)
             socket->cancel_accept();
-        return ProtocolListener::stop(timeout);
+        return ProtocolListener::stop();
     }
 
     virtual void disconnectQueue()
@@ -398,7 +398,8 @@ public:
 
     virtual bool stop() override
     {
-        IERRLOG("RoxieQueryWorker stopped with queries active");
+        if (traceLevel)
+            DBGLOG("RoxieQueryWorker thread stop requested with query active - ignoring");
         return true;
     }
 

+ 2 - 1
roxie/ccd/ccdqueue.cpp

@@ -848,7 +848,8 @@ void doUnload(IRoxieQueryPacket *packet, const IRoxieContextLogger &logctx)
 {
     const RoxiePacketHeader &header = packet->queryHeader();
     unsigned channelNo = header.channel;
-    logctx.CTXLOG("Unload received for channel %d", channelNo);
+    if (logctx.queryTraceLevel())
+        logctx.CTXLOG("Unload received for channel %d", channelNo);
     hash64_t hashValue = header.queryHash;
     hashValue = rtlHash64Data(sizeof(channelNo), &channelNo, hashValue);
     SpinBlock b(onDemandQueriesCrit);

+ 4 - 0
roxie/ccd/ccdstate.cpp

@@ -2743,12 +2743,16 @@ private:
             }
             else if (stricmp(queryName, "control:systemMonitor")==0)
             {
+#ifndef _CONTAINERIZED
                 unsigned interval = control->getPropInt("@interval", 60000);
                 bool enable = control->getPropBool("@enable", true);
                 if (enable)
                     startPerformanceMonitor(interval, PerfMonStandard, perfMonHook);
                 else
                     stopPerformanceMonitor();
+#else
+                UNIMPLEMENTED; //better than ignoring 'control:systemMonitor' in containerized mode
+#endif
             }
             //MORE: control:stats??
             else

+ 1 - 1
roxie/ccd/hpccprotocol.hpp

@@ -110,7 +110,7 @@ interface IHpccProtocolListener : extends IInterface
     virtual const SocketEndpoint &queryEndpoint() const = 0;
 
     virtual void start() = 0;
-    virtual bool stop(unsigned timeout) = 0;
+    virtual bool stop() = 0;
     virtual void stopListening() = 0;
     virtual void disconnectQueue() = 0;
 

+ 15 - 6
roxie/roxiemem/roxiemem.cpp

@@ -348,7 +348,8 @@ static void initializeHeap(bool allowHugePages, bool allowTransparentHugePages,
                     //If we notify heapBlockSize items at a time it will always be a multiple of hugePageSize so shouldn't trigger defragmentation
                     heapNotifyUnusedEachBlock = !retainMemory;
                 }
-                DBGLOG("Transparent huge pages used for roxiemem heap");
+                if (memTraceLevel)
+                    DBGLOG("Transparent huge pages used for roxiemem heap");
             }
         }
         else
@@ -356,21 +357,29 @@ static void initializeHeap(bool allowHugePages, bool allowTransparentHugePages,
             if (!allowTransparentHugePages)
             {
                 madvise(heapBase,memsize,MADV_NOHUGEPAGE);
-                DBGLOG("Transparent huge pages disabled in configuration by user.");
+                if (memTraceLevel)
+                    DBGLOG("Transparent huge pages disabled in configuration by user.");
             }
-            else
+            else if (memTraceLevel)
                 DBGLOG("Transparent huge pages unsupported or disabled by system.");
         }
 #else
-        DBGLOG("Transparent huge pages are not supported on this kernel.  Requires kernel version > 2.6.38.");
+        if (memTraceLevel)
+            DBGLOG("Transparent huge pages are not supported on this kernel.  Requires kernel version > 2.6.38.");
 #endif
     }
 #endif
 
     if (heapNotifyUnusedEachFree)
-        DBGLOG("Memory released to OS on each %uk 'page'", (unsigned)(HEAP_ALIGNMENT_SIZE/1024));
+    {
+        if (memTraceLevel)
+            DBGLOG("Memory released to OS on each %uk 'page'", (unsigned)(HEAP_ALIGNMENT_SIZE/1024));
+    }
     else if (heapNotifyUnusedEachBlock)
-        DBGLOG("Memory released to OS in %uk blocks", (unsigned)(HEAP_ALIGNMENT_SIZE*HEAP_BITS/1024));
+    {
+        if (memTraceLevel)
+            DBGLOG("Memory released to OS in %uk blocks", (unsigned)(HEAP_ALIGNMENT_SIZE*HEAP_BITS/1024));
+    }
     else
     {
         DBGLOG("MEMORY WILL NOT BE RELEASED TO OS");

+ 1 - 1
roxie/udplib/udpipmap.hpp

@@ -111,7 +111,7 @@ private:
 
 template<class T> T &IpMapOf<T>::lookup(const ServerIdentifier &ip) const
 {
-   unsigned hash = ip.hash() & 0xff;
+   unsigned hash = ip.fasthash() & 0xff;
    for (;;)
    {
        const list *head = table[hash].load(std::memory_order_acquire);

+ 5 - 1
roxie/udplib/udplib.hpp

@@ -61,7 +61,11 @@ public:
     }
     unsigned hash() const
     {
-        return netAddress;
+        return hashc((const byte *)&netAddress,sizeof(netAddress),0);
+    }
+    unsigned fasthash() const
+    {
+        return netAddress >> 24;
     }
     inline void setIp(const IpAddress &_ip)
     {

+ 0 - 1
roxie/udplib/udptopo.cpp

@@ -382,7 +382,6 @@ void TopologyManager::setServers(const StringArray &_topoServers)
 void TopologyManager::setRoles(const std::vector<RoxieEndpointInfo> &myRoles)
 {
     topoBuf.clear();
-    DBGLOG("TopologyManager::setRoles - %d roles", (int) myRoles.size());
     for (const auto &role : myRoles)
     {
         switch (role.role)

+ 1 - 3
system/jlib/jsocket.cpp

@@ -1615,9 +1615,7 @@ int CSocket::logPollError(unsigned revents, const char *rwstr)
     }
     else if (revents & POLLNVAL)
     {
-        StringBuffer errStr;
-        errStr.appendf("%s POLLNVAL", rwstr);
-        LOGERR2(999,3,errStr.str());
+        // These are typically expected - when closing a socket in order to interrupt a thread that waits on it, for example
     }
     else
     {

+ 3 - 1
system/jlib/jthread.cpp

@@ -25,6 +25,7 @@
 #include "jqueue.tpp"
 #include "jregexp.hpp"
 #include "jlog.ipp"
+#include "jisem.hpp"
 #include <assert.h>
 #ifdef _WIN32
 #include <process.h>
@@ -319,7 +320,8 @@ void Thread::handleException(IException *e)
     assertex(exceptionHandlers);
     if (exceptionHandlers->ordinality() == 0)
     {
-        PrintExceptionLog(e,getName());
+        if (!dynamic_cast<InterruptedSemaphoreException *>(e))
+            PrintExceptionLog(e,getName());
         //throw; // don't rethrow unhandled, preferable over alternative of causing process death
         e->Release();
     }

+ 1 - 1
testing/unittests/unittests.cpp

@@ -259,7 +259,7 @@ int main(int argc, char* argv[])
 
     objects.kill();
     ExitModuleObjects();
-    return wasSuccessful;
+    return wasSuccessful ? 0 : 1; // 0 == exit code success
 }
 
 

+ 2 - 1
thorlcr/activities/diskread/thdiskreadslave.cpp

@@ -1175,7 +1175,8 @@ public:
             {
                 distributor->disconnect(true);
                 distributor->join();
-            }            
+            }
+            aggregateStream.clear();
         }
         PARENT::stop();
     }

+ 5 - 0
thorlcr/activities/hashdistrib/thhashdistribslave.cpp

@@ -4297,6 +4297,7 @@ IRowStream *mergeLocalAggs(Owned<IHashDistributor> &distributor, CSlaveActivity
         IHThorRowAggregator &helper;
         IHashDistributor &distributor;
         CSlaveActivity &activity;
+        bool stopped = false;
     public:
         CAggregatingStream(IHThorRowAggregator &_helper, IEngineRowAllocator &_rowAllocator, ICompare &_cmp, IHashDistributor &_distributor, CSlaveActivity &_activity)
             : helper(_helper), rowAllocator(_rowAllocator), cmp(_cmp), distributor(_distributor), rowBuilder(_rowAllocator), activity(_activity)
@@ -4304,6 +4305,7 @@ IRowStream *mergeLocalAggs(Owned<IHashDistributor> &distributor, CSlaveActivity
         }
         void start(IRowStream *_input)
         {
+            stopped = false;
             input.setown(_input);
         }
         // IRowStream
@@ -4344,6 +4346,9 @@ IRowStream *mergeLocalAggs(Owned<IHashDistributor> &distributor, CSlaveActivity
         }
         virtual void stop() override
         {
+            if (stopped)
+                return;
+            stopped = true;
             sz = 0;
             rowBuilder.clear();
             input->stop();

+ 4 - 1
thorlcr/master/thmastermain.cpp

@@ -1012,10 +1012,11 @@ int main( int argc, const char *argv[]  )
 
             writeSentinelFile(sentinelFile);
 
+#ifndef _CONTAINERIZED
             unsigned pinterval = globals->getPropInt("@system_monitor_interval",1000*60);
             if (pinterval)
                 startPerformanceMonitor(pinterval, PerfMonStandard, nullptr);
-
+#endif
             // NB: workunit/graphName only set in one-shot mode (if isCloud())
             thorMain(logHandler, workunit, graphName);
             LOG(MCauditInfo, ",Progress,Thor,Terminate,%s,%s,%s",thorname,nodeGroup.str(),queueName.str());
@@ -1048,7 +1049,9 @@ int main( int argc, const char *argv[]  )
     thorEndHandler->start(30);
 
     PROGLOG("Thor closing down 5");
+#ifndef _CONTAINERIZED
     stopPerformanceMonitor();
+#endif
     disconnectLogMsgManagerFromDali();
     closeThorServerStatus();
     PROGLOG("Thor closing down 4");

+ 4 - 0
thorlcr/slave/thslavemain.cpp

@@ -528,9 +528,11 @@ int main( int argc, const char *argv[]  )
                     multiThorMemoryThreshold = 0;
             }
 
+#ifndef _CONTAINERIZED
             unsigned pinterval = globals->getPropInt("@system_monitor_interval",1000*60);
             if (pinterval)
                 startPerformanceMonitor(pinterval, PerfMonStandard, nullptr);
+#endif
 
 #ifdef _CONTAINERIZED
             class CServerThread : public CSimpleInterfaceOf<IThreaded>
@@ -581,7 +583,9 @@ int main( int argc, const char *argv[]  )
             FLLOG(MCexception(e), thorJob, e,"ThorSlave");
         unregisterException.setown(e);
     }
+#ifndef _CONTAINERIZED
     stopPerformanceMonitor();
+#endif
     ClearTempDirs();
 
     if (multiThorMemoryThreshold)