瀏覽代碼

Merge pull request #13554 from mckellyln/hpcc-23714

HPCC-23714 Thor master hangs on shutdown after an abort

Reviewed-By: Jake Smith <jake.smith@lexisnexis.com>
Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 5 年之前
父節點
當前提交
3c53b249d6
共有 3 個文件被更改,包括 12 次插入 和 2 次删除
  1. +5 -1
      thorlcr/graph/thgraphmaster.cpp
  2. +2 -0
      thorlcr/graph/thgraphmaster.ipp
  3. +5 -1
      thorlcr/master/thgraphmanager.cpp

+ 5 - 1
thorlcr/graph/thgraphmaster.cpp

@@ -1829,7 +1829,6 @@ bool CJobMaster::go()
         EXCLOG(e, NULL); 
         jobDoneException.setown(e);
     }
-    fatalHandler->clear();
     queryTempHandler()->clearTemps();
     slaveMsgHandler->stop();
     if (jobDoneException.get())
@@ -1970,6 +1969,11 @@ bool CJobMaster::fireException(IException *e)
     return true;
 }
 
+IFatalHandler *CJobMaster::clearFatalHandler()
+{
+    return fatalHandler.getClear();
+}
+
 // CJobMasterChannel
 
 CJobMasterChannel::CJobMasterChannel(CJobBase &job, IMPServer *mpServer, unsigned channel) : CJobChannel(job, mpServer, channel)

+ 2 - 0
thorlcr/graph/thgraphmaster.ipp

@@ -265,6 +265,8 @@ public:
     __int64 queryNodeDiskUsage(unsigned node);
     void setNodeDiskUsage(unsigned node, __int64 sz);
     bool queryCreatedFile(const char *file);
+
+    virtual IFatalHandler *clearFatalHandler();
 };
 
 class graphmaster_decl CJobMasterChannel : public CJobChannel

+ 5 - 1
thorlcr/master/thgraphmanager.cpp

@@ -891,6 +891,7 @@ bool CJobManager::executeGraph(IConstWorkUnit &workunit, const char *graphName,
     addJob(*job);
     bool allDone = false;
     Owned<IException> exception;
+    Owned<IFatalHandler> fatalHdlr;
     try
     {
         struct CounterBlock
@@ -921,7 +922,7 @@ bool CJobManager::executeGraph(IConstWorkUnit &workunit, const char *graphName,
         updateWorkunitStat(wu, SSTgraph, graphName, StTimeElapsed, graphTimeStr, graphTimeNs, wfid);
 
         addTimeStamp(wu, SSTgraph, graphName, StWhenFinished, wfid);
-        
+
         removeJob(*job);
     }
     catch (IException *e)
@@ -935,9 +936,12 @@ bool CJobManager::executeGraph(IConstWorkUnit &workunit, const char *graphName,
         setWuid(nullptr);
         throw exception.getClear();
     }
+    fatalHdlr.setown(job->clearFatalHandler());
     job.clear();
     PROGLOG("Finished wuid=%s, graph=%s", wuid.str(), graphName);
 
+    fatalHdlr->clear();
+
     setWuid(NULL);
     return allDone;
 }