فهرست منبع

Merge pull request #15954 from jakesmith/HPCC-27439-job-leak-temp-issue

HPCC-27439 Ensure job done is sent to slaves after part failure

Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 3 سال پیش
والد
کامیت
5d4d0fbf24
2 فایل تغییر یافته به همراه 3 افزوده شده و 1 حذف شده
  1. 1 1
      thorlcr/graph/thgraphmaster.cpp
  2. 2 0
      thorlcr/slave/slavmain.cpp

+ 1 - 1
thorlcr/graph/thgraphmaster.cpp

@@ -1713,9 +1713,9 @@ void CJobMaster::sendQuery()
     compressToBuffer(msg, tmp.length(), tmp.toByteArray());
 
     CTimeMon queryToSlavesTimer;
+    querySent = true;
     broadcast(queryNodeComm(), msg, masterSlaveMpTag, LONGTIMEOUT, "sendQuery");
     PROGLOG("Serialization of query init info (%d bytes) to slaves took %d ms", msg.length(), queryToSlavesTimer.elapsed());
-    querySent = true;
 }
 
 void CJobMaster::jobDone()

+ 2 - 0
thorlcr/slave/slavmain.cpp

@@ -1916,6 +1916,8 @@ public:
                         StringAttr key;
                         msg.read(key);
                         CJobSlave *job = jobs.find(key.get());
+                        if (!job)
+                            throw makeStringException(0, "QueryDone: job not found"); // can happen if job failed during initialization on some slaves
                         StringAttr wuid = job->queryWuid();
                         StringAttr graphName = job->queryGraphName();