Browse Source

HPCC-9208 Correctly catch errors returned by pssh and prevent a core dump

Signed-off-by: Gavin Halliday <gavin.halliday@lexisnexis.com>
Gavin Halliday 12 years ago
parent
commit
6c4448d098
4 changed files with 22 additions and 3 deletions
  1. 3 0
      common/remote/rmtspawn.cpp
  2. 7 0
      common/remote/rmtssh.cpp
  3. 1 1
      dali/dfu/dfurun.cpp
  4. 11 2
      dali/ft/filecopy.cpp

+ 3 - 0
common/remote/rmtspawn.cpp

@@ -467,9 +467,12 @@ void CRemoteSlave::run(int argc, char * argv[])
                     msg.clear();
                     if (catchReadBuffer(masterSocket, msg, RMTTIME_RESPONSE_MASTER))
                     {
+                        LOG(MCdebugProgress, unknownJob, "Terminate acknowledgement received from master for slave %d", info.replyTag);
                         msg.read(ok);
                         assertex(ok);
                     }
+                    else
+                        LOG(MCdebugProgress, unknownJob, "No terminate acknowledgement received from master for slave %d", info.replyTag);
 
                     if (error)
                         break;

+ 7 - 0
common/remote/rmtssh.cpp

@@ -531,6 +531,13 @@ public:
             while (res.length()&&(res.charAt(res.length()-1)<=' '))
                 res.setLength(res.length()-1);
             PROGLOG("%s result(%d):\n%s",useplink?"plink":"ssh",reply.item(0),res.str());
+            if (res.length())
+            {
+                int code = reply.item(0);
+                if (code == 0)
+                    code = -1;
+                throw MakeStringExceptionDirect(code, res.str());
+            }
         }
     }
     void exec(

+ 1 - 1
dali/dfu/dfurun.cpp

@@ -166,7 +166,7 @@ class CDFUengine: public CInterface, implements IDFUengine
                 PROGLOG("ABORT notified");
             abort = true;
         }
-    } abortnotify;
+    };
 
 
     class cDFUlistener: public Thread

+ 11 - 2
dali/ft/filecopy.cpp

@@ -2529,8 +2529,14 @@ void FileSprayer::waitForTransferSem(Semaphore & sem)
             StringBuffer list;
             ForEachItemIn(i, transferSlaves)
                 transferSlaves.item(i).logIfRunning(list);
+
             if (timeSinceProgress>RESPONSE_TIME_TIMEOUT)
-                throwError1(RFSERR_TimeoutWaitSlave, list.str());
+            {
+                //Set an error - the transfer threads will check it after a couple of minutes, and then terminate gracefully
+                CriticalBlock lock(errorCS);
+                if (!error)
+                    error.setown(MakeStringException(RFSERR_TimeoutWaitSlave, RFSERR_TimeoutWaitSlave_Text, list.str()));
+            }
         }
     }
 }
@@ -2545,7 +2551,10 @@ void FileSprayer::addTarget(unsigned idx, INode * node)
 }
 
 bool FileSprayer::isAborting()
-{ 
+{
+    if (aborting || error)
+        return true;
+
     unsigned nowTick = msTick();
     if (abortChecker && (nowTick - lastAbortCheckTick >= abortCheckFrequency))
     {