Kaynağa Gözat

HPCC-13058 Check frunssh failure in run_thor

Signed-off-by: Michael Gardner <Michael.Gardner@lexisnexis.com>
Michael Gardner 10 yıl önce
ebeveyn
işleme
317c88a7fc

+ 17 - 4
common/remote/rmtssh.cpp

@@ -513,6 +513,8 @@ public:
             return;
         if (slaves.ordinality()>1) {
             PROGLOG("Results: (%d of %d finished)",done.ordinality(),slaves.ordinality());
+            int errCode = 0;
+            Owned<IMultiException> multiException = MakeMultiException();
             for (unsigned i=0;i<done.ordinality();i++) {
                 unsigned n = done.item(i);
                 StringBuffer res(replytext.item(n));
@@ -520,19 +522,30 @@ public:
                     res.setLength(res.length()-1);
                 if (res.length()==0)
                     PROGLOG("%d: %s(%d): [OK]",n+1,slaves.item(n),reply.item(n));
-                else if (strchr(res.str(),'\n')==NULL)
+                else if (strchr(res.str(),'\n')==NULL) {
                     PROGLOG("%d: %s(%d): %s",n+1,slaves.item(n),reply.item(n),res.str());
-                else
+                    if (reply.item(n)) {
+                        errCode = reply.item(n);
+                        multiException->append(*MakeStringExceptionDirect(reply.item(n),res.str()));
+                    }
+                }
+                else {
                     PROGLOG("%d: %s(%d):\n---------------------------\n%s\n===========================",n+1,slaves.item(n),reply.item(n),res.str());
+                    if (reply.item(n)) {
+                        errCode = reply.item(n);
+                        multiException->append(*MakeStringExceptionDirect(reply.item(n),res.str()));
+                    }
+                }
             }
+            if (errCode)
+                throw multiException.getClear();
         }
         else {
             StringBuffer res(replytext.item(0));
             while (res.length()&&(res.charAt(res.length()-1)<=' '))
                 res.setLength(res.length()-1);
             PROGLOG("%s result(%d):\n%s",useplink?"plink":"ssh",reply.item(0),res.str());
-            if (res.length())
-            {
+            if (res.length()) {
                 int code = reply.item(0);
                 if (code == 0)
                     code = -1;

+ 8 - 0
initfiles/componentfiles/thor/run_thor

@@ -54,6 +54,14 @@ while [ 1 ]; do
     else
         nslaves=`cat $instancedir/uslaves.start | wc -l`
         $deploydir/frunssh $instancedir/uslaves.start "/bin/sh -c '$deploydir/start_slaves %a $THORMASTER $THORMASTERPORT $logdir $instancedir $deploydir $THORNAME $PATH_PRE $logredirect'" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries -n:$nslaves 2>&1
+        FRUNSSH_RC=$?
+        if [[ ${FRUNSSH_RC} -gt 0 ]]; then
+          echo "Error ${FRUNSSH_RC} in frunssh"
+          echo "Please check `dirname ${logdir}`/frunssh for more details"
+          # clean up any slaves it was able to reach
+          $deploydir/stop_thor $deploydir
+          exit 0
+        fi
     fi
 
     echo thormaster cmd : $instancedir/thormaster_$THORNAME MASTER=$THORMASTER:$THORMASTERPORT

+ 0 - 3
initfiles/componentfiles/thor/stop_slaves

@@ -20,9 +20,6 @@ hpcc_setenv=$2
 
 source ${hpcc_setenv}
 
-echo compName=${compName}
-echo PID=${PID}
-
 slavename=thorslave_${compName}
 
 killall -0 $slavename &> /dev/null