Explorar o código

Merge pull request #1512 from jakesmith/swapnode-3.6-v3

gh-1412 - Make slave script independent of init

Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman hai 13 anos
pai
achega
fc0a0bede2

+ 4 - 3
initfiles/componentfiles/configxml/setvars_linux.xsl

@@ -75,9 +75,10 @@ export THORSLAVEPORT=6600
 export localthorportinc=<xsl:value-of select="@localThorPortInc"/>
 </xsl:if>
 export domain=<xsl:value-of select="$domainName"/>
-<xsl:if test="string(@slavesPerNode) != ''">
-export slavespernode=<xsl:value-of select="@slavesPerNode"/>
-</xsl:if>
+export slavespernode=<xsl:choose>
+<xsl:when test="string(@slavesPerNode) != ''"><xsl:value-of select="@slavesPerNode"/></xsl:when>
+<xsl:otherwise>1</xsl:otherwise>
+</xsl:choose>
 <xsl:if test="string(@multiSlaves) != ''">
 export multislaves=<xsl:value-of select="@multiSlaves"/>
 </xsl:if>

+ 15 - 39
initfiles/componentfiles/thor/start_slave

@@ -16,8 +16,8 @@
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ################################################################################
 
-deploydir=$1
-slavenum=$2
+prog=$1
+master=$2
 logpth=$3
 instancedir=$4
 slaveport=$5
@@ -33,33 +33,23 @@ source  ${INSTALL_DIR}/etc/init.d/hpcc_common
 which_pidof
 
 if [ $# -lt 8 ]; then
-  echo usage: $0 thordeploydir slaveno logdir workingdir slaveport hpcc_compname hpcc_setenv logredirect
+  echo usage: prog master logdir workingdir slaveport hpcc_compname hpcc_setenv logredirect
   exit 1
 fi
 
-sudo /etc/init.d/hpcc-init -c $hpcc_compname setup 2>/dev/null 2>/dev/null
-
+mkdir -p $instancedir
+mkdir -p `dirname $logredirect`
 exec >$logredirect 2>&1
 
 cd $instancedir
-. ./setvars
 
-slaveproc="thorslave_$THORSLAVEPORT"
-if [ "$multislaves" = "true" ] || [ "$localthor" = "true" ]; then
-    logpth=${logpth}_${slaveport}
-fi
+logpth=${logpth}_${slaveport}
 
-ln -s -f $deploydir/thorslave${LCR} $slaveproc
-export PATH=$PATH:$deploydir
 
-echo "slave $slavenum init `date`"
+echo "slave init `date`"
 
-lckfile="start_slave"
-if [ "$multislaves" = "true" ] || [ "$localthor" = "true" ]; then
-    lckfile=${lckfile}_${slaveport}
-fi
+lckfile="start_slave_${slaveport}.lck"
 
-lckfile="${lckfile}.lck"
 # prevent two slaves starting together
 while [ -e $lckfile ]; do
   echo waiting on lckfile: $lckfile
@@ -79,32 +69,18 @@ echo $$ > $lckfile
 ulimit -c unlimited
 ulimit -n 8192
 
+slaveproc="thorslave_$slaveport"
+ln -s -f `which $prog` $slaveproc
 
-if [ "$multislaves" != "true" ] && [ "$localthor" != "true" ]; then
-    # kill stubborn slave
-    oldpid=`${PIDOF} $slaveproc`
-    if [ -n "$oldpid" ]; then
-      # used hard kill as don't want unregistering
-      echo killing pid $oldpid $slaveproc
-      kill -9 $oldpid 
-    fi
-fi
-
-if [ "$localthor" != "true" ]; then
-    ln -s -f $deploydir/thorslave${LCR} $instancedir/$slaveproc
-fi
-
-echo "slave $slavenum starting `date`"
-
-export PATH=$PATH:$BINDIR
+echo "slave starting `date`"
 
-echo $instancedir/$slaveproc master=$THORMASTER:$THORMASTERPORT slave=.:$slaveport daliservers=$DALISERVER logDir=$logpth $remotedeploy
-$instancedir/$slaveproc master=$THORMASTER:$THORMASTERPORT slave=.:$slaveport daliservers=$DALISERVER logDir=$logpth $remotedeploy 2>/dev/null 1>/dev/null &
+echo $instancedir/$slaveproc master=$master slave=.:$slaveport logDir=$logpth
+$instancedir/$slaveproc master=$master slave=.:$slaveport logDir=$logpth 2>/dev/null 1>/dev/null &
 slavepid=$!
 echo $slavepid > $PID_NAME
 if [ "$slavepid" -eq "0" ]; then
-  echo "failed to start $slavenum at `date`"
+  echo "failed to start at `date`"
 else
-  echo "slave  $slavenum pid $slavepid started `date`"
+  echo "slave pid $slavepid started `date`"
 fi
 

+ 3 - 3
initfiles/componentfiles/thor/start_slaves

@@ -28,7 +28,7 @@ if [ "$localthor" = "true" ]; then
         if [ "$slaveport" = "" ]; then
             slaveport=$THORSLAVEPORT
         fi
-        $deploydir/start_slave $deploydir $n $logpth $instancedir $slaveport $THORNAME $PATH_PRE $logdir/start_slave_$logpthtail.$n.log
+        $deploydir/start_slave thorslave${LCR} $THORMASTER:$THORMASTERPORT $logpth $instancedir $slaveport $THORNAME $PATH_PRE $logdir/start_slave_$logpthtail.$n.log
         let "n += 1";
         done
 else
@@ -42,7 +42,7 @@ else
                     slaveport=$THORSLAVEPORT
                 fi
                 logredirect="$logdir/start_slave_$logpthtail.$n.log"
-                frunssh $ip "/bin/sh -c '$deploydir/start_slave $deploydir $n $logpth $instancedir $slaveport $THORNAME $PATH_PRE $logredirect'" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries 2>&1
+                frunssh $ip "/bin/sh -c '$deploydir/start_slave thorslave${LCR} $THORMASTER:$THORMASTERPORT $logpth $instancedir $slaveport $THORNAME $PATH_PRE $logredirect'" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries 2>&1
                 let "n += 1";
             done
         else
@@ -50,7 +50,7 @@ else
                 mv $instancedir/thorgroup $instancedir/thorgroup.local
             fi
             logredirect="$logdir/start_slave_$logpthtail.log"
-            frunssh $instancedir/slaves "/bin/sh -c '$deploydir/start_slave $deploydir %n $logpth $instancedir $THORSLAVEPORT $THORNAME $PATH_PRE $logredirect'" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries 2>&1
+            frunssh $instancedir/slaves "/bin/sh -c '$deploydir/start_slave thorslave${LCR} $THORMASTER:$THORMASTERPORT $logpth $instancedir $THORSLAVEPORT $THORNAME $PATH_PRE $logredirect'" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries 2>&1
         fi
 fi
 echo thorslaves started

+ 31 - 34
thorlcr/slave/thslavemain.cpp

@@ -215,7 +215,7 @@ void startSlaveLog()
     ep.getUrlStr(fileName);
     fileName.append("_").append(getMachinePortBase());
 
-    Owned<IComponentLogFileCreator> lf = createComponentLogFileCreator(globals, "thor");
+    Owned<IComponentLogFileCreator> lf = createComponentLogFileCreator(globals->queryProp("@logDir"), "thor");
     lf->setCreateAliasFile(false);
     lf->setName(fileName.str());//override default filename
     lf->setMsgFields(MSGFIELD_timeDate | MSGFIELD_msgID | MSGFIELD_process | MSGFIELD_thread | MSGFIELD_code);
@@ -288,40 +288,8 @@ int main( int argc, char *argv[]  )
         slfEp.port = getMachinePortBase();
         startSlaveLog();
 
-#define ISDALICLIENT // JCSMORE plugins *can* access dali - though I think we should probably prohibit somehow.
-#ifdef ISDALICLIENT
-        const char *daliServers = globals->queryProp("@DALISERVERS");
-        if (!daliServers)
-        {
-            LOG(MCerror, thorJob, "No Dali server list specified\n");
-            return 1;
-        }
-        Owned<IGroup> serverGroup = createIGroup(daliServers, DALI_SERVER_PORT);
-        unsigned retry = 0;
-        loop {
-            try {
-                LOG(MCdebugProgress, thorJob, "calling initClientProcess");
-                initClientProcess(serverGroup,DCR_ThorSlave, getFixedPort(TPORT_mp));
-                break;
-            }
-            catch (IJSOCK_Exception *e) { 
-                if ((e->errorCode()!=JSOCKERR_port_in_use))
-                    throw;
-                FLLOG(MCexception(e), thorJob, e,"InitClientProcess");
-                if (retry++>10) 
-                    throw;
-                e->Release();
-                LOG(MCdebugProgress, thorJob, "Retrying");
-                Sleep(retry*2000);  
-            }
-        }
-        setPasswordsFromSDS();
-#else
         startMPServer(getFixedPort(TPORT_mp));
-#endif
-
 #ifdef USE_MP_LOG
-        startMPServer(getFixedPort(TPORT_mp));
         startLogMsgParentReceiver();
         LOG(MCdebugProgress, thorJob, "MPServer started on port %d", getFixedPort(TPORT_mp));
 #endif
@@ -332,6 +300,35 @@ int main( int argc, char *argv[]  )
         markNodeCentral(masterEp);
         if (RegisterSelf(masterEp))
         {
+#define ISDALICLIENT // JCSMORE plugins *can* access dali - though I think we should probably prohibit somehow.
+#ifdef ISDALICLIENT
+            const char *daliServers = globals->queryProp("@DALISERVERS");
+            if (!daliServers)
+            {
+                LOG(MCerror, thorJob, "No Dali server list specified\n");
+                return 1;
+            }
+            Owned<IGroup> serverGroup = createIGroup(daliServers, DALI_SERVER_PORT);
+            unsigned retry = 0;
+            loop {
+                try {
+                    LOG(MCdebugProgress, thorJob, "calling initClientProcess");
+                    initClientProcess(serverGroup,DCR_ThorSlave, getFixedPort(TPORT_mp));
+                    break;
+                }
+                catch (IJSOCK_Exception *e) {
+                    if ((e->errorCode()!=JSOCKERR_port_in_use))
+                        throw;
+                    FLLOG(MCexception(e), thorJob, e,"InitClientProcess");
+                    if (retry++>10)
+                        throw;
+                    e->Release();
+                    LOG(MCdebugProgress, thorJob, "Retrying");
+                    Sleep(retry*2000);
+                }
+            }
+            setPasswordsFromSDS();
+#endif
             StringBuffer thorPath;
             globals->getProp("@thorPath", thorPath);
             recursiveCreateDirectory(thorPath.str());
@@ -448,8 +445,8 @@ int main( int argc, char *argv[]  )
 
 #ifdef USE_MP_LOG
     stopLogMsgReceivers();
-    stopMPServer();
 #endif
+    stopMPServer();
     ::Release(globals);
     releaseAtoms(); // don't know why we can't use a module_exit to destruct these...