Pārlūkot izejas kodu

HPCC-12923 Added sane logging/debugging to hpcc-init and component init files

Signed-off-by: Michael Gardner <Michael.Gardner@lexisnexis.com>
Michael Gardner 10 gadi atpakaļ
vecāks
revīzija
86f1f6cf78

+ 68 - 56
initfiles/bash/etc/init.d/hpcc-init.in

@@ -89,17 +89,13 @@ source  ${INSTALL_DIR}/etc/init.d/hpcc_common
 source  ${INSTALL_DIR}/etc/init.d/init-functions
 source  ${INSTALL_DIR}/etc/init.d/export-path
 
-# Only root user can write following HPCC_INIT_LOG
+# Only root user can write following logfile
 is_root
 
 [ ! -e ${LOG_DIR} ] && mkdir -p ${LOG_DIR}
-HPCC_INIT_LOG=${LOG_DIR}/hpcc-init.log
+export logfile=${LOG_DIR}/hpcc-init.log
 
-export PS4='+${BASH_SOURCE[1]} ${LINENO}: '
-[ -e ${HPCC_INIT_LOG}  ] && rm -rf ${HPCC_INIT_LOG}
-touch $HPCC_INIT_LOG
-exec 2> ${HPCC_INIT_LOG}
-set -x
+[ ! -e ${logfile}  ] && touch $logfile
 
 ## Debug variable allowing verbose debug output
 ##
@@ -109,13 +105,14 @@ VERBOSE=${VERBOSE:-0}
 COMP_BY_TYPE=${COMP_BY_TYPE:-0}
 DAFILESRV=${DAFILESRV:-0}
 
+
 set_environmentvars
 envfile=$configs/$environment
 
 # Know HPCC user after set_environmentvars; chown only if the owner differs
 log_dir_owner=$(ls -ld $LOG_DIR | awk '{print $3}')
 [ "${log_dir_owner}" != "${user}" ] && chown ${user}:${user} $LOG_DIR
-chown ${user}:${user} $HPCC_INIT_LOG
+chown ${user}:${user} $logfile
 
 #Sourcing the hpcc environment
 configgen_path=${path}/sbin
@@ -125,16 +122,21 @@ source ${configgen_path}/hpcc_setenv
 which_service
 get_commondirs
 
+log "--------------------------"
+log "--------------------------"
+
 #Check for existence of user
 check_user ${user}
 if [ $? -ne 1 ];then
-   echo "$user user does not exits on the system. Exiting ....."
+   log  "$user user does not exist on the system. Exiting ..."
+   echo "$user user does not exist on the system. Exiting ..."
    exit 3
 fi
 
 check_group ${group}
 if [ $? -ne 1 ];then
-  echo "Group for user ${group} does not exist on the system. Exiting....."
+   log  "Group for user ${group} does not exist on the system. Exiting ..."
+   echo "Group for user ${group} does not exist on the system. Exiting ..."
    exit 3
 fi 
 
@@ -148,7 +150,8 @@ COMPS=`${configgen_path}/configgen -env ${envfile} -list`
 comp.parser ${COMPS}
 
 if [ -z ${compArray} ];then
-   echo "There are no components configured to run on this node..."
+   log  "There are no components configured to run on this node ..."
+   echo "There are no components configured to run on this node ..."
    exit 3
 fi
 
@@ -203,20 +206,19 @@ done
 compList[0]=$compDali
 compTypeList[0]="dali"
 
-if [ ${DEBUG} != "NO_DEBUG" ]; then
-    for i in ${compList[@]};do
-        echo $i
-    done
+log "The following components have been located:"
+for i in ${compList[@]};do
+    log "---> $i"
+done
+log "--------------------------"
     
-fi
-
 #declaring all flags here
 isComp=0
 component=""
 runSetupOnly=0
 dafilesrvflag=0
 
-TEMP=`/usr/bin/getopt -o c:h --long help,componentlist,typelist -n 'hpcc-init' -- "$@"`
+TEMP=`/usr/bin/getopt -o c:hd --long help,componentlist,typelist,debug -n 'hpcc-init' -- "$@"`
 if [ $? != 0 ] ; then echo "Failure to parse commandline." >&2 ; exit 1 ; fi
 eval set -- "$TEMP"
 while true ; do
@@ -229,7 +231,7 @@ while true ; do
                 comp.getByType $comp
                 if [ -z $comp_return ]
                 then
-                    echo "Unknown component: $comp"
+                    log "Unknown component: $comp"
                     exit 1
                 fi
                 for (( i=0; i<=${compListLen}; i++ ));do
@@ -247,12 +249,14 @@ while true ; do
                 component=$comp
             fi
             shift 2 ;;
-        -h|--help) print_usage
-                   shift ;;
+        -d|--debug) DEBUG="DEBUG"
+                    shift ;;
+        -h|--help)  print_usage
+                    shift ;;
         --componentlist) print_components
-                   shift ;;
+                    shift ;;
         --typelist) print_types
-                   shift ;;
+                    shift ;;
         --) shift ; break ;;
         *) print_usage ;;
     esac
@@ -263,6 +267,10 @@ if [ -z $arg ] || [ $# -ne 1 ]; then
     print_usage
 fi
 
+log "Debug log written to $LOG_DIR/hpcc-init.debug"
+exec 2>$LOG_DIR/hpcc-init.debug
+set -x
+
 if [ -z ${component} ]; then
     for (( i=0; i<=${compListLen}; i++ ));do
         component="$component ${compList[$i]}"
@@ -299,10 +307,13 @@ case "$arg" in
         ;;
 esac
 
+log "Attempting to execute ${cmd} argument on specified components"
+
 unset IFS
 
 # Create dropzone on a full system start
 if [ ${cmd} = "start" ] || [ "${cmd}" = "restart" ]; then
+    log "Creating dropzone"
     create_dropzone
 fi
 
@@ -312,8 +323,11 @@ if [ ! -z "${compDafilesrv}" ];then
         start)
             /etc/init.d/dafilesrv status 1>/dev/null 2>/dev/null
             if [ $? -ne 0 ];then
+              log "--------------------------"
+              log "${compDafilesrv} ---> ${cmd}"
               /etc/init.d/dafilesrv $1 2>/dev/null
             else
+              log  "Dependent service dafilesrv, ${compDafilesrv} is already running."
               echo "Dependent service dafilesrv, ${compDafilesrv} is already running."
             fi
             ;;
@@ -327,6 +341,8 @@ fi
 
 # Restart handling for entire system
 if [ ${cmd} = "restart" ] && [ "${isComp}" -eq 0 ]; then
+    log "Stopping entire system for a full restart"
+    log "--------------------------"
     echo "*****************************************"
     echo "Stopping entire system for a full restart"
     echo "*****************************************"
@@ -338,33 +354,33 @@ if [ ${cmd} = "restart" ] && [ "${isComp}" -eq 0 ]; then
         fi
         set_componentvars ${compList[$i]}
         xcmd="${cmd}_component ${compList[$i]}"
-        if [ ${DEBUG} != "NO_DEBUG" ]; then
-            echo $xcmd
-        fi
+        log "--------------------------"
+        log "${compName} ---> ${cmd}"
         if strstr ${compType} "thor" && [ ${foundThorSlave} -eq 1 ];
         then
-            if [ ${DEBUG} != "NO_DEBUG" ]; then
-               echo "Thor slave found on the node, hence skipping the component "
-            fi
+            log "Thor slave found on the node, hence skipping the component "
             continue
         elif strstr ${compType} "dafilesrv" ;then
-            if [ ${DEBUG} != "NO_DEBUG" ]; then
-               echo "skipping the component ${compName}"
-            fi
+            log "skipping the component ${compName}"
             continue;
         else
           eval $xcmd
           statForEach=$?
+          log "${xcmd} ---> Exit status ${statForEach}"
           statForStop=$(( ${statForStop} == 3 ? ${statForEach} : ${statForStop} ))
         fi
     done 
 
+    log "Starting the entire System"
+    log "--------------------------"
     echo "***************************************************"
     echo "Starting the entire System"
     echo "***************************************************"
     cmd=start
     /etc/init.d/dafilesrv status 1>/dev/null 2>/dev/null
     if [ $? -ne 0 ];then
+        log "--------------------------"
+        log "${compDafilesrv} ---> ${cmd}"
         /etc/init.d/dafilesrv $1 2>/dev/null
     fi
 
@@ -375,19 +391,17 @@ if [ ${cmd} = "restart" ] && [ "${isComp}" -eq 0 ]; then
         fi
         set_componentvars ${compList[$i]}
         xcmd="${cmd}_component ${compList[$i]}"
-        if [ ${DEBUG} != "NO_DEBUG" ]; then
-            echo $xcmd
-        fi
+        log "--------------------------"
+        log "${compName} ---> ${cmd}"
         if strstr ${compType} "thor" && [ ${foundThorSlave}  -eq 1 ]; 
         then
-            if [ ${DEBUG} != "NO_DEBUG" ]; then
-               echo "Thor slave found on the node, hence just running the setup for thor"
-            fi
-            setup_component 
+            log "Thor slave found on the node, hence just running the setup for thor"
+            setup_component
             continue
         else
            eval $xcmd
            statForEach=$?
+           log "${xcmd} ---> Exit status ${statForEach}"
            statForStart=$(( ${statForStart} == 0 ? ${statForEach} : ${statForStart} ))
         fi
     done
@@ -405,38 +419,36 @@ fi
 STATUS=0
 for C in ${component} ; do
     if [ -z "${C}" ];then
-          continue;
+      continue;
     fi
     set_componentvars ${C}
     xcmd="${cmd}_component ${C}"
-    if [ ${DEBUG} != "NO_DEBUG" ]; then
-        echo $xcmd
-    fi
     if strstr ${compType} "thor" && [ ${foundThorSlave} -eq 1 ] && [ "${cmd}" != "status" ] && [ "${cmd}" != "setup" ]; then
-        if [ ${DEBUG} != "NO_DEBUG" ]; then
-            echo "Thor slave found on the node, hence just running the setup for thor"
-        fi
-        setup_component 
-        continue
+      log "Thor slave found on the node, hence just running the setup for thor"
+      setup_component
+      continue
     else
-        eval $xcmd
-        statForEach=$?
-        STATUS=$(( $STATUS == 0 ? $statForEach : $STATUS))
+      log "--------------------------"
+      log "${compName} ---> ${cmd}"
+      eval $xcmd
+      statForEach=$?
+      log "${cmd}_component ${C} ---> Exit status ${statForEach}"
+      STATUS=$(( $STATUS == 0 ? $statForEach : $STATUS))
     fi
 done 
 
 
 if [ "$cmd" = "stop" ] && [ -n "$compDafilesrv" ]
 then
-     echo 
     /etc/init.d/dafilesrv status 1>/dev/null 2>/dev/null
     if [ $? -ne 0 ];then
-       echo "Service dafilesrv, ${compDafilesrv} is already stopped.".
+      log  "Service dafilesrv, ${compDafilesrv} is already stopped."
+      echo "Service dafilesrv, ${compDafilesrv} is already stopped."
     else
-       echo "Service dafilesrv, ${compDafilesrv} is still running".
-       echo "To stop it, run \"service dafilesrv stop\"."
+      log  "Service dafilesrv, ${compDafilesrv} is still running."
+      echo "Service dafilesrv, ${compDafilesrv} is still running."
+      echo "To stop it, run \"service dafilesrv stop\"."
     fi
-    echo 
 
 fi
 exit ${STATUS}

+ 53 - 111
initfiles/bash/etc/init.d/hpcc_common.in

@@ -251,26 +251,10 @@ set_environmentvars() {
     ## use default of "DEFAULT"
     ##
     SECTION=${SECTION:-DEFAULT}
-    DEBUG=${DEBUG:-NO_DEBUG}
 
     cfg.parser ${HPCC_CONFIG}
     cfg.section.${SECTION}
 
-    if [ ${DEBUG} != "NO_DEBUG" ]; then
-        echo "\$runtime=$runtime"
-        echo "\$path=$path"
-        echo "\$configs=$configs"
-        echo "\$configsbackup=$configsbackup"
-        echo "\$user=$user"
-        echo "\$lock=$lock"
-        echo "\$pid=$pid"
-        echo "\$log=$log"
-        echo "\$environment=$environment"
-        echo "\$interface=$interface"
-        echo "\$autodetectipscript=$autodetectipscript"
-        echo
-    fi
-
     if [ -n "${umask}" ]; then
         umask $umask
     fi
@@ -298,15 +282,10 @@ configGenCmd() {
 
     # Creating logfiles for component
     logDir=$log/${compName}
-    logFile=$log/${compName}/init_${compName}.log
 
     configcmd="${configgen_path}/configgen -env ${envfile} -od ${runtime} -id ${componentFile} -c ${compName}"
-    if [ ${DEBUG} != "NO_DEBUG" ]; then
-        echo $configcmd
-    else
-        echo $configcmd >> $logFile
-    fi
-    su ${user} -c "$configcmd" >> $logFile 2>&1
+    log "$configcmd"
+    su ${user} -c "$configcmd" 2>/dev/null
 }
 
 createRuntime() {
@@ -337,31 +316,23 @@ createRuntime() {
     # Creating Component Specific directories
     # Creating pidfile specific directory and changing its owner permissions
     if [ ! -d "$pid/$compName" ]; then
-        if [ ${DEBUG} != "NO_DEBUG" ]; then
-            echo "Creating Pidfile directory"
-        fi
+        log "Creating Pidfile Directory $pid/$compName"
         createDir "$pid/$compName"
     fi
 
     if [ ! -d "$lock/$compName" ]; then
-        if [ ${DEBUG} != "NO_DEBUG" ]; then
-            echo "Creating lockfile directory"
-        fi
+        log "Creating Lockfile Directory $lock/$compName"
         createDir "$lock/$compName"
     fi
 
     if [ ! -d "$log/$compName" ]; then
-        if [ ${DEBUG} != "NO_DEBUG" ]; then
-            echo "Creating log directory"
-        fi
+        log "Creating Log Directory $log/$compName"
         createDir "$log/$compName"
     fi
 
     # Creating runtime specific directory and changing its owner permissions
     if [ ! -d $compPath ]; then
-        if [ ${DEBUG} != "NO_DEBUG" ]; then
-            echo "Creating Runtime Directory for $compName"
-        fi
+        log "Creating Runtime Directory $compPath"
         createDir "$compPath"
     fi
 
@@ -394,17 +365,14 @@ start_dafilesrv() {
    /etc/init.d/dafilesrv status 1>/dev/null 2>/dev/null
    if [ $? -ne 0 ];then
       #Dafilesrv is not running, so start it; before starting, clean up the lock and pid files.
-      if [ ${DEBUG} != "NO_DEBUG" ]; then
-        log_failure_msg "Pid or lock file exists, but process is not running"
-      fi
       cleanupRuntimeEnvironment
-
       noStatusCheck=1
       /etc/init.d/dafilesrv setup 1>/dev/null 2>/dev/null
       startCmd ${compName} ${noStatusCheck}
       return $?
    else
-      printf "Starting %-21s" "$compName.... "
+      log "Component $compName already started ..."
+      printf "Starting %-21s" "$compName ..."
       log_success_msg "Already started"
       return 0
    fi
@@ -412,18 +380,14 @@ start_dafilesrv() {
 
 startCmd() {
     noStatusCheck=$2
-    printf "Starting %-21s" "$compName.... "
-    if [ ${DEBUG} != "NO_DEBUG" ]; then
-        echo "compName=$compName compPath=$compPath compProcessName=$compType"
-    fi
+    printf "Starting %-21s" "$compName ..."
+    log "compType = $compType"
 
     # use less heap when threaded
     export MALLOC_ARENA_MAX=8
 
     # Creating logfiles for component
     logDir=$log/${compName}
-    logFile=$log/${compName}/init_${compName}.log
-
 
     if [ ${noStatusCheck} -ne 1 ]; then
       check_status ${PIDPATH} ${LOCKPATH} ${COMPPIDPATH} 1
@@ -434,6 +398,7 @@ startCmd() {
       fi
       if [ ${RCSTART} -eq 0 ]; then
         #Since component is already started but current script is failed till returning 0
+        log "$compName ---> already started"
         log_success_msg "Already Started"
         return ${RCSTART}
       fi
@@ -469,29 +434,24 @@ startCmd() {
     fi
 
     EXEC_COMMAND="${bin_path}/init_${compType} "
-    startcmd="${START_STOP_DAEMON} -S -p ${pid}/init_${compName}.pid -c ${user}:${group} -d ${compPath} ${UMASK_ARG} -m -x ${EXEC_COMMAND} -b  >>${logFile} 2>&1"
-
-    issueTime=`date`
-    logCommand="COMMAND:: $startcmd  ::Issued at $issueTime "
-    echo $logCommand >> $logFile
+    startcmd="${START_STOP_DAEMON} -S -p ${pid}/init_${compName}.pid -c ${user}:${group} -d ${compPath} ${UMASK_ARG} -m -x ${EXEC_COMMAND} -b"
 
+    log "${startcmd}"
 
     # Creating a Lock
     lockPath=${lock}/${compName}
     if [ ! -d $lockPath ]; then
-        mkdir -p $lockPath >> $logFile 2>&1
+        mkdir -p $lockPath >>/dev/null 2>&1
     fi
     chown -c $user:$group $lockPath >> /dev/null 2>&1
     lock ${lock}/${compName}/${compName}.lock
 
     if [ $__lockCreated -eq 0 ]; then
+        log "Cannot create the lock file. File locked by subsystem"
         log_failure_msg "Cannot create the lock file, File locked by subsystem"
         return 3
     fi
 
-    if [ ${DEBUG} != "NO_DEBUG" ]; then
-        echo $startcmd
-    fi
     eval $startcmd
 
 
@@ -516,6 +476,7 @@ startCmd() {
           COMPONENT_HAS_STARTED=1
         else
           if [ ${COMPONENT_HAS_STARTED} -eq 1 ]; then
+            log "${compName} failed to start cleanly"
             log_failure_msg "${compName} failed to start cleanly"
             return 0;
           fi
@@ -525,6 +486,7 @@ startCmd() {
 
     if [ ${WAITTIME} -eq 0 ]; then
         log_timeout_msg
+        log "${compName} has timed out, but may still be starting"
     fi
 
     chmod 644 ${envfile}
@@ -543,21 +505,15 @@ stop_component() {
     check_status ${PIDPATH} ${LOCKPATH} ${COMPPIDPATH} 0
     RCSTOP=$?
     if [ $RCSTOP -ne 0 ];then
-       if [ ${DEBUG} != "NO_DEBUG" ]; then
-          log_success_msg "Process already stopped :: check_status code is ${RCSTOP}"
-       else
-          log_success_msg "Already stopped"
-       fi
-       cleanup_component
-       cleanupRuntimeEnvironment
-       return 0
+      log "Already stopped"
+      log_success_msg "Already stopped"
+      cleanup_component
+      cleanupRuntimeEnvironment
+      return 0
     fi
 
     stopcmd="${START_STOP_DAEMON} -K -p ${PIDPATH} >> tmp.txt 2>&1"
-
-    if [ ${DEBUG} != "NO_DEBUG" ]; then
-        echo "$stopcmd"
-    fi
+    log "$stopcmd"
 
     eval $stopcmd
 
@@ -598,27 +554,15 @@ stop_component() {
 
 
 start_component() {
-    if [ ${DEBUG} != "NO_DEBUG" ]; then
-        echo "comp_return = $comp_return"
-        echo "compName = $compName compPath = $compPath "
-        echo "path = $PATH"
-    fi
-
 
-    # Creating logfiles for component
+    # Creating logdirs for component
     logDir=$log/${compName}
-    logFile=$log/${compName}/init_${compName}.log
 
     if [ ! -d $logDir ]; then
         mkdir -p $logDir >> tmp.txt 2>&1
         chown -c $user:$group $logDir >> /dev/null 2>&1
     fi
 
-    if [ ! -f $logFile ]; then
-        touch $logFile >> tmp.txt 2>&1
-        chown -c $user:$group $logFile >> /dev/null 2>&1
-    fi
-
     # Creating Runtime
     createRuntime $compName $compPath
 
@@ -628,7 +572,7 @@ start_component() {
 
     if [ ${runSetupOnly} -ne 1 ]
     then
-        cd ${compPath} > $logFile 2>&1
+        cd ${compPath} >>/dev/null 2>&1
         startCmd ${compName} 0
         STAT=$?
     fi
@@ -637,27 +581,29 @@ start_component() {
 
 restart_component() {
     if strstr "${compType}" "dafilesrv" ;then
-       /etc/init.d/dafilesrv status 1>/dev/null 2>/dev/null
-       if [ $? -eq 0 ];then
-         /etc/init.d/dafilesrv stop 2>/dev/null
-       else
-           echo "Component $compName was not running. Will start it now for you ...."
-           cleanupRuntimeEnvironment
-       fi
-         /etc/init.d/dafilesrv start 2>/dev/null
+      /etc/init.d/dafilesrv status 1>/dev/null 2>/dev/null
+      if [ $? -eq 0 ];then
+        /etc/init.d/dafilesrv stop 2>/dev/null
+      else
+        log  "$compName ---> Stopped.  Now Starting ..."
+        echo "Component $compName was not running. Will start it now for you ..."
+        cleanupRuntimeEnvironment
+      fi
+      /etc/init.d/dafilesrv start 2>/dev/null
     else
-       check_status ${PIDPATH} ${LOCKPATH} ${COMPPIDPATH} 1
-       RCRESTART=$?
-       if [ $RCRESTART -ne 0 ];then
-           echo "Component $compName was not running. Will start it now for you ...."
-           cleanupRuntimeEnvironment
-       else
-           stop_component ${compName}
-       fi
-       start_component $compName
-       check_status ${PIDPATH} ${LOCKPATH} ${COMPPIDPATH} 1
-       RCRESTART=$?
-       return $RCRESTART
+      check_status ${PIDPATH} ${LOCKPATH} ${COMPPIDPATH} 1
+      RCRESTART=$?
+      if [ $RCRESTART -ne 0 ];then
+        log  "$compName ---> Stopped.  Now Starting ..."
+        echo "Component $compName was not running. Will start it now for you ..."
+        cleanupRuntimeEnvironment
+      else
+        stop_component ${compName}
+      fi
+      start_component $compName
+      check_status ${PIDPATH} ${LOCKPATH} ${COMPPIDPATH} 1
+      RCRESTART=$?
+      return $RCRESTART
     fi
 }
 
@@ -666,9 +612,11 @@ status_component() {
   RCSTATUS=$?
   getPid ${COMPPIDPATH}
   if [ ${RCSTATUS} -ne 0 ];then
+    log "${compName} ---> Stopped"
     printf "%-15s is stopped" "$compName"
   else
-    printf "%-15s ( pid %8s ) is running..." "${compName}" "${__pidValue}"
+    log "${compName} ---> Running ( pid ${__pidValue} )"
+    printf "%-15s ( pid %8s ) is running ..." "${compName}" "${__pidValue}"
   fi
   echo ""
   return ${RCSTATUS}
@@ -1001,23 +949,17 @@ cleanup_component() {
       WAITTIME=`expr ${WAITTIME} - 1`
       kill -0 -$pgid &> /dev/null
       if [ $? -ne 0 ];then
-        if [ ${DEBUG} != "NO_DEBUG" ];then
-          echo "${compName} orphans cleaned up"
-        fi
+        log "${compName} orphans cleaned up"
         RUNNING=0
         break;
       else
-        if [ ${DEBUG} != "NO_DEBUG" ]; then
-          echo "Waiting for ${compName} orphans to cleanup gracefully"
-        fi
+        log "Waiting for ${compName} orphans to cleanup gracefully"
         sleep 1
       fi
     done
 
     if [[ ${RUNNING} -eq 1 ]]; then
-      if [ ${DEBUG} != "NO_DEBUG" ]; then
-        echo "sending SIGKILL to ${compName} orphans"
-      fi
+      log "sending SIGKILL to ${compName} orphans"
       kill -9 -$pgid
     fi
   fi

+ 16 - 0
initfiles/bash/etc/init.d/init-functions

@@ -198,6 +198,22 @@ log_timeout_msg () {
     fi
 }
 
+# general logging message for init scripts
+# expects $logfile to exist within the context of where it's called
+log() {
+  if [[ -z ${logfile+x} ]]; then
+    # logfile isn't set within the context of this function call
+    return 1
+  fi
+
+  local msg=$@
+  local header=$( date --universal --iso-8601='seconds' )
+  local header="${header:0:19}: "
+  printf "%s%s\n" "$header" "$msg" >> $logfile
+
+  return 0
+}
+
 #log_success_msg () {
 #    if [ -n "${1:-}" ]; then
 #        log_begin_msg $@

+ 8 - 16
initfiles/bash/etc/init.d/lock.sh

@@ -21,13 +21,13 @@ checkLockDir () {
     LOCKPATH=$1
     #echo -n "Checking if Lock path exists "
     if [ -d $LOCKPATH ]; then
-        log_success_msg "$LOCKPATH ..."
+        log "$LOCKPATH ..."
     else 
-        log_failure_msg "$LOCKPATH ..."
-        echo "Creating Lock Path ..."
+        log "$LOCKPATH ..."
+        log "Creating Lock Path ..."
         /bin/mkdir $LOCKPATH
         if [ ! -d "$LOCKPATH" ]; then
-            log_failure_msg "Can not create Lock Path $LOCKPATH ..."
+            log "Can not create Lock Path $LOCKPATH ..."
         fi  
     fi
 
@@ -43,9 +43,7 @@ lock () {
     locked $FILE
     if [ $flagLocked -eq 1 ]; then
         __lockCreated=0
-        if [ ${DEBUG} != "NO_DEBUG" ]; then
-            log_failure_msg "Lock file $FILE already exists"
-        fi
+        log "Lock file $FILE already exists"
     else
         /bin/touch $FILE
         locked $FILE
@@ -53,9 +51,7 @@ lock () {
             #log_success_msg 
             __lockCreated=1
         else
-            if [ ${DEBUG} != "NO_DEBUG" ]; then
-                log_failure_msg "Failed to create file $FILE"
-            fi
+            log "Failed to create file $FILE"
             __lockCreated=0
         fi
     fi          
@@ -80,16 +76,12 @@ unlock () {
     FILE=$1
     #echo -n "Removing lock file $1 "
     if [ ! -e $FILE ]; then
-        if [ ${DEBUG} != "NO_DEBUG" ]; then
-            log_failure_msg "Lock file $FILE does not exist"
-        fi
+        log "Lock file $FILE does not exist"
         __lockRemoved=0
     else
         /bin/rm -rf $FILE
         if [ -e $FILE ]; then
-            if [ ${DEBUG} != "NO_DEBUG" ]; then
-                log_failure_msg "File $FILE can not be removed"
-            fi
+            log "File $FILE can not be removed"
             __lockRemoved=0
         else
             __lockRemoved=1

+ 15 - 6
initfiles/bash/etc/init.d/pid.sh

@@ -25,7 +25,7 @@ checkPidDir () {
         echo "Creating a Pid directory"
         /bin/mkdir -p ${PIDFILEPATH}
         if [[ ! -e ${PIDFILEPATH} ]]; then
-            echo "Can not create a Pid directory $PIDFILEPATH"
+            log  "Can not create a Pid directory $PIDFILEPATH"
         else
             log_success_msg
         fi
@@ -39,15 +39,17 @@ createPid () {
     checkPid ${PIDFILEPATH}
     if [[ $__flagPid -eq 1 ]]; then
         [[ ${DEBUG} != "NO_DEBUG" ]] && log_failure_msg "Pid file already exists"
+        log "Pid file already exists"
         __pidCreated=0
     else
         echo $PIDNO > ${PIDFILEPATH}
         checkPid ${PIDFILEPATH}
         if [[ $__flagPid -eq 1 ]]; then
-            [[ ${DEBUG} != "NO_DEBUG" ]] && log_success_msg 
+            [[ ${DEBUG} != "NO_DEBUG" ]] && log_success_msg
             __pidCreated=1
         else
             [[ ${DEBUG} != "NO_DEBUG" ]] && log_failure_msg "Failed to create Pid"
+            log "Failed to create Pid"
             __pidCreated=0
         fi
     fi
@@ -80,6 +82,7 @@ removePid () {
     checkPid ${PIDFILEPATH}
     if [[ $__flagPid -eq 0 ]]; then
         [[ ${DEBUG} != "NO_DEBUG" ]] && log_failure_msg "Pidfile doesn't exist"
+        log "Pid file doesn't exist"
         __pidRemoved=0
     else
         rm -rf ${PIDFILEPATH} > /dev/null 2>&1
@@ -87,11 +90,12 @@ removePid () {
             __pidRemoved=1
         else
             [[ ${DEBUG} != "NO_DEBUG" ]] && log_failure_msg "Failed to remove pid"
+            log "Failed to remove pid"
             __pidRemoved=0
         fi
     fi
 }
-    
+
 checkPidExist() {
     PIDFILEPATH=$1
     getPid ${PIDFILEPATH}
@@ -134,12 +138,15 @@ check_status() {
     # check if running and healthy
     if [[ $componentLocked -eq 1 ]] && [[ $initRunning -eq 1 ]] && [[ $compRunning -eq 1 ]]; then
         [[ ${DEBUG} != "NO_DEBUG" ]] && echo "everything is up except sentinel"
+        log "$compName ---> Waiting on Sentinel"
         if [[ ${SENTINELFILECHK} -eq 1 ]]; then
             if [[ ${sentinelFlag} -eq 0 ]]; then
                 [[ ${DEBUG} != "NO_DEBUG" ]] && echo "Sentinel not yet located, process currently unhealthy"
+                log "$compName ---> Currently Unhealthy"
                 return 2 
             fi
             [[ ${DEBUG} != "NO_DEBUG" ]] && echo "Sentinel is now up"
+            log "$compName ---> Sentinel Up"
         fi
         return 0
     # check if shutdown and healthy
@@ -147,16 +154,18 @@ check_status() {
         if [[ ${SENTINELFILECHK} -eq 1 ]]; then
             if [[ ${sentinelFlag} -eq 1 ]]; then
                 [[ ${DEBUG} != "NO_DEBUG" ]] && echo "Sentinel is up but orphaned"
+                log "$compName ---> Orphaned State"
                 return 3
             fi
             [[ ${DEBUG} != "NO_DEBUG" ]] && echo "Sentinel is now down"
+            log "$compName ---> Sentinel Down"
         fi
         return 1
     else
         if [[ "${DEBUG}" != "NO_DEBUG" ]]; then
-            [[ $componentLocked -eq 0 ]] && log_failure_msg "component is not locked: $2"
-            [[ $initRunning -eq 0 ]]     && log_failure_msg "process for ${compName}_init.pid is not running"
-            [[ $compRunning -eq 0 ]]     && log_failure_msg "process for ${compName}.pid is not running"
+            [[ $componentLocked -eq 0 ]] && log "$compName ---> component is not locked: $LOCKFILEPATH"
+            [[ $initRunning -eq 0 ]]     && log "$compName ---> process for init_${compName}.pid is not running"
+            [[ $compRunning -eq 0 ]]     && log "$compName ---> process for ${compName}.pid is not running"
         fi
         return 4
     fi

+ 15 - 1
initfiles/bin/init_configesp

@@ -19,29 +19,43 @@
 PATH_PRE=$(type -path hpcc_setenv)
 source ${PATH_PRE}
 PID_NAME="$PID/$(basename $PWD).pid"
+source ${INSTALL_DIR}/etc/init.d/hpcc_common
+
+component=$(basename $PWD)
+export logfile="${LOG_DIR}/${component}/init_${component}_$(date +%Y_%m_%d_%H_%M_%S).log"
 
 export SENTINEL="configesp.sentinel"
+log "Removing ---> ${SENTINEL}"
 rm -f ${SENTINEL}
 
 killed()
 {
+    log "attempting to kill $component"
     kill_process ${PID_NAME} configesp 3 ${SENTINEL}
+    if [[ $? -eq 1 ]]; then
+      log "could not kill $component"
+    else
+      log "Stopped $component"
+    fi
     exit 255
 }
 
 trap "killed" SIGINT SIGTERM SIGKILL
-
+log "calling configesp 1>/dev/null 2>/dev/null"
 nohup configesp 1>/dev/null 2>/dev/null &
 echo $! > $PID_NAME
 wait
+log "removing $PID_NAME"
 rm $PID_NAME
 
 while [[ -e ${SENTINEL} ]]; do
     sleep 5
     if [[ -e ${SENTINEL} ]]; then
+        log "calling configesp 1>/dev/null 2>/dev/null"
         nohup configesp 1>/dev/null 2>/dev/null &
         echo $! > $PID_NAME
         wait
+        log "removing $PID_NAME"
         rm $PID_NAME
     fi
 done

+ 22 - 5
initfiles/bin/init_dafilesrv.in

@@ -27,35 +27,52 @@ shift
 PATH_PRE=$(type -path hpcc_setenv)
 source ${PATH_PRE}
 PID_NAME="$PID/$(basename $PWD).pid"
-COMP_NAME="$(basename $PWD)"
+source ${INSTALL_DIR}/etc/init.d/hpcc_common
 
 # this must match jsocket hard limit
 export handlelimit=32768
 
+component=$(basename $PWD)
+export logfile="${LOG_DIR}/${component}/init_${component}_$(date +%Y_%m_%d_%H_%M_%S).log"
+
 export SENTINEL="dafilesrv.sentinel"
+log "Removing ${SENTINEL}"
 rm -f ${SENTINEL}
 
 killed()
 {
+    log "Attempting to kill $component"
     kill_process ${PID_NAME} dafilesrv 3 ${SENTINEL}
+    if [[ $? -eq 1 ]]; then
+      log "could not kill $component"
+    else
+      log "$component Stopped"
+    fi
     exit 255
 }
 
-ulimit -c unlimited
-ulimit -n $handlelimit
+log "Setting core and handle limit"
+ulimit -Sc hard
+[[ $? -ne 0 ]] && log "Failed to set core file limit"
+ulimit -Sn hard
+[[ $? -ne 0 ]] && log "Failed to set file descriptor limit"
 
 trap "killed" SIGINT SIGTERM SIGKILL
-dafilesrv -L $log -I ${COMP_NAME} &
+log "Calling dafilesrv -L $log -I $component &"
+dafilesrv -L $log -I $component &
 echo $! > $PID_NAME
 wait
+log "Removing $PID_NAME"
 rm $PID_NAME
 
 while [[ -e ${SENTINEL} ]]; do
     sleep 5
     if [[ -e ${SENTINEL} ]]; then
-        dafilesrv -L $log -I ${COMP_NAME} &
+        log "Calling dafilesrv -L $log -I $component &"
+        dafilesrv -L $log -I $component &
         echo $! > $PID_NAME
         wait
+        log "Removing $PID_NAME"
         rm $PID_NAME
     fi
 done

+ 16 - 0
initfiles/bin/init_dali

@@ -18,29 +18,45 @@
 PATH_PRE=$(type -path hpcc_setenv)
 source ${PATH_PRE}
 PID_NAME="$PID/$(basename $PWD).pid"
+source ${INSTALL_DIR}/etc/init.d/hpcc_common
+
+component=$(basename $PWD)
+export logfile="${LOG_DIR}/${component}/init_${component}_$(date +%Y_%m_%d_%H_%M_%S).log"
 
 export SENTINEL="daserver.sentinel"
+log "Removing $SENTINEL"
 rm -f ${SENTINEL}
 
 killed()
 {
+    log "calling dalistop ."
     dalistop .
+    log "Attempting to kill $component"
     kill_process ${PID_NAME} daserver 3 ${SENTINEL}
+    if [[ $? -eq 1 ]]; then
+      log "could not kill $component"
+    else
+      log "$component Stopped"
+    fi
     exit 255
 }
 
 trap "killed" SIGINT SIGTERM SIGKILL
+log "calling daserver 1>/dev/null 2>/dev/null &"
 daserver 1>/dev/null 2>/dev/null &
 echo $! > $PID_NAME
 wait
+log "$component returned, removing $PID_NAME"
 rm $PID_NAME
 
 while [[ -e ${SENTINEL} ]]; do
     sleep 5
     if [[ -e ${SENTINEL} ]]; then
+        log "calling daserver 1>/dev/null 2>/dev/null &"
         daserver 1>/dev/null 2>/dev/null &
         echo $! > $PID_NAME
         wait
+        log "$component returned, removing $PID_NAME"
         rm $PID_NAME
     fi
 done

+ 15 - 1
initfiles/bin/init_dfuserver

@@ -18,29 +18,43 @@
 PATH_PRE=$(type -path hpcc_setenv)
 source ${PATH_PRE}
 PID_NAME="$PID/$(basename $PWD).pid"
+source ${INSTALL_DIR}/etc/init.d/hpcc_common
+
+component=$(basename $PWD)
+export logfile="${LOG_DIR}/${component}/init_${component}_$(date +%Y_%m_%d_%H_%M_%S).log"
 
 export SENTINEL="dfuserver.sentinel"
+log "Removing ${SENTINEL}"
 rm -f ${SENTINEL}
 
 killed()
 {
+    log "Attempting to kill $component"
     kill_process ${PID_NAME} dfuserver 15 ${SENTINEL}
+    if [[ $? -eq 1 ]]; then
+      log "could not kill $component"
+    else
+      log "$component Stopped"
+    fi
     exit 255
 }
 
 trap "killed" SIGINT SIGTERM SIGKILL
-
+log "Calling dfuserver 1>/dev/null 2>/dev/null &"
 dfuserver 1>/dev/null 2>/dev/null &
 echo $! > $PID_NAME
 wait
+log "Removing $PID_NAME"
 rm $PID_NAME
 
 while [[ -e ${SENTINEL} ]]; do
     sleep 5
     if [[ -e ${SENTINEL} ]]; then
+        log "Calling dfuserver 1>/dev/null 2>/dev/null &"
         dfuserver 1>/dev/null 2>/dev/null &
         echo $! > $PID_NAME
         wait
+        log "Removing $PID_NAME"
         rm $PID_NAME
     fi
 done

+ 16 - 1
initfiles/bin/init_eclagent.in

@@ -20,31 +20,46 @@
 PATH_PRE=$(type -path hpcc_setenv)
 source ${PATH_PRE}
 PID_NAME="$PID/$(basename $PWD).pid"
+source ${INSTALL_DIR}/etc/init.d/hpcc_common
+
+component=$(basename $PWD)
+export logfile="${LOG_DIR}/${component}/init_${component}_$(date +%Y_%m_%d_%H_%M_%S).log"
 
 export SENTINEL="agentexec.sentinel"
+log "Removing $SENTINEL"
 rm -f ${SENTINEL}
 
+log "Removing $PID_DIR/hthortemp/*"
 rm -f ${PID_DIR}/hthortemp/*
 
 killed ()
 {
+    log "Attempting to kill $component"
     kill_process ${PID_NAME} agentexec 3 ${SENTINEL}
+    if [[ $? -eq 1 ]]; then
+      log "could not kill $component"
+    else
+      log "$component Stopped"
+    fi
     exit 255
 }
 
 trap "killed" SIGINT SIGTERM SIGKILL
-
+log "Calling agentexec 1>/dev/null 2>/dev/null &"
 agentexec 1>/dev/null 2>/dev/null &
 echo $! > $PID_NAME
 wait
+log "Removing $PID_NAME"
 rm $PID_NAME
 
 while [[ -e ${SENTINEL} ]]; do
     sleep 1
     if [[ -e ${SENTINEL} ]]; then
+        log "Calling agentexec 1>/dev/null 2>/dev/null &"
         agentexec 1>/dev/null 2>/dev/null &
         echo $! > $PID_NAME
         wait
+        log "Removing $PID_NAME"
         rm $PID_NAME
     fi
 done

+ 9 - 0
initfiles/bin/init_eclccserver

@@ -18,8 +18,13 @@
 PATH_PRE=$(type -path hpcc_setenv)
 source ${PATH_PRE}
 PID_NAME="$PID/$(basename $PWD).pid"
+source ${INSTALL_DIR}/etc/init.d/hpcc_common
+
+component=$(basename $PWD)
+export logfile="${LOG_DIR}/${component}/init_${component}_$(date +%Y_%m_%d_%H_%M_%S).log"
 
 export SENTINEL="eclccserver.sentinel"
+log "Removing $SENTINEL"
 rm -f ${SENTINEL}
 
 killed()
@@ -29,17 +34,21 @@ killed()
 }
 
 trap "killed" SIGINT SIGTERM SIGKILL
+log "Calling eclccserver 1>/dev/null 2>/dev/null &"
 eclccserver 1>/dev/null 2>/dev/null &
 echo $! > $PID_NAME
 wait
+log "Removing $PID_NAME"
 rm $PID_NAME
 
 while [[ -e ${SENTINEL} ]]; do
     sleep 5
     if [[ -e ${SENTINEL} ]]; then
+        log "Calling eclccserver 1>/dev/null 2>/dev/null &"
         eclccserver 1>/dev/null 2>/dev/null & 
         echo $! > $PID_NAME
         wait
+        log "Removing $PID_NAME"
         rm $PID_NAME
     fi
 done

+ 15 - 0
initfiles/bin/init_eclscheduler

@@ -18,28 +18,43 @@
 PATH_PRE=$(type -path hpcc_setenv)
 source ${PATH_PRE}
 PID_NAME="$PID/$(basename $PWD).pid"
+source ${INSTALL_DIR}/etc/init.d/hpcc_common
+
+component=$(basename $PWD)
+export logfile="${LOG_DIR}/${component}/init_${component}_$(date +%Y_%m_%d_%H_%M_%S).log"
 
 export SENTINEL="eclscheduler.sentinel"
+log "Removing $SENTINEL"
 rm -f ${SENTINEL}
 
 killed()
 {
+    log "Attempting to kill $component"
     kill_process ${PID_NAME} eclscheduler 3 ${SENTINEL}
+    if [[ $? -eq 1 ]]; then
+      log "could not kill $component"
+    else
+      log "$component Stopped"
+    fi
     exit 255
 }
 
 trap "killed" SIGINT SIGTERM SIGKILL
+log "Calling eclscheduler 1>/dev/null 2>/dev/null &"
 eclscheduler 1>/dev/null 2>/dev/null &
 echo $! > $PID_NAME
 wait
+log "Removing $PID_NAME"
 rm $PID_NAME
 
 while [[ -e ${SENTINEL} ]]; do
     sleep 5
     if [[ -e ${SENTINEL} ]]; then
+        log "Calling eclscheduler 1>/dev/null 2>/dev/null &"
         eclscheduler 1>/dev/null 2>/dev/null &
         echo $! > $PID_NAME
         wait
+        log "Removing $PID_NAME"
         rm $PID_NAME
     fi
 done

+ 11 - 1
initfiles/bin/init_esp

@@ -18,28 +18,38 @@
 PATH_PRE=$(type -path hpcc_setenv)
 source ${PATH_PRE}
 PID_NAME="$PID/$(basename $PWD).pid"
+source ${INSTALL_DIR}/etc/init.d/hpcc_common
+
+component=$(basename $PWD)
+export logfile="${LOG_DIR}/${component}/init_${component}_$(date +%Y_%m_%d_%H_%M_%S).log"
 
 export SENTINEL="esp.sentinel"
+log "Removing $SENTINEL"
 rm -f ${SENTINEL}
 
 SNMPID=$$
 
-killed() {
+# Signal handler: stop the esp process recorded in ${PID_NAME}, log the
+# outcome the same way the other component init scripts do, then exit 255.
+killed()
+{
+    log "Attempting to kill $component"
     kill_process ${PID_NAME} esp 15 ${SENTINEL}
+    # Sibling init scripts treat a return of 1 from kill_process as failure.
+    if [[ $? -eq 1 ]]; then
+      log "could not kill $component"
+    else
+      log "$component Stopped"
+    fi
     exit 255
 }
 
 trap "killed" SIGINT SIGTERM SIGKILL
+log "Calling esp snmpid=$SNMPID 1>/dev/null 2>/dev/null &"
 esp snmpid=$SNMPID 1>/dev/null 2>/dev/null &
 echo $! > $PID_NAME
 wait
+log "Removing $PID_NAME"
 rm $PID_NAME
 while [[ -e ${SENTINEL} ]]; do
     sleep 5
     if [[ -e ${SENTINEL} ]]; then
+        log "Calling esp snmpid=$SNMPID 1>/dev/null 2>/dev/null &"
         esp snmpid=$SNMPID 1>/dev/null 2>/dev/null &
         echo $! > $PID_NAME
         wait
+        log "Removing $PID_NAME"
         rm $PID_NAME
     fi
 done

+ 1 - 1
initfiles/bin/init_ftslave

@@ -15,4 +15,4 @@
 #    limitations under the License.
 ################################################################################
 
-# ftslave is executed on demand via ssh - no action needed at service start time
+# ftslave is executed on demand via ssh - no action needed at service start time

+ 28 - 7
initfiles/bin/init_roxie

@@ -18,17 +18,24 @@
 PATH_PRE=$(type -path hpcc_setenv)
 source ${PATH_PRE}
 PID_NAME="$PID/$(basename $PWD).pid"
+source ${INSTALL_DIR}/etc/init.d/hpcc_common
+
+component=$(basename $PWD)
+export logfile="${LOG_DIR}/${component}/init_${component}_$(date +%Y_%m_%d_%H_%M_%S).log"
 
 export SENTINEL="roxie.sentinel"
+log "Removing $SENTINEL"
 rm -f ${SENTINEL}
 
-if [ -n "$1" ]; then
+if [[ -n "$1" ]]; then
+    log "cd $1"
     cd $1
 fi
 
+log "sourcing roxievars"
 source ./roxievars
 
-if [ -n "$2" ]; then
+if [[ -n "$2" ]]; then
     logfilename=$2
 else
     logfilename="`date +%m_%d_%Y_%H_%M_%S`"
@@ -37,32 +44,46 @@ fi
 export LIBC_FATAL_STDERR_=1
 export restarts=0
 
-ulimit -c unlimited
-ulimit -n $NUM_ROXIE_HANDLES
+log "Setting limits for core and open file descriptors"
+ulimit -Sc hard > /dev/null 2>&1
+[[ $? -ne 0 ]] && log "Failed to set core file limit"
+ulimit -Sn hard > /dev/null 2>&1
+[[ $? -ne 0 ]] && log "Failed to set file descriptor limit"
 
 killed()
 {
     if [[ -n "$1" ]]; then
+        log "cd $1"
         cd $1
     fi
+    log "Attempting to kill $component"
     kill_process ${PID_NAME} roxie 3 ${SENTINEL}
+    if [[ $? -eq 1 ]]; then
+      log "could not kill $component"
+    else
+      log "$component Stopped"
+    fi
     exit 255
 }
 
 trap "killed" SIGINT SIGTERM SIGKILL
-nohup roxie --topology=RoxieTopology.xml --logfile=$logfilename --restarts=$restarts --stdlog=0 2>>$logfilename.stderr 1>>$logfilename.stdout &
+log "Calling nohup roxie --topology=RoxieTopology.xml --logfile --restarts=$restarts --stdlog=0 2>>$logfilename.stderr 1>>$logfilename.stdout &"
+nohup roxie --topology=RoxieTopology.xml --logfile --restarts=$restarts --stdlog=0 2>>$logfilename.stderr 1>>$logfilename.stdout &
 echo $! > $PID_NAME 
 wait
+log "Removing $PID_NAME"
 rm $PID_NAME
 
 # Automatically restart roxie when it dies
 while [[ -e ${SENTINEL} ]]; do
     export restarts=$(($restarts+1))
+    log "Restarting $restarts"
     echo Restarting $restarts >> $logfilename.stderr
     echo Restarting $restarts >> $logfilename.stdout
-    nohup roxie --topology=RoxieTopology.xml --logfile=$logfilename --restarts=$restarts --stdlog=0 2>>$logfilename.stderr 1>>$logfilename.stdout &
+    log "Calling nohup roxie --topology=RoxieTopology.xml --logfile --restarts=$restarts --stdlog=0 2>>$logfilename.stderr 1>>$logfilename.stdout &"
+    nohup roxie --topology=RoxieTopology.xml --logfile --restarts=$restarts --stdlog=0 2>>$logfilename.stderr 1>>$logfilename.stdout &
     echo $! > $PID_NAME
     wait
+    log "Removing $PID_NAME"
     rm $PID_NAME
 done
-

+ 14 - 0
initfiles/bin/init_sasha

@@ -22,27 +22,41 @@ PID_NAME="$PID/$(basename $PWD).pid"
 INSTALL_DIR="$(dirname ${PATH_PRE})/.."
 source  ${INSTALL_DIR}/etc/init.d/hpcc_common
 
+component=$(basename $PWD)
+export logfile="${LOG_DIR}/${component}/init_${component}_$(date +%Y_%m_%d_%H_%M_%S).log"
+
 export SENTINEL="saserver.sentinel"
+log "Removing $SENTINEL"
 rm -f ${SENTINEL}
 
 killed()
 {
+    log "Attempting to kill $component"
     kill_process ${PID_NAME} saserver 3 ${SENTINEL}
+    if [[ $? -eq 1 ]]; then
+      log "could not kill $component"
+    else
+      log "$component Stopped"
+    fi
     exit 255
 }
 
 trap "killed" SIGINT SIGTERM SIGKILL
+log "Calling saserver 1>/dev/null 2>/dev/null &"
 saserver 1>/dev/null 2>/dev/null &
 echo $! > $PID_NAME
 wait
+log "Removing $PID_NAME"
 rm $PID_NAME
 
 while [[ -e ${SENTINEL} ]]; do
     sleep 5
     if [[ -e ${SENTINEL} ]]; then
+        log "Calling saserver 1>/dev/null 2>/dev/null &"
         saserver 1>/dev/null 2>/dev/null &
         echo $! > $PID_NAME
         wait
+        log "Removing $PID_NAME"
         rm $PID_NAME
     fi
 done

+ 43 - 41
initfiles/bin/init_thor

@@ -17,15 +17,23 @@
 
 deploydir=$(dirname $(type -path $0))
 PATH_PRE=$(type -path hpcc_setenv)
-source ${PATH_PRE} 
+source ${PATH_PRE}
 
 INSTALL_DIR=$(dirname ${PATH_PRE})/..
 source  ${INSTALL_DIR}/etc/init.d/hpcc_common
+component=$(basename $PWD)
 
-PID_NAME="$PID/$(basename $PWD).pid"
-INIT_PID_NAME="$PID/init_$(basename $PWD).pid"
-echo $$ > $INIT_PID_NAME
+PID_NAME="$PID/${component}.pid"
 
+timestamp="$(date +%Y_%m_%d_%H_%M_%S)"
+export logfile="${LOG_DIR}/${component}/init_${component}_${timestamp}.log"
+
+# for use by init_thorslave call
+logredirect="init_thorslave_${component}_${timestamp}.log"
+
+log "Starting ${component}"
+
+log "removing any previous sentinel file"
 export SENTINEL="thor.sentinel"
 rm -f ${SENTINEL}
 
@@ -33,7 +41,6 @@ if [[ -z "$deploydir" ]]; then
     deploydir=$(pwd -P)
 fi
 
-compname=$(basename $PWD)
 instancedir=$(pwd -P)
 source $instancedir/setvars
 
@@ -45,10 +52,6 @@ fi
 ln -s -f $deploydir/thormaster${LCR} thormaster_$THORNAME
 
 ENV_DIR=$(cat ${HPCC_CONFIG} | sed -n "/\[DEFAULT\]/,/\[/p" | grep "^configs=" | sed -e 's/^configs=//')
-logdir=$(updtdalienv $ENV_DIR/environment.xml -d log thor $THORNAME)
-logfile=$logdir/${THORNAME}.log
-
-exec >> $logfile
 
 contains()
 {
@@ -91,14 +94,14 @@ makethorgroup()
 
 kill_slaves()
 {
+    log "Killing slaves"
     if [[ "$localthor" = "true" ]]; then
-    $deploydir/init_thorslave stop localhost $THORMASTER $THORMASTERPORT $logdir $instancedir $deploydir $THORNAME $PATH_PRE $logredirect
+        $deploydir/init_thorslave stop localhost $THORMASTER $THORMASTERPORT $LOG_DIR $instancedir $deploydir $THORNAME $PATH_PRE $logredirect
     else
         # we want to kill only slaves that have already been started in run_thor
         if [[ -r $instancedir/uslaves.start ]]; then
             nslaves=$(cat $instancedir/uslaves.start 2> /dev/null | wc -l)
-            $deploydir/frunssh $instancedir/uslaves.start "/bin/sh -c '$deploydir/init_thorslave stop %a $THORMASTER $THORMASTERPORT $logdir $instancedir $deploydir $THORNAME $PATH_PRE $logredirect'" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries -n:$nslaves 2>&1 | egrep -v "no process killed"
-            echo slaves stopped
+            $deploydir/frunssh $instancedir/uslaves.start "/bin/sh -c '$deploydir/init_thorslave stop %a $THORMASTER $THORMASTERPORT $LOG_DIR $instancedir $deploydir $THORNAME $PATH_PRE $logredirect'" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries -n:$nslaves 2>&1
         fi
     fi
 
@@ -107,15 +110,16 @@ kill_slaves()
 
 killed()
 {
-    echo "Stopping"
-    kill_process ${PID_NAME} thormaster_${compname} 30
+    log "Stopping ${component}"
+    kill_process ${PID_NAME} thormaster_${component} 30
     kill_slaves
+    log "removing init.pid file and uslaves.start file"
     rm -f $INIT_PID_NAME $instancedir/uslaves.start > /dev/null 2>&1
     exit 255
 }
 
 trap "killed" SIGINT SIGTERM
-# attempt to clean up any old slaves
+log "Ensuring a clean working environment ..."
 kill_slaves
 thorpid=0
 
@@ -124,87 +128,85 @@ while [[ 1 ]]; do
     daliadmin server=$DALISERVER dfsgroup ${groupName} slaves
     errcode=$?
     if [[ 0 != ${errcode} ]]; then
-    echo "failed to lookup dali group for $groupName"
+    log "failed to lookup dali group for $groupName"
         exit 1
     fi
     makethorgroup
     sort $instancedir/slaves | uniq > $instancedir/uslaves.start
 
-    echo "--------------------------"
-    echo "starting thorslaves ..."
+    log "--------------------------"
+    log "starting thorslaves ..."
 
-    logpthtail=$(date +%m_%d_%Y_%H_%M_%S)
-    logredirect="$logdir/init_thorslave_$logpthtail.log"
     # Would be simpler, if there was simple way to test if ip is local and get rid of 'localthor' setting
     if [[ "$localthor" = "true" ]]; then
         slaveip=$(head -n 1 $instancedir/uslaves.start)
-        $deploydir/init_thorslave start $slaveip $THORMASTER $THORMASTERPORT $logdir $instancedir $deploydir $THORNAME $PATH_PRE $logredirect
+        $deploydir/init_thorslave start $slaveip $THORMASTER $THORMASTERPORT $LOG_DIR $instancedir $deploydir $THORNAME $PATH_PRE $logredirect
     else
         nslaves=$(cat $instancedir/uslaves.start | wc -l)
-        $deploydir/frunssh $instancedir/uslaves.start "/bin/sh -c '$deploydir/init_thorslave start %a $THORMASTER $THORMASTERPORT $logdir $instancedir $deploydir $THORNAME $PATH_PRE $logredirect'" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries -n:$nslaves 2>&1
+        $deploydir/frunssh $instancedir/uslaves.start "/bin/sh -c '$deploydir/init_thorslave start %a $THORMASTER $THORMASTERPORT $LOG_DIR $instancedir $deploydir $THORNAME $PATH_PRE $logredirect'" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries -n:$nslaves 2>&1
         FRUNSSH_RC=$?
         if [[ ${FRUNSSH_RC} -gt 0 ]]; then
-            echo "Error ${FRUNSSH_RC} in frunssh"
-            echo "Please check $(dirname ${logdir})/frunssh for more details"
+            log "Error ${FRUNSSH_RC} in frunssh"
+            log "Please check $(dirname ${LOG_DIR})/frunssh for more details"
             # clean up any slaves it was able to reach
             killed
         fi
     fi
 
-    echo thormaster cmd : $instancedir/thormaster_$THORNAME MASTER=$THORMASTER:$THORMASTERPORT
+    log "thormaster cmd : $instancedir/thormaster_$THORNAME MASTER=$THORMASTER:$THORMASTERPORT"
     nohup $instancedir/thormaster_$THORNAME MASTER=$THORMASTER:$THORMASTERPORT 2> /dev/null 1>/dev/null &
 
     thorpid=$!
     if [[ "$thorpid" -ne "0" ]]; then
-        echo thormaster$LCR process started pid = $thorpid
+        log "thormaster$LCR process started pid = $thorpid"
 
         echo $thorpid > $PID_NAME
         wait $thorpid
         errcode=$?
         case $errcode in
         # TEC_Clean
-        0)  echo "Thormaster ($thorpid) Exited cleanly"
+        0)  log "Thormaster ($thorpid) Exited cleanly"
             rm -f $instancedir/uslaves.start $PID_NAME $INIT_PID_NAME > /dev/null 2>&1
             exit 0
             ;;
         # TEC_CtrlC
-        1)  echo "Thormaster ($thorpid) Interrupted, Ctrl-C caught"
+        1)  log "Thormaster ($thorpid) Interrupted, Ctrl-C caught"
             killed
             ;;
         # TEC_Idle, TEC_Watchdog, TEC_Swap, TEC_DaliDown
-        2|3|5|6)    [[ $errcode -eq 2 ]] && echo "Thormaster ($thorpid) Idle"
-                    [[ $errcode -eq 3 ]] && echo "Thormaster ($thorpid) Lost connection to slave(s)"
-                    [[ $errcode -eq 5 ]] && echo "Thormaster ($thorpid) Swap node required"
-                    [[ $errcode -eq 6 ]] && echo "Thormaster ($thorpid) Unable to connect to Dali"
-                    echo 'stopping thorslave(s) for restart'
+        2|3|5|6)    [[ $errcode -eq 2 ]] && log "Thormaster ($thorpid) Idle"
+                    [[ $errcode -eq 3 ]] && log "Thormaster ($thorpid) Lost connection to slave(s)"
+                    [[ $errcode -eq 5 ]] && log "Thormaster ($thorpid) Swap node required"
+                    [[ $errcode -eq 6 ]] && log "Thormaster ($thorpid) Unable to connect to Dali"
+                    log "Stopping thorslave(s) for restart"
                     kill_slaves
                     if [[ 0 != $autoSwapNode ]]; then
-                        echo "Running autoswap $THORNAME :: ($thorpid)"
-                        swapnode auto $DALISERVER $compname
+                        log "Running autoswap $THORNAME :: ($thorpid)"
+                        swapnode auto $DALISERVER $component
                         errcode=$?
                         if [[ 0 != ${errcode} ]]; then
-                            echo "auto swap node failed, errcode=${errcode}"
+                            log "auto swap node failed, errcode=${errcode}"
                             killed
                         fi
                     fi
                 # restarting thormaster
                 ;;
         # TEC_SlaveInit
-        4)  echo "Thormaster ($thorpid) Slaves failed to initialize"
-            echo "Shutting down"
+        4)  log "Thormaster ($thorpid) Slaves failed to initialize"
+            log "Shutting down"
             killed
             ;;
-        *)  echo "Thormaster ($thorpid) Unknown error code.  Stopping"
+        *)  log "Thormaster ($thorpid) Unknown error code.  Stopping"
             killed
             ;;
         esac
     else
-        echo failed to start thormaster$LCR, pausing for 30 seconds
+        log "failed to start thormaster$LCR, pausing for 30 seconds"
         sleep 30
         kill_slaves
     fi
     if [[ ! -e $SENTINEL ]]; then
-        echo $SENTINEL 'has been removed or thormaster did not fully start - script stopping'
+        log "$SENTINEL has been removed or thormaster did not fully start - script stopping"
         exit 0
     fi
 done

+ 17 - 15
initfiles/bin/init_thorslave

@@ -24,9 +24,10 @@ instancedir=$6
 deploydir=$7
 hpcc_compname=$8
 hpcc_setenv=$9
-logredirect=${10}
+export logfile="${logpth}/${hpcc_compname}/${10}"
 
-source ${hpcc_setenv}
+source "$hpcc_setenv"
+source "$(dirname $hpcc_setenv)/../etc/init.d/hpcc_common"
 
 slavename=thorslave_${hpcc_compname}
 
@@ -34,6 +35,7 @@ stop_slaves()
 {
     killall -0 $slavename > /dev/null 2>&1
     if [[ $? -eq 0 ]];then
+        log "killing slaves"
         killall -9 $slavename > /dev/null 2>&1
     fi
     rm -f $PID/${slavename}_*.pid > /dev/null 2>&1
@@ -41,6 +43,7 @@ stop_slaves()
 
 start_slaves()
 {
+
     # ensuring dafilesrv is running on the machine as it is a prerequisite
     sudo /etc/init.d/dafilesrv status > /dev/null 2>&1
     if [[ $? -ne 0 ]];then
@@ -52,26 +55,25 @@ start_slaves()
 
     # ensuring the parent directory structure is set up properly
     mkdir -p $instancedir
-    mkdir -p $(dirname $logredirect)
-    exec >>$logredirect 2>&1
+    mkdir -p $(dirname $logfile)
 
     cd $instancedir
 
-    echo "$(date) Dependency dafilesrv is running"
+    log "dependency dafilesrv started"
 
     ulimit -Sc hard > /dev/null 2>&1
-    [[ $? -ne 0 ]] && echo "$(date) Failed to set ulimit for core file size"
+    [[ $? -ne 0 ]] && log "Failed to set ulimit for core file size"
     ulimit -Sn hard > /dev/null 2>&1
-    [[ $? -ne 0 ]] && echo "$(date) Failed to set ulimit for number of file descriptors open"
+    [[ $? -ne 0 ]] && log "Failed to set ulimit for number of file descriptors open"
 
-    echo "$(date) slave(${ip}) init"
-    echo "$(date) slave(s) starting"
+    log "slave(${ip}) init"
+    log "slave(s) starting"
 
     # create symlink for easier identification of slaves by compName
     ln -s -f $deploydir/thorslave_lcr ${slavename}
 
     # sync to current master thorgroup
-    echo "$(date) rsync -e ssh -o StrictHostKeyChecking=no ${master}:${instancedir}/thorgroup ${instancedir}/thorgroup.slave"
+    log "rsync -e ssh -o StrictHostKeyChecking=no ${master}:${instancedir}/thorgroup ${instancedir}/thorgroup.slave"
     rsync -e "ssh -o StrictHostKeyChecking=no" $master:$instancedir/thorgroup $instancedir/thorgroup.slave
 
     let "slavenum = 1";
@@ -82,13 +84,13 @@ start_slaves()
             if [[ "$slaveport" = "" ]]; then
                 slaveport=$THORSLAVEPORT
             fi
-            echo "$(date) $slavename  master=$master:$masterport slave=.:$slaveport slavenum=$slavenum logDir=$logpth"
-            ./$slavename master=$master:$masterport slave=.:$slaveport slavenum=$slavenum logDir=$logpth 2>/dev/null 1>/dev/null &
+            log "$slavename  master=$master:$masterport slave=.:$slaveport slavenum=$slavenum logDir=$logpth/$hpcc_compname"
+            ./$slavename master=$master:$masterport slave=.:$slaveport slavenum=$slavenum logDir=$logpth/$hpcc_compname 2>/dev/null 1>/dev/null &
             slavepid=$!
             if [[ "$slavepid" -eq "0" ]]; then
-                echo "$(date) failed to start"
+                log "failed to start"
             else
-                echo "$(date) slave pid $slavepid started"
+                log "slave pid $slavepid started"
                 PID_NAME="$PID/${slavename}_${slavenum}.pid"
                 echo $slavepid > $PID_NAME
             fi
@@ -99,7 +101,7 @@ start_slaves()
 
 print_usage()
 {
-  echo usage: cmd ip master masterport logdir workingdir deploydir hpcc_compname hpcc_setenv logredirect
+  # Print the expected argument list. $logfile is derived from the very
+  # arguments that are missing when usage is needed, and log presumably
+  # writes only to that file (TODO confirm), so also echo to stdout so the
+  # caller still sees the message.
+  log "usage: cmd ip master masterport logdir workingdir deploydir hpcc_compname hpcc_setenv logredirect"
+  echo "usage: cmd ip master masterport logdir workingdir deploydir hpcc_compname hpcc_setenv logredirect"
 }
 
 ##  Main

+ 30 - 32
initfiles/componentfiles/thor/start_backupnode.in

@@ -26,15 +26,6 @@ if [ $# -lt 1 ]; then
     exit 1
 fi
 
-pid=`${PIDOF} backupnode`
-if [ -n "$pid" ]; then
-   echo stopping previous backupnode processes $pid
-   killall backupnode
-fi
-
-echo ------------------------------
-echo starting backupnode ...
-
 PATH_PRE=`type -path hpcc_setenv`
 if [ -z "$PATH_PRE" ]; then
     # assume default conf location
@@ -47,10 +38,24 @@ ENVPATH=${CONFIG_DIR}/${ENV_XML_FILE}
 RUN_DIR=`cat ${HPCC_CONFIG} | sed -n "/\[DEFAULT\]/,/\[/p" | grep "^runtime=" | sed -e 's/^runtime=//'`
 INSTANCE_DIR=$RUN_DIR/$1
 
+LOGPATH=`updtdalienv $ENVPATH -d log backupnode backupnode`
+export logpthtail="`date +%Y_%m_%d_%H_%M_%S`"
+export logfile="$LOGPATH/backupnode_${logpthtail}.log"
+mkdir -p `dirname $logfile`
+
+pid=`${PIDOF} backupnode`
+if [ -n "$pid" ]; then
+   log "stopping previous backupnode processes $pid"
+   killall backupnode
+fi
+
+log "------------------------------"
+log "starting backupnode ..."
+
 if [ ! -e $INSTANCE_DIR ] ; then
   # perhaps they gave a full path?
   if [ ! -e $1 ] ; then
-    echo Usage: $0 thor_cluster_name
+    echo "Usage: $0 thor_cluster_name"
     exit 1
   fi
   INSTANCE_DIR=$1
@@ -61,7 +66,7 @@ cd $INSTANCE_DIR
 PID_NAME="$PID/`basename $INSTANCE_DIR`.pid"
 BACKUPNODE_DATA=`updtdalienv $ENVPATH -d data backupnode backupnode`
 if [ -z "$BACKUPNODE_DATA" ]; then
-    echo cannot determine backupnode directory 
+    echo "cannot determine backupnode directory"
     exit 1
 fi
 . $INSTANCE_DIR/setvars
@@ -74,7 +79,7 @@ fi
 daliadmin server=$DALISERVER dfsgroup ${groupName} $INSTANCE_DIR/backupnode.slaves
 errcode=$?
 if [ 0 != ${errcode} ]; then
-    echo 'failed to lookup dali group for $groupName'
+    echo "failed to lookup dali group for $groupName"
     exit 1
 fi
 
@@ -89,43 +94,36 @@ mkdir -p $BACKUPNODE_DATA
 rm -f $BACKUPNODE_DATA/*.ERR
 rm -f $BACKUPNODE_DATA/*.DAT
 
-echo Using backupnode directory $BACKUPNODE_DATA
-echo Reading slaves file $INSTANCE_DIR/backupnode.slaves
-echo Scanning files from dali ...
+log "Using backupnode directory $BACKUPNODE_DATA"
+log "Reading slaves file $INSTANCE_DIR/backupnode.slaves"
+log "Scanning files from dali ..."
 
 NODEGROUP=$THORPRIMARY
 if [ -z "$THORPRIMARY" ]; then
   NODEGROUP=$THORNAME
 fi
 
-LOGPATH=`updtdalienv $ENVPATH -d log backupnode backupnode`
-LOGDATE=`date +%m_%d_%Y_%H_%M_%S`
-LOGFILE="$LOGPATH/$LOGDATE".log
-mkdir -p `dirname $LOGFILE` 
 
-$DEPLOY_DIR/backupnode -O $DALISERVER $NODEGROUP $BACKUPNODE_DATA >> $LOGFILE 2>&1
+$DEPLOY_DIR/backupnode -O $DALISERVER $NODEGROUP $BACKUPNODE_DATA >> $logfile 2>&1
 if [ $? -ne 0 ]; then
-  echo Backupnode failed - see $LOGFILE
+  echo Backupnode failed - see $logfile
   exit 1
 fi
 
 # maximum number of threads frunssh will be permitted to use (capped by # slaves)
 MAXTHREADS=1000
 
-frunssh $INSTANCE_DIR/backupnode.slaves "killall backupnode" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries -n:$MAXTHREADS -b >> $LOGFILE 2>&1
-echo frunssh $INSTANCE_DIR/backupnode.slaves "/bin/sh -c 'mkdir -p `dirname $LOGPATH/${LOGDATE}_node%n.log`; mkdir -p $INSTANCE_DIR; $DEPLOY_DIR/backupnode -T -X $BACKUPNODE_REMOTEDATA %n %c %a %x $2 > $LOGPATH/${LOGDATE}_node%n.log 2>&1'" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries -n:$MAXTHREADS -b >> $LOGFILE 2>&1
-frunssh $INSTANCE_DIR/backupnode.slaves "/bin/sh -c 'mkdir -p `dirname $LOGPATH/${LOGDATE}_node%n.log`; mkdir -p $INSTANCE_DIR; $DEPLOY_DIR/backupnode -T -X $BACKUPNODE_REMOTEDATA %n %c %a %x $2 > $LOGPATH/${LOGDATE}_node%n.log 2>&1'" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries -n:$MAXTHREADS -b >> $LOGFILE 2>&1
+frunssh $INSTANCE_DIR/backupnode.slaves "killall backupnode" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries -n:$MAXTHREADS -b >> $logfile 2>&1
+log "frunssh $INSTANCE_DIR/backupnode.slaves \"/bin/sh -c 'mkdir -p `dirname $LOGPATH/${logfile}_node%n.log`; mkdir -p $INSTANCE_DIR; $DEPLOY_DIR/backupnode -T -X $BACKUPNODE_REMOTEDATA %n %c %a %x $2 > $LOGPATH/${logfile}_node%n.log 2>&1'\" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries -n:$MAXTHREADS -b >> $logfile 2>&1"
+frunssh $INSTANCE_DIR/backupnode.slaves "/bin/sh -c 'mkdir -p `dirname $LOGPATH/${logfile}_node%n.log`; mkdir -p $INSTANCE_DIR; $DEPLOY_DIR/backupnode -T -X $BACKUPNODE_REMOTEDATA %n %c %a %x $2 > $LOGPATH/${logfile}_node%n.log 2>&1'" -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword -t:$SSHtimeout -a:$SSHretries -n:$MAXTHREADS -b >> $logfile 2>&1
 
-echo ------------------------------
-sleep 5
-echo ------------------------------
-echo Waiting for backup to complete
+log "------------------------------"
+log "Waiting for backup to complete"
 
-nohup backupnode -W $INSTANCE_DIR/backupnode.slaves $BACKUPNODE_DATA >> $LOGFILE 2>&1 &
+nohup backupnode -W $INSTANCE_DIR/backupnode.slaves $BACKUPNODE_DATA >> $logfile 2>&1 &
 pid=`${PIDOF} backupnode`
 trap "echo start_backupnode exiting, backupnode process $pid still continuing; exit 0" 2
+sleep 5
 if [ -n "$pid" ]; then
-  tail --pid $pid -f $LOGFILE 2>/dev/null
+  tail --pid $pid -f $logfile 2>/dev/null
 fi
-
-