浏览代码

Merge pull request #8265 from jakesmith/hpcc-14998

HPCC-14998 Spot Thor topology vs group mismatch during init

Reviewed-By: Michael Gardner <michael.gardner@lexisnexis.com>
Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 9 年之前
父节点
当前提交
c57bdce4f2
共有 3 个文件被更改,包括 22 次插入和 10 次删除
  1. 2 2
      dali/base/dadfs.cpp
  2. 18 7
      dali/daliadmin/daliadmin.cpp
  3. 2 1
      initfiles/bin/init_thor

+ 2 - 2
dali/base/dadfs.cpp

@@ -9639,8 +9639,8 @@ IGroup *getClusterNodeGroup(const char *clusterName, const char *type, unsigned
     Owned<IGroup> nodeGroup = queryNamedGroupStore().lookup(nodeGroupName);
     CInitGroups init(timems);
     Owned<IGroup> expandedClusterGroup = init.getGroupFromCluster(type, cluster, true);
-    if (nodeGroup->ordinality() != expandedClusterGroup->ordinality()) // sanity check
-        throwUnexpected();
+    if (!expandedClusterGroup->equals(nodeGroup))
+        throwStringExceptionV(0, "DFS cluster topology for '%s', does not match existing DFS group layout for group '%s'", clusterName, nodeGroupName.str());
     Owned<IGroup> clusterGroup = init.getGroupFromCluster(type, cluster, false);
     ICopyArrayOf<INode> nodes;
     for (unsigned n=0; n<clusterGroup->ordinality(); n++)

+ 18 - 7
dali/daliadmin/daliadmin.cpp

@@ -669,15 +669,26 @@ static void dfsGroup(const char *name, const char *outputFilename)
     writeGroup(group, name, outputFilename);
 }
 
-static void clusterGroup(const char *name, const char *outputFilename)
+static int clusterGroup(const char *name, const char *outputFilename)
 {
-    Owned<IGroup> group = getClusterNodeGroup(name, "ThorCluster");
-    if (!group)
+    StringBuffer errStr;
+    try
     {
-        ERRLOG("cannot find group %s",name);
-        return;
+        Owned<IGroup> group = getClusterNodeGroup(name, "ThorCluster");
+        if (group)
+        {
+            writeGroup(group, name, outputFilename);
+            return 0; // success
+        }
+        errStr.appendf("cannot find group %s", name);
     }
-    writeGroup(group, name, outputFilename);
+    catch (IException *e)
+    {
+        e->errorMessage(errStr);
+        e->Release();
+    }
+    ERRLOG("%s", errStr.str());
+    return 1;
 }
 
 //=============================================================================
@@ -2811,7 +2822,7 @@ int main(int argc, char* argv[])
                     }
                     else if (stricmp(cmd,"clusternodes")==0) {
                         CHECKPARAMS(1,2);
-                        clusterGroup(params.item(1),(np>1)?params.item(2):NULL);
+                        ret = clusterGroup(params.item(1),(np>1)?params.item(2):NULL);
                     }
                     else if (stricmp(cmd,"dfsmap")==0) {
                         CHECKPARAMS(1,1);

+ 2 - 1
initfiles/bin/init_thor

@@ -106,10 +106,11 @@ thorpid=0
 
 while [[ 1 ]]; do
     # update slaves file in case state of environment has been altered since last run
-    daliadmin server=$DALISERVER clusternodes ${component} slaves
+    errorMessage=$( daliadmin server=$DALISERVER clusternodes ${component} $instancedir/slaves 2>&1 )
     errcode=$?
     if [[ 0 != ${errcode} ]]; then
         log "failed to lookup dali group for ${component}"
+        log "$errorMessage"
         exit 1
     fi