Browse Source

Fix gh-685 thorlib.group() gives internal error in eclagent

thorlib.group() (deprecated) is giving an internal error in eclagent. It's
deprecated because it is not always possible to give an unambiguous answer,
since there may be several thor clusters associated with a job with different
nodegroup names.

However, in the interests of harmony with legacy users, we should make the
effort to return an answer where an unambiguous one is available.

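If all thors associated with the wu use the same nodegroup, return that
nodegroup name. If there are no thors associated, return the named group
matching the eclagent's own node (found by a reverse lookup in dali).
Otherwise (the thors use different nodegroups), fail.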

See also bugzilla ticket 89275

Signed-off-by: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 13 years ago
parent
commit
1954401148
3 changed files with 67 additions and 28 deletions
  1. 23 25
      common/workunit/workunit.cpp
  2. 1 1
      common/workunit/workunit.hpp
  3. 43 2
      ecl/eclagent/eclagent.cpp

+ 23 - 25
common/workunit/workunit.cpp

@@ -4063,39 +4063,37 @@ const char *getTargetClusterComponentName(const char *clustname, const char *pro
     return name.str();
 }
 
-unsigned getEnvironmentThorClusterNames(StringArray &clusternames, StringArray &groupnames, StringArray &qnames)
+unsigned getEnvironmentThorClusterNames(StringArray &thorNames, StringArray &groupNames, StringArray &targetNames)
 {
     Owned<IRemoteConnection> conn = querySDS().connect("Environment", myProcessSession(), RTM_LOCK_READ, SDS_LOCK_TIMEOUT);
     if (!conn)
         return 0;
-    Owned<IPropertyTreeIterator> iter = conn->queryRoot()->getElements("Software/Topology/EclServerProcess/Cluster");
-    ForEach(*iter) {
-        IPropertyTree &tc = iter->query();
-        // I think we want the name here but it isn't exactly clear
-        const char *cname = tc.queryProp("@name");
-        if (cname&&*cname) {
-            Owned<IPropertyTreeIterator> iter2 = tc.getElements("ThorCluster");
-            ForEach(*iter2) {
-                IPropertyTree &tc2 = iter2->query();
-                StringBuffer query;
-                const char *pname = tc2.queryProp("@process");
-                query.appendf("Software/ThorCluster[@name=\"%s\"]",pname);
-                IPropertyTree *t = conn->queryRoot()->queryPropTree(query.str());
-                if (t) {
-                    const char *qname = t->queryProp("@queueName");
-                    if (!qname||!*qname)
-                        qname = pname;
-                    const char *gname = t->queryProp("@nodeGroup");
-                    if (!gname||!*gname)
-                        gname = pname;
-                    clusternames.append(cname);
-                    groupnames.append(gname);
-                    qnames.append(qname);
+    Owned<IPropertyTreeIterator> allTargets = conn->queryRoot()->getElements("Software/Topology/Cluster");
+    ForEach(*allTargets)
+    {
+        IPropertyTree &target = allTargets->query();
+        const char *targetName = target.queryProp("@name");
+        if (targetName && *targetName)
+        {
+            Owned<IPropertyTreeIterator> thorClusters = target.getElements("ThorCluster");
+            ForEach(*thorClusters)
+            {
+                const char *thorName = thorClusters->query().queryProp("@process");
+                VStringBuffer query("Software/ThorCluster[@name=\"%s\"]",thorName);
+                IPropertyTree *thorCluster = conn->queryRoot()->queryPropTree(query.str());
+                if (thorCluster)
+                {
+                    const char *groupName = thorCluster->queryProp("@nodeGroup");
+                    if (!groupName||!*groupName)
+                        groupName = thorName;
+                    thorNames.append(thorName);
+                    groupNames.append(groupName);
+                    targetNames.append(targetName);
                 }
             }
         }
     }
-    return clusternames.ordinality();
+    return thorNames.ordinality();
 }
 
 

+ 1 - 1
common/workunit/workunit.hpp

@@ -1132,7 +1132,7 @@ extern WORKUNIT_API bool getWorkUnitCreateTime(const char *wuid,CDateTime &time)
 extern WORKUNIT_API bool restoreWorkUnit(const char *base,const char *wuid);
 extern WORKUNIT_API void clientShutdownWorkUnit();
 extern WORKUNIT_API IExtendedWUInterface * queryExtendedWU(IWorkUnit * wu);
-extern WORKUNIT_API unsigned getEnvironmentThorClusterNames(StringArray &clusternames, StringArray &groupnames, StringArray &qnames);
+extern WORKUNIT_API unsigned getEnvironmentThorClusterNames(StringArray &thorNames, StringArray &groupNames, StringArray &targetNames);
 extern WORKUNIT_API StringBuffer &formatGraphTimerLabel(StringBuffer &str, const char *graphName, unsigned subGraphNum=0, unsigned __int64 subId=0);
 extern WORKUNIT_API bool parseGraphTimerLabel(const char *label, StringBuffer &graphName, unsigned &subGraphNum, unsigned __int64  &subId);
 extern WORKUNIT_API void addExceptionToWorkunit(IWorkUnit * wu, WUExceptionSeverity severity, const char * source, unsigned code, const char * text, const char * filename, unsigned lineno, unsigned column);

+ 43 - 2
ecl/eclagent/eclagent.cpp

@@ -2707,8 +2707,49 @@ char * EclAgent::getClusterName()
 
 char * EclAgent::getGroupName()
 {
-    // We don't know the name of the thor we will run on - can only tell the queue.
-    throwUnexpected();
+    StringBuffer groupName;
+    if (!isStandAloneExe)
+    {
+        const char * cluster = clusterNames.tos();
+        Owned<IConstWUClusterInfo> clusterInfo = getTargetClusterInfo(cluster);
+        if (!clusterInfo)
+            throw MakeStringException(-1, "Unknown cluster '%s'", cluster);
+        const StringArray &thors = clusterInfo->getThorProcesses();
+        if (thors.length())
+        {
+            StringArray envClusters, envGroups, envQueues;
+            getEnvironmentThorClusterNames(envClusters, envGroups, envQueues);
+            ForEachItemIn(i, thors)
+            {
+                const char *thorName = thors.item(i);
+                ForEachItemIn(j, envClusters)
+                {
+                    if (strieq(thorName, envClusters.item(j)))
+                    {
+                        const char *envGroup = envGroups.item(j);
+                        if (groupName.length())
+                        {
+                            if (!strieq(groupName, envGroup))
+                                throw MakeStringException(-1, "getGroupName(): ambiguous groups %s, %s", groupName.str(), envGroup);
+                        }
+                        else
+                            groupName.append(envGroup);
+                        break;
+                    }
+                }
+            }
+
+        }
+        else
+        {
+            // eclagent group name not stored in cluster info so reverse lookup in dali (bit of kludge)
+            SocketEndpoint ep = queryMyNode()->endpoint();
+            ep.port = 0;
+            Owned<IGroup> grp = createIGroup(1,&ep);
+            queryNamedGroupStore().find(grp, groupName);
+        }
+    }
+    return groupName.detach();
 }
 
 char * EclAgent::queryIndexMetaData(char const * lfn, char const * xpath)