Explorar o código

Merge pull request #6326 from wangkx/h11958m

HPCC-11958 Add Roxie State check to preflight

Reviewed-By: Anthony Fishbeck <anthony.fishbeck@lexisnexis.com>
Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman %!s(int64=10) %!d(string=hai) anos
pai
achega
a7b21f4529

+ 86 - 0
common/roxiecommlib/roxiecommunicationclient.cpp

@@ -140,6 +140,52 @@ protected:
         return sendRoxieControlQuery(xml, true);
     }
 
+    bool sendRoxieControlLock(ISocket *sock, bool allOrNothing, unsigned wait)
+    {
+        Owned<IPropertyTree> resp = sendRoxieControlQuery(sock, "<control:lock/>", wait);
+        if (allOrNothing)
+        {
+            int lockCount = resp->getPropInt("Lock", 0);
+            int serverCount = resp->getPropInt("NumServers", 0);
+            return (lockCount && (lockCount == serverCount));
+        }
+
+        return resp->getPropInt("Lock", 0) != 0;
+    }
+
+    void checkRoxieControlExceptions(IPropertyTree *msg)
+    {
+        Owned<IMultiException> me = MakeMultiException();
+        Owned<IPropertyTreeIterator> endpoints = msg->getElements("Endpoint");
+        ForEach(*endpoints)
+        {
+            IPropertyTree &endp = endpoints->query();
+            Owned<IPropertyTreeIterator> exceptions = endp.getElements("Exception");
+            ForEach (*exceptions)
+            {
+                IPropertyTree &ex = exceptions->query();
+                me->append(*MakeStringException(ex.getPropInt("Code"), "Endpoint %s: %s", endp.queryProp("@ep"), ex.queryProp("Message")));
+            }
+        }
+        if (me->ordinality())
+            throw me.getClear();
+    }
+
+    unsigned waitMsToSeconds(unsigned wait)
+    {
+        if (wait==0 || wait==(unsigned)-1)
+            return wait;
+        return wait/1000;
+    }
+
+    unsigned remainingMsWait(unsigned wait, unsigned start)
+    {
+        if (wait==0 || wait==(unsigned)-1)
+            return wait;
+        unsigned waited = msTick()-start;
+        return (wait>waited) ? wait-waited : 0;
+    }
+
     const char* buildRoxieName(const char* orig_name, StringBuffer& roxiename)
     {
         const char *last_dot = strrchr(orig_name, '.');
@@ -265,6 +311,46 @@ public:
         }
     }
 
+    IPropertyTree *sendRoxieControlAllNodes(const char *msg, bool allOrNothing)
+    {
+        Owned<ISocket> sock = ISocket::connect_timeout(ep, roxieTimeout);
+        return sendRoxieControlAllNodes(sock, msg, allOrNothing, roxieTimeout);
+    }
+
+    IPropertyTree *sendRoxieControlAllNodes(ISocket *sock, const char *msg, bool allOrNothing, unsigned wait)
+    {
+        unsigned start = msTick();
+        if (!sendRoxieControlLock(sock, allOrNothing, wait))
+            throw MakeStringException(-1, "Roxie control:lock failed");
+        return sendRoxieControlQuery(sock, msg, remainingMsWait(wait, start));
+    }
+
+    IPropertyTree *sendRoxieControlQuery(ISocket *sock, const char *msg, unsigned wait)
+    {
+        size32_t msglen = strlen(msg);
+        size32_t len = msglen;
+        _WINREV(len);
+        sock->write(&len, sizeof(len));
+        sock->write(msg, msglen);
+
+        StringBuffer resp;
+        loop
+        {
+            sock->read(&len, sizeof(len));
+            if (!len)
+                break;
+            _WINREV(len);
+            size32_t size_read;
+            sock->read(resp.reserveTruncate(len), len, len, size_read, waitMsToSeconds(wait));
+            if (size_read<len)
+                throw MakeStringException(-1, "Error reading roxie control message response");
+        }
+
+        Owned<IPropertyTree> ret = createPTreeFromXMLString(resp.str());
+        checkRoxieControlExceptions(ret);
+        return ret.getClear();
+    }
+
     IPropertyTree * retrieveQuery(const char *id)
     {
         StringBuffer xpath;

+ 20 - 2
esp/eclwatch/ws_XSLT/clusterprocesses.xslt

@@ -391,6 +391,9 @@
                      <xsl:for-each select="../../../Columns/Item[text()='State']">
                         <th align="center"><xsl:value-of select= "$hpccStrings/st[@id='State']"/></th>
                      </xsl:for-each>
+                     <xsl:for-each select="../../../Columns/Item[text()='Roxie State']">
+                        <th align="center"><xsl:value-of select= "$hpccStrings/st[@id='RoxieState']"/></th>
+                     </xsl:for-each>
                      <xsl:for-each select="../../../Columns/Item[text()='UpTime']">
                         <th align="center"><xsl:value-of select= "$hpccStrings/st[@id='UpTime']"/></th>
                      </xsl:for-each>
@@ -404,7 +407,7 @@
                         </th>
                      </xsl:for-each>
                      <!--process disk storage next-->
-                     <xsl:for-each select="../../../Columns/Item[text()!='Processes' and text()!='Up Time' and not(contains(text(), 'Memory')) and not(starts-with(text(), 'CPU')) and text()!='State' and text()!='Condition' and text()!='UpTime' and text()!='Swap']">
+                     <xsl:for-each select="../../../Columns/Item[text()!='Processes' and text()!='Up Time' and not(contains(text(), 'Memory')) and not(starts-with(text(), 'CPU')) and text()!='State' and text()!='Roxie State' and text()!='Condition' and text()!='UpTime' and text()!='Swap']">
                         <th align="center"><xsl:value-of select="."/></th>
                      </xsl:for-each>
                      <!--process physical memory and swap next-->      
@@ -523,6 +526,21 @@
                                     </xsl:choose>
                             </td>
                    </xsl:if>
+                   <xsl:if test="../../../../Columns/Item[text()='Roxie State']">
+                       <xsl:if test="RoxieState">
+                           <td align="center">
+                               <xsl:if test="RoxieState != '' and RoxieState != 'ok'">
+                                   <xsl:attribute name="bgcolor">#FF8800</xsl:attribute>
+                               </xsl:if>
+                               <xsl:if test="RoxieStateDetails">
+                                   <xsl:attribute name="title">
+                                       <xsl:value-of select="RoxieStateDetails"/>
+                                   </xsl:attribute>
+                               </xsl:if>
+                               <xsl:value-of select="RoxieState"/>
+                           </td> 
+                       </xsl:if>
+                   </xsl:if>
                    <xsl:if test="../../../../Columns/Item[text()='UpTime']">                    
                         <td id="uptime_{position()}">
                             <xsl:choose>
@@ -603,7 +621,7 @@
      
      <xsl:for-each select="/GetTargetClusterInfoResponse/Columns/Item">
           <xsl:variable name="text" select="text()"/>
-          <xsl:if test="$text!='Processes' and $text!='Up Time' and $text!='State' and $text!='Condition' and $text!='UpTime' and not(contains($text, 'Memory')) and not(starts-with($text, 'CPU')) and $text!='Swap'">
+          <xsl:if test="$text!='Processes' and $text!='Up Time' and $text!='State' and text()!='Roxie State' and $text!='Condition' and $text!='UpTime' and not(contains($text, 'Memory')) and not(starts-with($text, 'CPU')) and $text!='Swap'">
              <xsl:variable name="storageNode" select="$storageInfo[($OS!=0 and Description=$text) or ($OS=0 and starts-with(Description,$text))]"/>
              <xsl:choose>
                 <xsl:when test="$storageNode">

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/bs/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>Osvježite</st>
 <st id='Rows'>redovi</st>
 <st id='RoxieServer'>Roxie Server</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Sasha Serveri</st>
 <st id='SchedulerProcess'>Proces za raspoređivanje poslova</st>
 <st id='SecurityString'>Sigurnosni Tekst</st>

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/en/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>Refresh</st>
 <st id='Rows'>rows</st>
 <st id='RoxieServer'>Roxie Server</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Sasha Servers</st>
 <st id='SchedulerProcess'>Scheduler Process</st>
 <st id='SecurityString'>Security String</st>

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/es/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>Actualizar</st>
 <st id='Rows'>Filas</st>
 <st id='RoxieServer'>Servidor de Roxie</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Servidores de Sasha</st>
 <st id='SchedulerProcess'>Proceso de Planeador</st>
 <st id='SecurityString'>Texto de Seguridad</st>

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/hr/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>Osvježite</st>
 <st id='Rows'>redovi</st>
 <st id='RoxieServer'>Roxie Server</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Sasha Serveri</st>
 <st id='SchedulerProcess'>Proces za raspoređivanje poslova</st>
 <st id='SecurityString'>Sigurnosni Tekst</st>

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/hu/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>Frissítés</st>
 <st id='Rows'>sorok</st>
 <st id='RoxieServer'>Roxie Szerver</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Sasha Szerverek</st>
 <st id='SchedulerProcess'>Ütemező</st>
 <st id='SecurityString'>Biztonsági karakterlánc</st>

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/sr/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>Освежите</st>
 <st id='Rows'>редови</st>
 <st id='RoxieServer'>Роxие Сервер</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Саша Сервери</st>
 <st id='SchedulerProcess'>Процес за распоређивање послова</st>
 <st id='SecurityString'>Сигурносни Текст</st>

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/zh/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>更新</st>
 <st id='Rows'>行</st>
 <st id='RoxieServer'>Roxie服务器</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Sasha服务器</st>
 <st id='SchedulerProcess'>调度器进程</st>
 <st id='SecurityString'>安全码</st>

+ 4 - 0
esp/scm/roxiecommlibscm.ecm

@@ -52,6 +52,10 @@ SCMinterface IRoxieCommunicationClient(IInterface)
 
     unsigned retrieveRoxieStateRevision();
     IPropertyTree *getRoxieBuildVersion();
+
+    IPropertyTree *sendRoxieControlAllNodes(const char *msg, bool allOrNothing);
+    IPropertyTree *sendRoxieControlAllNodes(ISocket *sock, const char *msg, bool allOrNothing, unsigned wait);
+    IPropertyTree *sendRoxieControlQuery(ISocket *sock, const char *msg, unsigned wait);
 };
 
 

+ 3 - 1
esp/scm/ws_machine.ecm

@@ -129,6 +129,8 @@ ESPstruct MachineInfoEx
    string UpTime;
    string ComponentName;
    string ComponentPath;
+   [min_ver("1.13")] string RoxieState;
+   [min_ver("1.13")] string RoxieStateDetails;
    int    OS;
    [min_ver("1.10")] int    ProcessNumber;
    ESParray<ESPstruct ProcessorInfo> Processors;
@@ -308,7 +310,7 @@ ESPresponse [encode(0), exceptions_inline] GetTargetClusterInfoResponse
     [min_ver("1.12")] string AcceptLanguage;
 };
 //-------- service ---------
-ESPservice [version("1.12"), default_client_version("1.12")] ws_machine
+ESPservice [version("1.13"), default_client_version("1.13")] ws_machine
 {
     ESPuses ESPstruct RequestInfoStruct;
     ESPuses ESPstruct MachineInfoEx;

+ 2 - 0
esp/services/ws_machine/CMakeLists.txt

@@ -46,6 +46,7 @@ include_directories (
          ./../../../system/include 
          ./../../../common/remote 
          ./../../../common/environment
+         ./../../../common/workunit
          ./../../clients 
          ./../../../dali/base 
          ./../../bindings 
@@ -70,6 +71,7 @@ target_link_libraries ( ws_machine
          dalibase 
          environment 
          securesocket 
+         workunit
     )
 
 FOREACH ( iFILES

+ 21 - 2
esp/services/ws_machine/machines.xslt

@@ -278,6 +278,9 @@
                      <xsl:for-each select="../Columns/Item[text()='State']">
                         <th align="center"><xsl:value-of select= "$hpccStrings/st[@id='State']"/></th>
                      </xsl:for-each>
+                     <xsl:for-each select="../Columns/Item[text()='Roxie State']">
+                        <th align="center"><xsl:value-of select= "$hpccStrings/st[@id='RoxieState']"/></th>
+                     </xsl:for-each>
                      <xsl:for-each select="../Columns/Item[text()='UpTime']">
                         <th align="center"><xsl:value-of select= "$hpccStrings/st[@id='UpTime']"/></th>
                      </xsl:for-each>
@@ -291,7 +294,7 @@
                         </th>
                      </xsl:for-each>
                      <!--process disk storage next-->
-                     <xsl:for-each select="../Columns/Item[text()!='Processes' and text()!='Up Time' and not(contains(text(), 'Memory')) and not(starts-with(text(), 'CPU')) and text()!='State' and text()!='Condition' and text()!='UpTime' and text()!='Swap']">
+                     <xsl:for-each select="../Columns/Item[text()!='Processes' and text()!='Up Time' and not(contains(text(), 'Memory')) and not(starts-with(text(), 'CPU')) and text()!='State' and text()!='Roxie State' and text()!='Condition' and text()!='UpTime' and text()!='Swap']">
                         <th align="center"><xsl:value-of select="."/></th>
                      </xsl:for-each>
                      <!--process physical memory and swap next-->      
@@ -435,6 +438,21 @@
                                     </xsl:choose>
                             </td>
                    </xsl:if>
+                   <xsl:if test="../../Columns/Item[text()='Roxie State']">
+                       <xsl:if test="RoxieState">
+                           <td align="center">
+                               <xsl:if test="RoxieState != '' and RoxieState != 'ok'">
+                                   <xsl:attribute name="bgcolor">#FF8800</xsl:attribute>
+                               </xsl:if>
+                               <xsl:if test="RoxieStateDetails">
+                                   <xsl:attribute name="title">
+                                       <xsl:value-of select="RoxieStateDetails"/>
+                                   </xsl:attribute>
+                               </xsl:if>
+                               <xsl:value-of select="RoxieState"/>
+                           </td> 
+                       </xsl:if>
+                   </xsl:if>
                    <xsl:if test="../../Columns/Item[text()='UpTime']">                  
                         <td id="uptime_{position()}">
                             <xsl:choose>
@@ -515,7 +533,8 @@
      
      <xsl:for-each select="/GetMachineInfoResponse/Columns/Item">
           <xsl:variable name="text" select="text()"/>
-          <xsl:if test="$text!='Processes' and $text!='Up Time' and $text!='State' and $text!='Condition' and $text!='UpTime' and not(contains($text, 'Memory')) and not(starts-with($text, 'CPU')) and $text!='Swap'">
+          <xsl:if test="$text!='Processes' and $text!='Up Time' and $text!='State' and $text!='Roxie State' and $text!='Condition'
+             and $text!='UpTime' and not(contains($text, 'Memory')) and not(starts-with($text, 'CPU')) and $text!='Swap'">
              <xsl:variable name="storageNode" select="$storageInfo[($OS!=0 and Description=$text) or ($OS=0 and starts-with(Description,$text))]"/>
              <xsl:choose>
                 <xsl:when test="$storageNode">

+ 189 - 19
esp/services/ws_machine/ws_machineService.cpp

@@ -19,6 +19,8 @@
 #include "jarray.hpp"
 #include "dadfs.hpp"
 #include "exception_util.hpp"
+#include "workunit.hpp"
+#include "roxiecommlibscm.hpp"
 
 #ifndef eqHoleCluster
 #define eqHoleCluster  "HoleCluster"
@@ -72,6 +74,8 @@ static const int THREAD_POOL_SIZE = 40;
 static const int THREAD_POOL_STACK_SIZE = 64000;
 static const char* FEATURE_URL = "MachineInfoAccess";
 
+const unsigned ROXIECONTROLSTATETIMEOUT = 5000; //5 second
+
 class CMachineInfoThreadParam : public CWsMachineThreadParam
 {
 public:
@@ -112,6 +116,24 @@ private:
 
 Mutex CMachineInfoThreadParam::s_mutex;
 
+class CRoxieStateInfoThreadParam : public CWsMachineThreadParam
+{
+public:
+    StringAttr                      clusterName;
+    IArrayOf<IEspMachineInfoEx>&    machineInfoTable;     //For response
+
+    CRoxieStateInfoThreadParam(Cws_machineEx* pService, const char* _clusterName, IArrayOf<IEspMachineInfoEx>& _machineInfoTable)
+       : CWsMachineThreadParam(pService), clusterName(_clusterName), machineInfoTable(_machineInfoTable)
+    {
+    }
+
+    virtual void doWork()
+    {
+        m_pService->getRoxieStateInfo(this);
+    }
+};
+
+
 void Cws_machineEx::init(IPropertyTree *cfg, const char *process, const char *service)
 {
     //Read settings from esp.xml
@@ -171,10 +193,9 @@ void Cws_machineEx::init(IPropertyTree *cfg, const char *process, const char *se
     m_threadPoolStackSize = pServiceNode->getPropInt("ThreadPoolStackSize", THREAD_POOL_STACK_SIZE);
 
     //Start thread pool
-    IThreadFactory* pThreadFactory = new CWsMachineThreadFactory();
+    Owned<IThreadFactory> pThreadFactory = new CWsMachineThreadFactory();
     m_threadPool.setown(createThreadPool("WsMachine Thread Pool", pThreadFactory,
         NULL, m_threadPoolSize, 10000, m_threadPoolStackSize)); //10 sec timeout for available thread; use stack size of 2MB
-    pThreadFactory->Release();
 
     setupLegacyFilters();
 }
@@ -318,6 +339,8 @@ void Cws_machineEx::readMachineInfoRequest(IEspContext& context, bool getProcess
         }
 
         setProcessRequest(machineInfoData, uniqueProcesses, address1.str(), address2.str(), processType.str(), compName.str(), path.str(), processNumber);
+        if (strieq(processType.str(), eqRoxieServerProcess))
+            machineInfoData.appendRoxieClusters(compName.str());
     }
 }
 
@@ -656,6 +679,9 @@ void Cws_machineEx::readTargetClusterProcesses(IPropertyTree &processNode, const
         pClusterProcess = pEnvironmentRoot->queryPropTree(path.str());
         if (!pClusterProcess)
             throw MakeStringException(ECLWATCH_INTERNAL_ERROR, "Process not set for %s in environment setting.", path.str());
+
+        if (strieq(nodeType, eqRoxieCluster))
+            machineInfoData.appendRoxieClusters(process);
     }
 
     IPropertyTree *pInfo = targetClustersOut->addPropTree("Process", createPTree("Process"));
@@ -857,30 +883,174 @@ bool Cws_machineEx::isLegacyFilter(const char* processType, const char* dependen
     return false;
 }
 
-//////////////////////////////////////////////////////////////////
-// Get Machine Infomation based on Machine Infomation request   //
-//////////////////////////////////////////////////////////////////
+//The stateHashes stores different state hashes in one roxie cluster.
+//It also stores how many roxie nodes have the same state hashes.
+int Cws_machineEx::addRoxieStateHash(const char* hash, CIArrayOf<CStateHash>& stateHashes)
+{
+    if (!hash || !*hash)
+        return -1;
+    ForEachItemIn(i, stateHashes)
+    {
+        CStateHash& stateHash = stateHashes.item(i);
+        //if the 'hash' matches this 'stateHash', the matchHash() increases the count for this 'stateHash' by 1.
+        if (stateHash.matchHash(hash))
+            return i;
+    }
+    stateHashes.append(*new CStateHash(hash));
+    return stateHashes.length() - 1;
+}
 
-void Cws_machineEx::getMachineInfo(IEspContext& context, CGetMachineInfoData& machineInfoData)
+void Cws_machineEx::updateMajorRoxieStateHash(CIArrayOf<CStateHash>& stateHashes, CIArrayOf<CRoxieStateData>& roxieStates)
 {
-    UnsignedArray threadHandles;
-    CIArrayOf<CMachineData>& machines = machineInfoData.getMachineData();
-    ForEachItemIn(idx, machines)
+    if (stateHashes.length() < 2)
+        return;
+
+    //Find out which state hash is for the most of the roxie nodes inside this roxie cluster.
+    unsigned majorHash = 0;
+    unsigned majorHashCount = 0;
+    ForEachItemIn(i, stateHashes)
+    {
+        unsigned hashCount = stateHashes.item(i).getCount();
+        if (majorHashCount >= hashCount)
+            continue;
+        majorHashCount = hashCount;
+        majorHash = i;
+    }
+
+    //Set the MajorHash to false if the roxie node has a different hash.
+    ForEachItemIn(ii, roxieStates)
+    {
+        CRoxieStateData& roxieState = roxieStates.item(ii);
+        if (roxieState.getHashID() != majorHash)
+            roxieState.setMajorHash(false);
+    }
+}
+
+void Cws_machineEx::readRoxieStatus(const Owned<IPropertyTree> controlResp, CIArrayOf<CRoxieStateData>& roxieStates)
+{
+    CIArrayOf<CStateHash> stateHashes;
+    Owned<IPropertyTreeIterator> roxieEndpoints = controlResp->getElements("Endpoint");
+    ForEach(*roxieEndpoints)
+    {
+        IPropertyTree& roxieEndpoint = roxieEndpoints->query();
+        const char *ep = roxieEndpoint.queryProp("@ep");
+        if (!ep || !*ep)
+            continue;
+
+        bool ok = false, attached = false, detached = false;
+        const char *status = roxieEndpoint.queryProp("Status");
+        if (status && strieq(status, "ok"))
+            ok = true;
+        const char *stateHash = roxieEndpoint.queryProp("State/@hash");
+        if (roxieEndpoint.hasProp("Dali/@connected"))
+        {
+            if (roxieEndpoint.getPropBool("Dali/@connected"))
+                attached = true;
+            else
+                detached = true;
+        }
+
+        StringArray locations;
+        locations.appendListUniq(ep, ":");
+        Owned<CRoxieStateData> roxieState = new CRoxieStateData(locations.item(0), addRoxieStateHash(stateHash, stateHashes));
+        roxieState->setState(ok, attached, detached, stateHash);
+        roxieStates.append(*roxieState.getClear());
+    }
+    updateMajorRoxieStateHash(stateHashes, roxieStates);
+}
+
+void Cws_machineEx::getRoxieStateInfo(CRoxieStateInfoThreadParam* param)
+{
+    const char* clusterName = param->clusterName.get();
+    if (!clusterName || !*clusterName)
+        throw MakeStringException(ECLWATCH_MISSING_PARAMS, "Roxie cluster not specified.");
+
+    SocketEndpointArray servers;
+    getRoxieProcessServers(clusterName, servers);
+    if (!servers.length())
+        throw MakeStringException(ECLWATCH_CANNOT_GET_ENV_INFO, "Roxie Process server not found.");
+
+    Owned<IRoxieCommunicationClient> roxieClient = createRoxieCommunicationClient(servers.item(0), ROXIECONTROLSTATETIMEOUT);
+    Owned<IPropertyTree> controlResp = roxieClient->sendRoxieControlAllNodes("<control:state/>", true);
+    if (!controlResp)
+        throw MakeStringException(ECLWATCH_INTERNAL_ERROR, "Failed to get control response from roxie %s.", clusterName);
+
+    CIArrayOf<CRoxieStateData> roxieStates;
+    readRoxieStatus(controlResp, roxieStates);
+
+    ForEachItemIn(i, param->machineInfoTable)
     {
-        Owned<CMachineInfoThreadParam> pThreadReq = new CMachineInfoThreadParam( this, context, machineInfoData.getOptions(),
-            machines.item(idx), machineInfoData.getMachineInfoTable(), machineInfoData.getMachineInfoColumns());
-        PooledThreadHandle handle = m_threadPool->start( pThreadReq.getClear() );
-        threadHandles.append(handle);
+        IEspMachineInfoEx& machineInfo = param->machineInfoTable.item(i);
+        if (!streq(machineInfo.getProcessType(), eqRoxieServerProcess) || !streq(machineInfo.getComponentName(), clusterName))
+            continue;
+
+        //This method is thread safe because each machineInfo (for one roxie node) belongs to only one Roxie cluster.
+        //It is impossible for different threads to update the same machineInfo.
+        bool foundRoxieState = false;
+        ForEachItemIn(ii, roxieStates)
+        {
+            CRoxieStateData& roxieState = roxieStates.item(ii);
+            if (!roxieState.matchIPAddress(machineInfo.getAddress()))
+                continue;
+
+            StringBuffer state, stateDetails;
+            roxieState.reportState(state, stateDetails);
+            machineInfo.setRoxieState(state.str());
+            machineInfo.setRoxieStateDetails(stateDetails.str());
+            foundRoxieState = true;
+        }
+        if (!foundRoxieState)
+        {
+            machineInfo.setRoxieState("??");
+            machineInfo.setRoxieStateDetails("Roxie state not found");
+        }
     }
+}
 
-    //block for worker theads to finish, if necessary and then collect results
-    PooledThreadHandle* pThreadHandle = threadHandles.getArray();
-    unsigned i=threadHandles.ordinality();
-    while (i--)
+void Cws_machineEx::getMachineInfo(IEspContext& context, bool getRoxieState, CGetMachineInfoData& machineInfoData)
+{
+    UnsignedArray threadHandles;
+    if (!getRoxieState)
     {
-        m_threadPool->join(*pThreadHandle);
-        pThreadHandle++;
+        CIArrayOf<CMachineData>& machines = machineInfoData.getMachineData();
+        ForEachItemIn(idx, machines)
+        {
+            Owned<CMachineInfoThreadParam> pThreadReq = new CMachineInfoThreadParam(this, context, machineInfoData.getOptions(),
+                machines.item(idx), machineInfoData.getMachineInfoTable(), machineInfoData.getMachineInfoColumns());
+            PooledThreadHandle handle = m_threadPool->start( pThreadReq.getClear());
+            threadHandles.append(handle);
+        }
     }
+    else
+    {
+        StringArray& roxieClusters = machineInfoData.getRoxieClusters();
+        ForEachItemIn(i, roxieClusters)
+        {
+            Owned<CRoxieStateInfoThreadParam> pThreadReq = new CRoxieStateInfoThreadParam(this, roxieClusters.item(i),
+                machineInfoData.getMachineInfoTable());
+            PooledThreadHandle handle = m_threadPool->start( pThreadReq.getClear());
+            threadHandles.append(handle);
+        }
+        machineInfoData.getMachineInfoColumns().append("Roxie State");
+    }
+
+    //Block for worker threads to finish, if necessary and then collect results
+    //Not use joinAll() because multiple threads may call this method. Each call uses the pool to create
+    //its own threads of checking query state. Each call should only join the ones created by that call.
+    ForEachItemIn(i, threadHandles)
+        m_threadPool->join(threadHandles.item(i));
+}
+
+////////////////////////////////////////////////////////////////////
+// Get Machine Information based on Machine Information request   //
+////////////////////////////////////////////////////////////////////
+
+void Cws_machineEx::getMachineInfo(IEspContext& context, CGetMachineInfoData& machineInfoData)
+{
+    double version = context.getClientVersion();
+    getMachineInfo(context, false, machineInfoData);
+    if ((version >= 1.13) && !machineInfoData.getRoxieClusters().empty())
+        getMachineInfo(context, true, machineInfoData);
 }
 
 // the following method is invoked on worker threads of CMachineInfoThreadParam

+ 108 - 0
esp/services/ws_machine/ws_machineService.hpp

@@ -26,6 +26,7 @@
 #include <map>
 
 class CMachineInfoThreadParam;
+class CRoxieStateInfoThreadParam;
 class CMetricsThreadParam;
 class CRemoteExecThreadParam;
 
@@ -364,6 +365,88 @@ public:
     }
 };
 
+class CStateHash : public CInterface
+{
+    BoolHash   hash;
+    unsigned   count;
+public:
+    IMPLEMENT_IINTERFACE;
+
+    CStateHash(const char* _hash) : hash(_hash), count(1)
+    {
+        hash.setValue(_hash, true);
+    };
+
+    bool matchHash(const char* _hash)
+    {
+        bool match = hash.getValue(_hash);
+        if (match)
+            count++;
+        return match;
+    }
+
+    unsigned getCount() { return count; }
+};
+
+class CRoxieStateData : public CInterface
+{
+    BoolHash   ipAddress;
+    StringAttr hash;
+    int        hashID; //the position inside cluster's state hash list - used to set the majorHash flag in updateMajorRoxieStateHash().
+    bool       majorHash; //whether its state hash is the same as the most of other roxie cluster nodes or not.
+    bool       ok;
+    bool       attached;
+    bool       detached;
+public:
+    IMPLEMENT_IINTERFACE;
+
+    CRoxieStateData(const char* _ipAddress, int _hashID) : hashID(_hashID), majorHash(true), ok(false), attached(false), detached(false)
+    {
+        ipAddress.setValue(_ipAddress, true);
+    };
+
+    bool matchIPAddress(const char* _ipAddress) { return ipAddress.getValue(_ipAddress); }
+    int getHashID() { return hashID; }
+    void setMajorHash(bool _majorHash) { majorHash = _majorHash; }
+
+    void setState(bool _ok, bool _attached, bool _detached, const char* _hash)
+    {
+        ok = _ok;
+        attached = _attached;
+        detached = _detached;
+        hash.set(_hash);
+    }
+
+    void reportState(StringBuffer& state, StringBuffer& stateDetails)
+    {
+        if (!ok)
+            state.set("Node State: not ok ...");
+        else if (!hash || !*hash)
+            state.set("empty state hash ...");
+        else if (!majorHash)
+            state.set("State hash mismatch ...");
+        else if (!attached)
+            state.set("Not attached to DALI ...");
+        else
+            state.set("ok");
+
+        if (ok)
+            stateDetails.appendf("Node State: ok\n");
+        else
+            stateDetails.appendf("Node State: not ok\n");
+        if (!hash || !*hash)
+            stateDetails.appendf("This node had an empty hash\n");
+        else
+            stateDetails.appendf("State hash: %s\n", hash.get());
+        if (!majorHash)
+            stateDetails.appendf("State hash: mismatch\n");
+        if (attached)
+            stateDetails.appendf("This node attached to DALI\n");
+        if (detached)
+            stateDetails.appendf("This node detached from DALI\n");
+    }
+};
+
 class CMachineData  : public CInterface
 {
     char         m_pathSep;
@@ -598,6 +681,9 @@ class CGetMachineInfoData
     IArrayOf<IEspMachineInfoEx>     m_machineInfoTable;
     StringArray                     m_machineInfoColumns;
 
+    StringArray                     roxieClusters;
+    BoolHash                        uniqueRoxieClusters;
+
 public:
     CGetMachineInfoUserOptions& getOptions()
     {
@@ -618,6 +704,20 @@ public:
     {
         return m_machineInfoColumns;
     }
+
+    StringArray& getRoxieClusters()
+    {
+        return roxieClusters;
+    }
+
+    void appendRoxieClusters(const char* clusterName)
+    {
+        if (uniqueRoxieClusters.getValue(clusterName))
+            return;
+
+        roxieClusters.append(clusterName);
+        uniqueRoxieClusters.setValue(clusterName, true);
+    }
 };
 
 //---------------------------------------------------------------------------------------------
@@ -639,6 +739,7 @@ public:
     bool onStartStopBegin( IEspContext &context, IEspStartStopBeginRequest &req,  IEspStartStopBeginResponse &resp);
     bool onStartStopDone( IEspContext &context, IEspStartStopDoneRequest &req,  IEspStartStopResponse &resp);
 
+    void getRoxieStateInfo(CRoxieStateInfoThreadParam* param);
     void doGetMachineInfo(IEspContext& context, CMachineInfoThreadParam* pReq);
     void doGetMetrics(CMetricsThreadParam* pParam);
     bool doStartStop(IEspContext &context, StringArray& addresses, char* userName, char* password, bool bStop, IEspStartStopResponse &resp);
@@ -670,6 +771,7 @@ private:
 
     void readMachineInfoRequest(IEspContext& context, bool getProcessorInfo, bool getStorageInfo, bool localFileSystemsOnly, bool getSwInfo, bool applyProcessFilter, StringArray& addresses, const char* addProcessesToFilters, CGetMachineInfoData& machineInfoData);
     void readMachineInfoRequest(IEspContext& context, bool getProcessorInfo, bool getStorageInfo, bool localFileSystemsOnly, bool getSwInfo, bool applyProcessFilter, const char* addProcessesToFilters, StringArray& targetClustersIn, CGetMachineInfoData& machineInfoData, IPropertyTree* targetClustersOut);
+    void getMachineInfo(IEspContext& context, bool getRoxieState, CGetMachineInfoData& machineInfoData);
     void getMachineInfo(IEspContext& context, CGetMachineInfoData& machineInfoData);
     void setMachineInfoResponse(IEspContext& context, IEspGetMachineInfoRequest& req, CGetMachineInfoData& machineInfoData, IEspGetMachineInfoResponse& resp);
     void setTargetClusterInfoResponse(IEspContext& context, IEspGetTargetClusterInfoRequest& req, CGetMachineInfoData& machineInfoData, IPropertyTree* targetClusterTree, IEspGetTargetClusterInfoResponse& resp);
@@ -696,6 +798,9 @@ private:
     void doPostProcessing(CFieldInfoMap& myfieldInfoMap, CFieldMap&  myfieldMap);
     void processValue(const char *oid, const char *value, const bool bShow, CFieldInfoMap& myfieldInfoMap, CFieldMap&  myfieldMap);
     void addIpAddressesToBuffer( void** buffer, unsigned& count, const char* address);
+    void readRoxieStatus(const Owned<IPropertyTree> controlResp, CIArrayOf<CRoxieStateData>& roxieStates);
+    int addRoxieStateHash(const char* hash, CIArrayOf<CStateHash>& stateHashes);
+    void updateMajorRoxieStateHash(CIArrayOf<CStateHash>& stateHashes, CIArrayOf<CRoxieStateData>& roxieStates);
     StringBuffer& getAcceptLanguage(IEspContext& context, StringBuffer& acceptLanguage);
 
     //Still used in StartStop/Rexec, so keep them for now.
@@ -737,6 +842,9 @@ public:
 
 protected:
 
+    CWsMachineThreadParam(Cws_machineEx* pService) : m_pService(pService)
+    {
+    }
     CWsMachineThreadParam( const char* pszAddress,
                           const char* pszSecurityString, Cws_machineEx* pService)
         : m_sAddress(pszAddress), m_sSecurityString(pszSecurityString), m_pService(pService)