Browse Source

HPCC-11958 Add Roxie State check to preflight

The similar logic in ecl-roxie is used by this fix to
retrieve and format information about roxie state (hash).
The information is displayed inside ECLwatch preflight
page (a new column 'Roxie State'). WsMachine thread pool
is used to improve the performance of retrieving the
information.

Signed-off-by: wangkx <kevin.wang@lexisnexis.com>
wangkx 11 years ago
parent
commit
cac98b14e3

+ 86 - 0
common/roxiecommlib/roxiecommunicationclient.cpp

@@ -140,6 +140,52 @@ protected:
         return sendRoxieControlQuery(xml, true);
     }
 
+    bool sendRoxieControlLock(ISocket *sock, bool allOrNothing, unsigned wait)
+    {
+        Owned<IPropertyTree> resp = sendRoxieControlQuery(sock, "<control:lock/>", wait);
+        if (allOrNothing)
+        {
+            int lockCount = resp->getPropInt("Lock", 0);
+            int serverCount = resp->getPropInt("NumServers", 0);
+            return (lockCount && (lockCount == serverCount));
+        }
+
+        return resp->getPropInt("Lock", 0) != 0;
+    }
+
+    void checkRoxieControlExceptions(IPropertyTree *msg)
+    {
+        Owned<IMultiException> me = MakeMultiException();
+        Owned<IPropertyTreeIterator> endpoints = msg->getElements("Endpoint");
+        ForEach(*endpoints)
+        {
+            IPropertyTree &endp = endpoints->query();
+            Owned<IPropertyTreeIterator> exceptions = endp.getElements("Exception");
+            ForEach (*exceptions)
+            {
+                IPropertyTree &ex = exceptions->query();
+                me->append(*MakeStringException(ex.getPropInt("Code"), "Endpoint %s: %s", endp.queryProp("@ep"), ex.queryProp("Message")));
+            }
+        }
+        if (me->ordinality())
+            throw me.getClear();
+    }
+
+    unsigned waitMsToSeconds(unsigned wait)
+    {
+        if (wait==0 || wait==(unsigned)-1)
+            return wait;
+        return wait/1000;
+    }
+
+    unsigned remainingMsWait(unsigned wait, unsigned start)
+    {
+        if (wait==0 || wait==(unsigned)-1)
+            return wait;
+        unsigned waited = msTick()-start;
+        return (wait>waited) ? wait-waited : 0;
+    }
+
     const char* buildRoxieName(const char* orig_name, StringBuffer& roxiename)
     {
         const char *last_dot = strrchr(orig_name, '.');
@@ -265,6 +311,46 @@ public:
         }
     }
 
+    IPropertyTree *sendRoxieControlAllNodes(const char *msg, bool allOrNothing)
+    {
+        Owned<ISocket> sock = ISocket::connect_timeout(ep, roxieTimeout);
+        return sendRoxieControlAllNodes(sock, msg, allOrNothing, roxieTimeout);
+    }
+
+    IPropertyTree *sendRoxieControlAllNodes(ISocket *sock, const char *msg, bool allOrNothing, unsigned wait)
+    {
+        unsigned start = msTick();
+        if (!sendRoxieControlLock(sock, allOrNothing, wait))
+            throw MakeStringException(-1, "Roxie control:lock failed");
+        return sendRoxieControlQuery(sock, msg, remainingMsWait(wait, start));
+    }
+
+    IPropertyTree *sendRoxieControlQuery(ISocket *sock, const char *msg, unsigned wait)
+    {
+        size32_t msglen = strlen(msg);
+        size32_t len = msglen;
+        _WINREV(len);
+        sock->write(&len, sizeof(len));
+        sock->write(msg, msglen);
+
+        StringBuffer resp;
+        loop
+        {
+            sock->read(&len, sizeof(len));
+            if (!len)
+                break;
+            _WINREV(len);
+            size32_t size_read;
+            sock->read(resp.reserveTruncate(len), len, len, size_read, waitMsToSeconds(wait));
+            if (size_read<len)
+                throw MakeStringException(-1, "Error reading roxie control message response");
+        }
+
+        Owned<IPropertyTree> ret = createPTreeFromXMLString(resp.str());
+        checkRoxieControlExceptions(ret);
+        return ret.getClear();
+    }
+
     IPropertyTree * retrieveQuery(const char *id)
     {
         StringBuffer xpath;

+ 20 - 2
esp/eclwatch/ws_XSLT/clusterprocesses.xslt

@@ -391,6 +391,9 @@
                      <xsl:for-each select="../../../Columns/Item[text()='State']">
                         <th align="center"><xsl:value-of select= "$hpccStrings/st[@id='State']"/></th>
                      </xsl:for-each>
+                     <xsl:for-each select="../../../Columns/Item[text()='Roxie State']">
+                        <th align="center"><xsl:value-of select= "$hpccStrings/st[@id='RoxieState']"/></th>
+                     </xsl:for-each>
                      <xsl:for-each select="../../../Columns/Item[text()='UpTime']">
                         <th align="center"><xsl:value-of select= "$hpccStrings/st[@id='UpTime']"/></th>
                      </xsl:for-each>
@@ -404,7 +407,7 @@
                         </th>
                      </xsl:for-each>
                      <!--process disk storage next-->
-                     <xsl:for-each select="../../../Columns/Item[text()!='Processes' and text()!='Up Time' and not(contains(text(), 'Memory')) and not(starts-with(text(), 'CPU')) and text()!='State' and text()!='Condition' and text()!='UpTime' and text()!='Swap']">
+                     <xsl:for-each select="../../../Columns/Item[text()!='Processes' and text()!='Up Time' and not(contains(text(), 'Memory')) and not(starts-with(text(), 'CPU')) and text()!='State' and text()!='Roxie State' and text()!='Condition' and text()!='UpTime' and text()!='Swap']">
                         <th align="center"><xsl:value-of select="."/></th>
                      </xsl:for-each>
                      <!--process physical memory and swap next-->      
@@ -523,6 +526,21 @@
                                     </xsl:choose>
                             </td>
                    </xsl:if>
+                   <xsl:if test="../../../../Columns/Item[text()='Roxie State']">
+                       <xsl:if test="RoxieState">
+                           <td align="center">
+                               <xsl:if test="RoxieState != '' and RoxieState != 'ok'">
+                                   <xsl:attribute name="bgcolor">#FF8800</xsl:attribute>
+                               </xsl:if>
+                               <xsl:if test="RoxieStateDetails">
+                                   <xsl:attribute name="title">
+                                       <xsl:value-of select="RoxieStateDetails"/>
+                                   </xsl:attribute>
+                               </xsl:if>
+                               <xsl:value-of select="RoxieState"/>
+                           </td> 
+                       </xsl:if>
+                   </xsl:if>
                    <xsl:if test="../../../../Columns/Item[text()='UpTime']">                    
                         <td id="uptime_{position()}">
                             <xsl:choose>
@@ -603,7 +621,7 @@
      
      <xsl:for-each select="/GetTargetClusterInfoResponse/Columns/Item">
           <xsl:variable name="text" select="text()"/>
-          <xsl:if test="$text!='Processes' and $text!='Up Time' and $text!='State' and $text!='Condition' and $text!='UpTime' and not(contains($text, 'Memory')) and not(starts-with($text, 'CPU')) and $text!='Swap'">
+          <xsl:if test="$text!='Processes' and $text!='Up Time' and $text!='State' and text()!='Roxie State' and $text!='Condition' and $text!='UpTime' and not(contains($text, 'Memory')) and not(starts-with($text, 'CPU')) and $text!='Swap'">
              <xsl:variable name="storageNode" select="$storageInfo[($OS!=0 and Description=$text) or ($OS=0 and starts-with(Description,$text))]"/>
              <xsl:choose>
                 <xsl:when test="$storageNode">

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/bs/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>Osvježite</st>
 <st id='Rows'>redovi</st>
 <st id='RoxieServer'>Roxie Server</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Sasha Serveri</st>
 <st id='SchedulerProcess'>Proces za raspoređivanje poslova</st>
 <st id='SecurityString'>Sigurnosni Tekst</st>

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/en/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>Refresh</st>
 <st id='Rows'>rows</st>
 <st id='RoxieServer'>Roxie Server</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Sasha Servers</st>
 <st id='SchedulerProcess'>Scheduler Process</st>
 <st id='SecurityString'>Security String</st>

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/es/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>Actualizar</st>
 <st id='Rows'>Filas</st>
 <st id='RoxieServer'>Servidor de Roxie</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Servidores de Sasha</st>
 <st id='SchedulerProcess'>Proceso de Planeador</st>
 <st id='SecurityString'>Texto de Seguridad</st>

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/hr/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>Osvježite</st>
 <st id='Rows'>redovi</st>
 <st id='RoxieServer'>Roxie Server</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Sasha Serveri</st>
 <st id='SchedulerProcess'>Proces za raspoređivanje poslova</st>
 <st id='SecurityString'>Sigurnosni Tekst</st>

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/hu/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>Frissítés</st>
 <st id='Rows'>sorok</st>
 <st id='RoxieServer'>Roxie Szerver</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Sasha Szerverek</st>
 <st id='SchedulerProcess'>Ütemező</st>
 <st id='SecurityString'>Biztonsági karakterlánc</st>

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/sr/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>Освежите</st>
 <st id='Rows'>редови</st>
 <st id='RoxieServer'>Роxие Сервер</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Саша Сервери</st>
 <st id='SchedulerProcess'>Процес за распоређивање послова</st>
 <st id='SecurityString'>Сигурносни Текст</st>

+ 1 - 0
esp/eclwatch/ws_XSLT/nls/zh/hpcc.xml

@@ -110,6 +110,7 @@
 <st id='Refresh'>更新</st>
 <st id='Rows'>行</st>
 <st id='RoxieServer'>Roxie服务器</st>
+<st id='RoxieState'>Roxie State</st>
 <st id='SashaServers'>Sasha服务器</st>
 <st id='SchedulerProcess'>调度器进程</st>
 <st id='SecurityString'>安全码</st>

+ 4 - 0
esp/scm/roxiecommlibscm.ecm

@@ -52,6 +52,10 @@ SCMinterface IRoxieCommunicationClient(IInterface)
 
     unsigned retrieveRoxieStateRevision();
     IPropertyTree *getRoxieBuildVersion();
+
+    IPropertyTree *sendRoxieControlAllNodes(const char *msg, bool allOrNothing);
+    IPropertyTree *sendRoxieControlAllNodes(ISocket *sock, const char *msg, bool allOrNothing, unsigned wait);
+    IPropertyTree *sendRoxieControlQuery(ISocket *sock, const char *msg, unsigned wait);
 };
 
 

+ 3 - 1
esp/scm/ws_machine.ecm

@@ -129,6 +129,8 @@ ESPstruct MachineInfoEx
    string UpTime;
    string ComponentName;
    string ComponentPath;
+   [min_ver("1.13")] string RoxieState;
+   [min_ver("1.13")] string RoxieStateDetails;
    int    OS;
    [min_ver("1.10")] int    ProcessNumber;
    ESParray<ESPstruct ProcessorInfo> Processors;
@@ -308,7 +310,7 @@ ESPresponse [encode(0), exceptions_inline] GetTargetClusterInfoResponse
     [min_ver("1.12")] string AcceptLanguage;
 };
 //-------- service ---------
-ESPservice [version("1.12"), default_client_version("1.12")] ws_machine
+ESPservice [version("1.13"), default_client_version("1.13")] ws_machine
 {
     ESPuses ESPstruct RequestInfoStruct;
     ESPuses ESPstruct MachineInfoEx;

+ 2 - 0
esp/services/ws_machine/CMakeLists.txt

@@ -46,6 +46,7 @@ include_directories (
          ./../../../system/include 
          ./../../../common/remote 
          ./../../../common/environment
+         ./../../../common/workunit
          ./../../clients 
          ./../../../dali/base 
          ./../../bindings 
@@ -70,6 +71,7 @@ target_link_libraries ( ws_machine
          dalibase 
          environment 
          securesocket 
+         workunit
     )
 
 FOREACH ( iFILES

+ 21 - 2
esp/services/ws_machine/machines.xslt

@@ -278,6 +278,9 @@
                      <xsl:for-each select="../Columns/Item[text()='State']">
                         <th align="center"><xsl:value-of select= "$hpccStrings/st[@id='State']"/></th>
                      </xsl:for-each>
+                     <xsl:for-each select="../Columns/Item[text()='Roxie State']">
+                        <th align="center"><xsl:value-of select= "$hpccStrings/st[@id='RoxieState']"/></th>
+                     </xsl:for-each>
                      <xsl:for-each select="../Columns/Item[text()='UpTime']">
                         <th align="center"><xsl:value-of select= "$hpccStrings/st[@id='UpTime']"/></th>
                      </xsl:for-each>
@@ -291,7 +294,7 @@
                         </th>
                      </xsl:for-each>
                      <!--process disk storage next-->
-                     <xsl:for-each select="../Columns/Item[text()!='Processes' and text()!='Up Time' and not(contains(text(), 'Memory')) and not(starts-with(text(), 'CPU')) and text()!='State' and text()!='Condition' and text()!='UpTime' and text()!='Swap']">
+                     <xsl:for-each select="../Columns/Item[text()!='Processes' and text()!='Up Time' and not(contains(text(), 'Memory')) and not(starts-with(text(), 'CPU')) and text()!='State' and text()!='Roxie State' and text()!='Condition' and text()!='UpTime' and text()!='Swap']">
                         <th align="center"><xsl:value-of select="."/></th>
                      </xsl:for-each>
                      <!--process physical memory and swap next-->      
@@ -435,6 +438,21 @@
                                     </xsl:choose>
                             </td>
                    </xsl:if>
+                   <xsl:if test="../../Columns/Item[text()='Roxie State']">
+                       <xsl:if test="RoxieState">
+                           <td align="center">
+                               <xsl:if test="RoxieState != '' and RoxieState != 'ok'">
+                                   <xsl:attribute name="bgcolor">#FF8800</xsl:attribute>
+                               </xsl:if>
+                               <xsl:if test="RoxieStateDetails">
+                                   <xsl:attribute name="title">
+                                       <xsl:value-of select="RoxieStateDetails"/>
+                                   </xsl:attribute>
+                               </xsl:if>
+                               <xsl:value-of select="RoxieState"/>
+                           </td> 
+                       </xsl:if>
+                   </xsl:if>
                    <xsl:if test="../../Columns/Item[text()='UpTime']">                  
                         <td id="uptime_{position()}">
                             <xsl:choose>
@@ -515,7 +533,8 @@
      
      <xsl:for-each select="/GetMachineInfoResponse/Columns/Item">
           <xsl:variable name="text" select="text()"/>
-          <xsl:if test="$text!='Processes' and $text!='Up Time' and $text!='State' and $text!='Condition' and $text!='UpTime' and not(contains($text, 'Memory')) and not(starts-with($text, 'CPU')) and $text!='Swap'">
+          <xsl:if test="$text!='Processes' and $text!='Up Time' and $text!='State' and $text!='Roxie State' and $text!='Condition'
+             and $text!='UpTime' and not(contains($text, 'Memory')) and not(starts-with($text, 'CPU')) and $text!='Swap'">
              <xsl:variable name="storageNode" select="$storageInfo[($OS!=0 and Description=$text) or ($OS=0 and starts-with(Description,$text))]"/>
              <xsl:choose>
                 <xsl:when test="$storageNode">

+ 190 - 19
esp/services/ws_machine/ws_machineService.cpp

@@ -19,6 +19,8 @@
 #include "jarray.hpp"
 #include "dadfs.hpp"
 #include "exception_util.hpp"
+#include "workunit.hpp"
+#include "roxiecommlibscm.hpp"
 
 #ifndef eqHoleCluster
 #define eqHoleCluster  "HoleCluster"
@@ -72,6 +74,8 @@ static const int THREAD_POOL_SIZE = 40;
 static const int THREAD_POOL_STACK_SIZE = 64000;
 static const char* FEATURE_URL = "MachineInfoAccess";
 
+const unsigned ROXIECONTROLSTATETIMEOUT = 5000; //5 second
+
 class CMachineInfoThreadParam : public CWsMachineThreadParam
 {
 public:
@@ -112,6 +116,25 @@ private:
 
 Mutex CMachineInfoThreadParam::s_mutex;
 
+class CRoxieStateInfoThreadParam : public CWsMachineThreadParam
+{
+public:
+    IEspContext&                    context;
+    StringAttr                      clusterName;
+    IArrayOf<IEspMachineInfoEx>&    machineInfoTable;     //For response
+
+    CRoxieStateInfoThreadParam(Cws_machineEx* pService, IEspContext& _context, const char* _clusterName, IArrayOf<IEspMachineInfoEx>& _machineInfoTable)
+       : CWsMachineThreadParam(pService), context(_context), clusterName(_clusterName), machineInfoTable(_machineInfoTable)
+    {
+    }
+
+    virtual void doWork()
+    {
+        m_pService->getRoxieStateInfo(context, this);
+    }
+};
+
+
 void Cws_machineEx::init(IPropertyTree *cfg, const char *process, const char *service)
 {
     //Read settings from esp.xml
@@ -171,10 +194,9 @@ void Cws_machineEx::init(IPropertyTree *cfg, const char *process, const char *se
     m_threadPoolStackSize = pServiceNode->getPropInt("ThreadPoolStackSize", THREAD_POOL_STACK_SIZE);
 
     //Start thread pool
-    IThreadFactory* pThreadFactory = new CWsMachineThreadFactory();
+    Owned<IThreadFactory> pThreadFactory = new CWsMachineThreadFactory();
     m_threadPool.setown(createThreadPool("WsMachine Thread Pool", pThreadFactory,
         NULL, m_threadPoolSize, 10000, m_threadPoolStackSize)); //10 sec timeout for available thread; use stack size of 2MB
-    pThreadFactory->Release();
 
     setupLegacyFilters();
 }
@@ -318,6 +340,8 @@ void Cws_machineEx::readMachineInfoRequest(IEspContext& context, bool getProcess
         }
 
         setProcessRequest(machineInfoData, uniqueProcesses, address1.str(), address2.str(), processType.str(), compName.str(), path.str(), processNumber);
+        if (strieq(processType.str(), eqRoxieServerProcess))
+            machineInfoData.appendRoxieClusters(compName.str());
     }
 }
 
@@ -656,6 +680,9 @@ void Cws_machineEx::readTargetClusterProcesses(IPropertyTree &processNode, const
         pClusterProcess = pEnvironmentRoot->queryPropTree(path.str());
         if (!pClusterProcess)
             throw MakeStringException(ECLWATCH_INTERNAL_ERROR, "Process not set for %s in environment setting.", path.str());
+
+        if (strieq(nodeType, eqRoxieCluster))
+            machineInfoData.appendRoxieClusters(process);
     }
 
     IPropertyTree *pInfo = targetClustersOut->addPropTree("Process", createPTree("Process"));
@@ -857,30 +884,174 @@ bool Cws_machineEx::isLegacyFilter(const char* processType, const char* dependen
     return false;
 }
 
-//////////////////////////////////////////////////////////////////
-// Get Machine Infomation based on Machine Infomation request   //
-//////////////////////////////////////////////////////////////////
+//The stateHashes stores different state hashes in one roxie cluster.
+//It also stores how many roxie nodes have the same state hashes.
+int Cws_machineEx::addRoxieStateHash(const char* hash, CIArrayOf<CStateHash>& stateHashes)
+{
+    if (!hash || !*hash)
+        return -1;
+    ForEachItemIn(i, stateHashes)
+    {
+        CStateHash& stateHash = stateHashes.item(i);
+        //if the 'hash' matches this 'stateHash', the matchHash() increases the count for this 'stateHash' by 1.
+        if (stateHash.matchHash(hash))
+            return i;
+    }
+    stateHashes.append(*new CStateHash(hash));
+    return stateHashes.length() - 1;
+}
 
-void Cws_machineEx::getMachineInfo(IEspContext& context, CGetMachineInfoData& machineInfoData)
+void Cws_machineEx::updateMajorRoxieStateHash(CIArrayOf<CStateHash>& stateHashes, CIArrayOf<CRoxieStateData>& roxieStates)
 {
-    UnsignedArray threadHandles;
-    CIArrayOf<CMachineData>& machines = machineInfoData.getMachineData();
-    ForEachItemIn(idx, machines)
+    if (stateHashes.length() < 2)
+        return;
+
+    //Find out which state hash is for the most of the roxie nodes inside this roxie cluster.
+    unsigned majorHash = 0;
+    unsigned majorHashCount = 0;
+    ForEachItemIn(i, stateHashes)
+    {
+        unsigned hashCount = stateHashes.item(i).getCount();
+        if (majorHashCount >= hashCount)
+            continue;
+        majorHashCount = hashCount;
+        majorHash = i;
+    }
+
+    //Set the MajorHash to false if the roxie node has a different hash.
+    ForEachItemIn(ii, roxieStates)
+    {
+        CRoxieStateData& roxieState = roxieStates.item(ii);
+        if (roxieState.getHashID() != majorHash)
+            roxieState.setMajorHash(false);
+    }
+}
+
+void Cws_machineEx::readRoxieStatus(const Owned<IPropertyTree> controlResp, CIArrayOf<CRoxieStateData>& roxieStates)
+{
+    CIArrayOf<CStateHash> stateHashes;
+    Owned<IPropertyTreeIterator> roxieEndpoints = controlResp->getElements("Endpoint");
+    ForEach(*roxieEndpoints)
+    {
+        IPropertyTree& roxieEndpoint = roxieEndpoints->query();
+        const char *ep = roxieEndpoint.queryProp("@ep");
+        if (!ep || !*ep)
+            continue;
+
+        bool ok = false, attached = false, detached = false;
+        const char *status = roxieEndpoint.queryProp("Status");
+        if (status && strieq(status, "ok"))
+            ok = true;
+        const char *stateHash = roxieEndpoint.queryProp("State/@hash");
+        if (roxieEndpoint.hasProp("Dali/@connected"))
+        {
+            if (roxieEndpoint.getPropBool("Dali/@connected"))
+                attached = true;
+            else
+                detached = true;
+        }
+
+        StringArray locations;
+        locations.appendListUniq(ep, ":");
+        Owned<CRoxieStateData> roxieState = new CRoxieStateData(locations.item(0), addRoxieStateHash(stateHash, stateHashes));
+        roxieState->setState(ok, attached, detached, stateHash);
+        roxieStates.append(*roxieState.getClear());
+    }
+    updateMajorRoxieStateHash(stateHashes, roxieStates);
+}
+
+void Cws_machineEx::getRoxieStateInfo(IEspContext& context, CRoxieStateInfoThreadParam* param)
+{
+    const char* clusterName = param->clusterName.get();
+    if (!clusterName || !*clusterName)
+        throw MakeStringException(ECLWATCH_MISSING_PARAMS, "Roxie cluster not specified.");
+
+    SocketEndpointArray servers;
+    getRoxieProcessServers(clusterName, servers);
+    if (!servers.length())
+        throw MakeStringException(ECLWATCH_CANNOT_GET_ENV_INFO, "Roxie Process server not found.");
+
+    Owned<IRoxieCommunicationClient> roxieClient = createRoxieCommunicationClient(servers.item(0), ROXIECONTROLSTATETIMEOUT);
+    Owned<IPropertyTree> controlResp = roxieClient->sendRoxieControlAllNodes("<control:state/>", true);
+    if (!controlResp)
+        throw MakeStringException(ECLWATCH_INTERNAL_ERROR, "Failed to get control response from roxie %s.", clusterName);
+
+    CIArrayOf<CRoxieStateData> roxieStates;
+    readRoxieStatus(controlResp, roxieStates);
+
+    ForEachItemIn(i, param->machineInfoTable)
     {
-        Owned<CMachineInfoThreadParam> pThreadReq = new CMachineInfoThreadParam( this, context, machineInfoData.getOptions(),
-            machines.item(idx), machineInfoData.getMachineInfoTable(), machineInfoData.getMachineInfoColumns());
-        PooledThreadHandle handle = m_threadPool->start( pThreadReq.getClear() );
-        threadHandles.append(handle);
+        IEspMachineInfoEx& machineInfo = param->machineInfoTable.item(i);
+        if (!streq(machineInfo.getProcessType(), eqRoxieServerProcess) || !streq(machineInfo.getComponentName(), clusterName))
+            continue;
+
+        //This method is thread safe because each machineInfo (for one roxie node) belongs to only one Roxie cluster.
+        //It is impossible for different threads to update the same machineInfo.
+        bool foundRoxieState = false;
+        ForEachItemIn(ii, roxieStates)
+        {
+            CRoxieStateData& roxieState = roxieStates.item(ii);
+            if (!roxieState.matchIPAddress(machineInfo.getAddress()))
+                continue;
+
+            StringBuffer state, stateDetails;
+            roxieState.reportState(state, stateDetails);
+            machineInfo.setRoxieState(state.str());
+            machineInfo.setRoxieStateDetails(stateDetails.str());
+            foundRoxieState = true;
+        }
+        if (!foundRoxieState)
+        {
+            machineInfo.setRoxieState("??");
+            machineInfo.setRoxieStateDetails("Roxie state not found");
+        }
     }
+}
 
-    //block for worker theads to finish, if necessary and then collect results
-    PooledThreadHandle* pThreadHandle = threadHandles.getArray();
-    unsigned i=threadHandles.ordinality();
-    while (i--)
+void Cws_machineEx::getMachineInfo(IEspContext& context, bool getRoxieState, CGetMachineInfoData& machineInfoData)
+{
+    UnsignedArray threadHandles;
+    if (!getRoxieState)
     {
-        m_threadPool->join(*pThreadHandle);
-        pThreadHandle++;
+        CIArrayOf<CMachineData>& machines = machineInfoData.getMachineData();
+        ForEachItemIn(idx, machines)
+        {
+            Owned<CMachineInfoThreadParam> pThreadReq = new CMachineInfoThreadParam(this, context, machineInfoData.getOptions(),
+                machines.item(idx), machineInfoData.getMachineInfoTable(), machineInfoData.getMachineInfoColumns());
+            PooledThreadHandle handle = m_threadPool->start( pThreadReq.getClear());
+            threadHandles.append(handle);
+        }
     }
+    else
+    {
+        StringArray& roxieClusters = machineInfoData.getRoxieClusters();
+        ForEachItemIn(i, roxieClusters)
+        {
+            Owned<CRoxieStateInfoThreadParam> pThreadReq = new CRoxieStateInfoThreadParam(this, context, roxieClusters.item(i),
+                machineInfoData.getMachineInfoTable());
+            PooledThreadHandle handle = m_threadPool->start( pThreadReq.getClear());
+            threadHandles.append(handle);
+        }
+        machineInfoData.getMachineInfoColumns().append("Roxie State");
+    }
+
+    //Block for worker threads to finish, if necessary and then collect results
+    //Not use joinAll() because multiple threads may call this method. Each call uses the pool to create
+    //its own threads of checking query state. Each call should only join the ones created by that call.
+    ForEachItemIn(i, threadHandles)
+        m_threadPool->join(threadHandles.item(i));
+}
+
+////////////////////////////////////////////////////////////////////
+// Get Machine Information based on Machine Information request   //
+////////////////////////////////////////////////////////////////////
+
+void Cws_machineEx::getMachineInfo(IEspContext& context, CGetMachineInfoData& machineInfoData)
+{
+    double version = context.getClientVersion();
+    getMachineInfo(context, false, machineInfoData);
+    if ((version >= 1.13) && !machineInfoData.getRoxieClusters().empty())
+        getMachineInfo(context, true, machineInfoData);
 }
 
 // the following method is invoked on worker threads of CMachineInfoThreadParam

+ 108 - 0
esp/services/ws_machine/ws_machineService.hpp

@@ -26,6 +26,7 @@
 #include <map>
 
 class CMachineInfoThreadParam;
+class CRoxieStateInfoThreadParam;
 class CMetricsThreadParam;
 class CRemoteExecThreadParam;
 
@@ -364,6 +365,88 @@ public:
     }
 };
 
+class CStateHash : public CInterface
+{
+    BoolHash   hash;
+    unsigned   count;
+public:
+    IMPLEMENT_IINTERFACE;
+
+    CStateHash(const char* _hash) : hash(_hash), count(1)
+    {
+        hash.setValue(_hash, true);
+    };
+
+    bool matchHash(const char* _hash)
+    {
+        bool match = hash.getValue(_hash);
+        if (match)
+            count++;
+        return match;
+    }
+
+    unsigned getCount() { return count; }
+};
+
+class CRoxieStateData : public CInterface
+{
+    BoolHash   ipAddress;
+    StringAttr hash;
+    int        hashID; //the position inside cluster's state hash list - used to set the majorHash flag in updateMajorRoxieStateHash().
+    bool       majorHash; //whether its state hash is the same as the most of other roxie cluster nodes or not.
+    bool       ok;
+    bool       attached;
+    bool       detached;
+public:
+    IMPLEMENT_IINTERFACE;
+
+    CRoxieStateData(const char* _ipAddress, int _hashID) : hashID(_hashID), majorHash(true), ok(false), attached(false), detached(false)
+    {
+        ipAddress.setValue(_ipAddress, true);
+    };
+
+    bool matchIPAddress(const char* _ipAddress) { return ipAddress.getValue(_ipAddress); }
+    int getHashID() { return hashID; }
+    void setMajorHash(bool _majorHash) { majorHash = _majorHash; }
+
+    void setState(bool _ok, bool _attached, bool _detached, const char* _hash)
+    {
+        ok = _ok;
+        attached = _attached;
+        detached = _detached;
+        hash.set(_hash);
+    }
+
+    void reportState(StringBuffer& state, StringBuffer& stateDetails)
+    {
+        if (!ok)
+            state.set("Node State: not ok ...");
+        else if (!hash || !*hash)
+            state.set("empty state hash ...");
+        else if (!majorHash)
+            state.set("State hash mismatch ...");
+        else if (!attached)
+            state.set("Not attached to DALI ...");
+        else
+            state.set("ok");
+
+        if (ok)
+            stateDetails.appendf("Node State: ok\n");
+        else
+            stateDetails.appendf("Node State: not ok\n");
+        if (!hash || !*hash)
+            stateDetails.appendf("This node had an empty hash\n");
+        else
+            stateDetails.appendf("State hash: %s\n", hash.get());
+        if (!majorHash)
+            stateDetails.appendf("State hash: mismatch\n");
+        if (attached)
+            stateDetails.appendf("This node attached to DALI\n");
+        if (detached)
+            stateDetails.appendf("This node detached from DALI\n");
+    }
+};
+
 class CMachineData  : public CInterface
 {
     char         m_pathSep;
@@ -598,6 +681,9 @@ class CGetMachineInfoData
     IArrayOf<IEspMachineInfoEx>     m_machineInfoTable;
     StringArray                     m_machineInfoColumns;
 
+    StringArray                     roxieClusters;
+    BoolHash                        uniqueRoxieClusters;
+
 public:
     CGetMachineInfoUserOptions& getOptions()
     {
@@ -618,6 +704,20 @@ public:
     {
         return m_machineInfoColumns;
     }
+
+    StringArray& getRoxieClusters()
+    {
+        return roxieClusters;
+    }
+
+    void appendRoxieClusters(const char* clusterName)
+    {
+        if (uniqueRoxieClusters.getValue(clusterName))
+            return;
+
+        roxieClusters.append(clusterName);
+        uniqueRoxieClusters.setValue(clusterName, true);
+    }
 };
 
 //---------------------------------------------------------------------------------------------
@@ -639,6 +739,7 @@ public:
     bool onStartStopBegin( IEspContext &context, IEspStartStopBeginRequest &req,  IEspStartStopBeginResponse &resp);
     bool onStartStopDone( IEspContext &context, IEspStartStopDoneRequest &req,  IEspStartStopResponse &resp);
 
+    void getRoxieStateInfo(IEspContext& context, CRoxieStateInfoThreadParam* param);
     void doGetMachineInfo(IEspContext& context, CMachineInfoThreadParam* pReq);
     void doGetMetrics(CMetricsThreadParam* pParam);
     bool doStartStop(IEspContext &context, StringArray& addresses, char* userName, char* password, bool bStop, IEspStartStopResponse &resp);
@@ -670,6 +771,7 @@ private:
 
     void readMachineInfoRequest(IEspContext& context, bool getProcessorInfo, bool getStorageInfo, bool localFileSystemsOnly, bool getSwInfo, bool applyProcessFilter, StringArray& addresses, const char* addProcessesToFilters, CGetMachineInfoData& machineInfoData);
     void readMachineInfoRequest(IEspContext& context, bool getProcessorInfo, bool getStorageInfo, bool localFileSystemsOnly, bool getSwInfo, bool applyProcessFilter, const char* addProcessesToFilters, StringArray& targetClustersIn, CGetMachineInfoData& machineInfoData, IPropertyTree* targetClustersOut);
+    void getMachineInfo(IEspContext& context, bool getRoxieState, CGetMachineInfoData& machineInfoData);
     void getMachineInfo(IEspContext& context, CGetMachineInfoData& machineInfoData);
     void setMachineInfoResponse(IEspContext& context, IEspGetMachineInfoRequest& req, CGetMachineInfoData& machineInfoData, IEspGetMachineInfoResponse& resp);
     void setTargetClusterInfoResponse(IEspContext& context, IEspGetTargetClusterInfoRequest& req, CGetMachineInfoData& machineInfoData, IPropertyTree* targetClusterTree, IEspGetTargetClusterInfoResponse& resp);
@@ -696,6 +798,9 @@ private:
     void doPostProcessing(CFieldInfoMap& myfieldInfoMap, CFieldMap&  myfieldMap);
     void processValue(const char *oid, const char *value, const bool bShow, CFieldInfoMap& myfieldInfoMap, CFieldMap&  myfieldMap);
     void addIpAddressesToBuffer( void** buffer, unsigned& count, const char* address);
+    void readRoxieStatus(const Owned<IPropertyTree> controlResp, CIArrayOf<CRoxieStateData>& roxieStates);
+    int addRoxieStateHash(const char* hash, CIArrayOf<CStateHash>& stateHashes);
+    void updateMajorRoxieStateHash(CIArrayOf<CStateHash>& stateHashes, CIArrayOf<CRoxieStateData>& roxieStates);
     StringBuffer& getAcceptLanguage(IEspContext& context, StringBuffer& acceptLanguage);
 
     //Still used in StartStop/Rexec, so keep them for now.
@@ -737,6 +842,9 @@ public:
 
 protected:
 
+    CWsMachineThreadParam(Cws_machineEx* pService) : m_pService(pService)
+    {
+    }
     CWsMachineThreadParam( const char* pszAddress,
                           const char* pszSecurityString, Cws_machineEx* pService)
         : m_sAddress(pszAddress), m_sSecurityString(pszSecurityString), m_pService(pService)