Selaa lähdekoodia

Merge pull request #13514 from jakesmith/hpcc-23727-createIGroupRetry

HPCC-23727 Add retry variety of createIGroup

Reviewed-By: Mark Kelly <mark.kelly@lexisnexis.com>
Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 5 vuotta sitten
vanhempi
commit
44fcc0ffe6

+ 1 - 1
dali/dfu/dfuserver.cpp

@@ -168,7 +168,7 @@ int main(int argc, const char *argv[])
     CSDSServerStatus *serverstatus=NULL;
     Owned<IReplicateServer> replserver;
     try {
-        Owned<IGroup> serverGroup = createIGroup(daliServer.str(),DALI_SERVER_PORT);
+        Owned<IGroup> serverGroup = createIGroupRetry(daliServer.str(),DALI_SERVER_PORT);
         initClientProcess(serverGroup, DCR_DfuServer, 0, NULL, NULL, stop?(1000*30):MP_WAIT_FOREVER);
 
         if(!stop)

+ 1 - 1
dali/sasha/saserver.cpp

@@ -330,7 +330,7 @@ int main(int argc, const char* argv[])
 
 
         unsigned short port = (stop||coalescer)?0:DEFAULT_SASHA_PORT;
-        Owned<IGroup> serverGroup = createIGroup(daliServer.str(),DALI_SERVER_PORT);
+        Owned<IGroup> serverGroup = createIGroupRetry(daliServer.str(),DALI_SERVER_PORT);
         initClientProcess(serverGroup, DCR_SashaServer, port, NULL, NULL, MP_WAIT_FOREVER);
         if (!stop&!coalescer) {
             startLogMsgParentReceiver();    // for auditing

+ 1 - 1
ecl/agentexec/agentexec.cpp

@@ -96,7 +96,7 @@ int CEclAgentExecutionServer::run()
     removeSentinelFile(sentinelFile);
     try
     {
-        Owned<IGroup> serverGroup = createIGroup(daliServers, DALI_SERVER_PORT);
+        Owned<IGroup> serverGroup = createIGroupRetry(daliServers, DALI_SERVER_PORT);
         initClientProcess(serverGroup, DCR_AgentExec);
 #ifdef _CONTAINERIZED
         if (streq("thor", apptype))

+ 2 - 2
ecl/eclagent/eclagent.cpp

@@ -3451,10 +3451,9 @@ extern int HTHOR_API eclagent_main(int argc, const char *argv[], StringBuffer *
         Owned<IUserDescriptor> standAloneUDesc;
         if (daliServers)
         {
-            SocketEndpoint daliEp(daliServers, DALI_SERVER_PORT);
             {
                 MTIME_SECTION(queryActiveTimer(), "SDS_Initialize");
-                Owned<IGroup> serverGroup = createIGroup(1, &daliEp);
+                Owned<IGroup> serverGroup = createIGroupRetry(daliServers, DALI_SERVER_PORT);
                 initClientProcess(serverGroup, DCR_EclAgent, 0, NULL, NULL, MP_WAIT_FOREVER);
             }
 #ifdef MONITOR_ECLAGENT_STATUS  
@@ -3491,6 +3490,7 @@ extern int HTHOR_API eclagent_main(int argc, const char *argv[], StringBuffer *
                     }
                 }
             };
+            SocketEndpoint daliEp(daliServers, DALI_SERVER_PORT);
             daliDownMonitor.setown(new CDaliDownMonitor(daliEp));
             addMPConnectionMonitor(daliDownMonitor);
 

+ 1 - 1
ecl/eclccserver/eclccserver.cpp

@@ -835,7 +835,7 @@ int main(int argc, const char *argv[])
         UWARNLOG("No Dali server list specified - assuming local");
         daliServers = ".";
     }
-    Owned<IGroup> serverGroup = createIGroup(daliServers, DALI_SERVER_PORT);
+    Owned<IGroup> serverGroup = createIGroupRetry(daliServers, DALI_SERVER_PORT);
     try
     {
         initClientProcess(serverGroup, DCR_EclCCServer);

+ 1 - 1
ecl/eclscheduler/eclscheduler.cpp

@@ -219,7 +219,7 @@ int main(int argc, const char *argv[])
         OWARNLOG("No Dali server list specified - assuming local");
         daliServers = ".";
     }
-    Owned<IGroup> serverGroup = createIGroup(daliServers, DALI_SERVER_PORT);
+    Owned<IGroup> serverGroup = createIGroupRetry(daliServers, DALI_SERVER_PORT);
     try
     {
         initClientProcess(serverGroup, DCR_EclScheduler);

+ 1 - 1
esp/services/ws_roxie/ws_roxieService.cpp

@@ -71,7 +71,7 @@ void CRoxieEx::init(IPropertyTree *cfg, const char *process, const char *service
         {
             throw MakeStringException(-1, "Please specify daliServers in the config file");
         }
-        Owned<IGroup> serverGroup = createIGroup(daliServers_.str(), DALI_SERVER_PORT);
+        Owned<IGroup> serverGroup = createIGroupRetry(daliServers_.str(), DALI_SERVER_PORT);
         initClientProcess(serverGroup, DCR_EspServer);
     }
 }

+ 22 - 0
system/mp/mpbase.cpp

@@ -19,6 +19,7 @@
 #include "platform.h"
 #include "jlib.hpp"
 #include "jlog.hpp"
+#include "jtime.hpp"
 
 #include "mpbase.hpp"
 
@@ -707,6 +708,27 @@ IGroup *createIGroup(const char *endpointlist,unsigned short defport)
     return CGroup::fromText(endpointlist,defport);
 }
 
+IGroup *createIGroupRetry(const char *endpointlist,unsigned short defport, unsigned timeout)
+{
+    CTimeMon t(timeout);
+    while (!t.timedout())
+    {
+        try
+        {
+            return createIGroup(endpointlist, defport);
+        }
+        catch (IException *e)
+        {
+            VStringBuffer errMsg("Failed to resolve group for: %s", endpointlist);
+            EXCLOG(e, errMsg.str());
+            e->Release();
+        }
+        // on resolve failure, pause for a short time to avoid spinning too fast
+        Sleep(5);
+    }
+    throw makeStringExceptionV(0, "Timedout trying to resolve group: %s", endpointlist);
+}
+
 IGroup *deserializeIGroup(MemoryBuffer &src)
 {
     return CGroup::deserialize(src);

+ 2 - 0
system/mp/mpbase.hpp

@@ -134,6 +134,8 @@ extern mp_decl IGroup *createIGroup(rank_t num,INode **);
 extern mp_decl IGroup *createIGroup(rank_t num,const SocketEndpoint *); 
 extern mp_decl IGroup *createIGroup(SocketEndpointArray &);
 extern mp_decl IGroup *createIGroup(const char *endpointlist,unsigned short defport=0); // takes socketendpointlist or result of toText
+constexpr unsigned defaultGroupResolveTimeout = 10*1000*60;
+extern mp_decl IGroup *createIGroupRetry(const char *endpointlist,unsigned short defport, unsigned timeout = defaultGroupResolveTimeout);
 extern mp_decl IGroup *deserializeIGroup(MemoryBuffer &src); 
 
 extern mp_decl INode *queryNullNode(); 

+ 1 - 1
thorlcr/master/thmastermain.cpp

@@ -647,7 +647,7 @@ int main( int argc, const char *argv[]  )
         LOG(MCdebugProgress, thorJob, "Build %s", BUILD_TAG);
         globals->setProp("@logURL", logUrl.str());
 
-        Owned<IGroup> serverGroup = createIGroup(daliServer.str(), DALI_SERVER_PORT);
+        Owned<IGroup> serverGroup = createIGroupRetry(daliServer.str(), DALI_SERVER_PORT);
 
         unsigned retry = 0;
         for (;;) {