Sfoglia il codice sorgente

HPCC-23727 Add retry variety of createIGroup

Add createIGroupRetry for use by e.g. Dali clients, who might
not be able to resolve the hostname initially.
This can occur for exampple, if in containerized setup, and
the hostname is creates as a service, which is not immediately
ready.

Signed-off-by: Jake Smith <jake.smith@lexisnexisrisk.com>
Jake Smith 5 anni fa
parent
commit
6a8506ddc4

+ 1 - 1
dali/dfu/dfuserver.cpp

@@ -168,7 +168,7 @@ int main(int argc, const char *argv[])
     CSDSServerStatus *serverstatus=NULL;
     Owned<IReplicateServer> replserver;
     try {
-        Owned<IGroup> serverGroup = createIGroup(daliServer.str(),DALI_SERVER_PORT);
+        Owned<IGroup> serverGroup = createIGroupRetry(daliServer.str(),DALI_SERVER_PORT);
         initClientProcess(serverGroup, DCR_DfuServer, 0, NULL, NULL, stop?(1000*30):MP_WAIT_FOREVER);
 
         if(!stop)

+ 1 - 1
dali/sasha/saserver.cpp

@@ -330,7 +330,7 @@ int main(int argc, const char* argv[])
 
 
         unsigned short port = (stop||coalescer)?0:DEFAULT_SASHA_PORT;
-        Owned<IGroup> serverGroup = createIGroup(daliServer.str(),DALI_SERVER_PORT);
+        Owned<IGroup> serverGroup = createIGroupRetry(daliServer.str(),DALI_SERVER_PORT);
         initClientProcess(serverGroup, DCR_SashaServer, port, NULL, NULL, MP_WAIT_FOREVER);
         if (!stop&!coalescer) {
             startLogMsgParentReceiver();    // for auditing

+ 1 - 1
ecl/agentexec/agentexec.cpp

@@ -96,7 +96,7 @@ int CEclAgentExecutionServer::run()
     removeSentinelFile(sentinelFile);
     try
     {
-        Owned<IGroup> serverGroup = createIGroup(daliServers, DALI_SERVER_PORT);
+        Owned<IGroup> serverGroup = createIGroupRetry(daliServers, DALI_SERVER_PORT);
         initClientProcess(serverGroup, DCR_AgentExec);
 #ifdef _CONTAINERIZED
         if (streq("thor", apptype))

+ 2 - 2
ecl/eclagent/eclagent.cpp

@@ -3451,10 +3451,9 @@ extern int HTHOR_API eclagent_main(int argc, const char *argv[], StringBuffer *
         Owned<IUserDescriptor> standAloneUDesc;
         if (daliServers)
         {
-            SocketEndpoint daliEp(daliServers, DALI_SERVER_PORT);
             {
                 MTIME_SECTION(queryActiveTimer(), "SDS_Initialize");
-                Owned<IGroup> serverGroup = createIGroup(1, &daliEp);
+                Owned<IGroup> serverGroup = createIGroupRetry(daliServers, DALI_SERVER_PORT);
                 initClientProcess(serverGroup, DCR_EclAgent, 0, NULL, NULL, MP_WAIT_FOREVER);
             }
 #ifdef MONITOR_ECLAGENT_STATUS  
@@ -3491,6 +3490,7 @@ extern int HTHOR_API eclagent_main(int argc, const char *argv[], StringBuffer *
                     }
                 }
             };
+            SocketEndpoint daliEp(daliServers, DALI_SERVER_PORT);
             daliDownMonitor.setown(new CDaliDownMonitor(daliEp));
             addMPConnectionMonitor(daliDownMonitor);
 

+ 1 - 1
ecl/eclccserver/eclccserver.cpp

@@ -835,7 +835,7 @@ int main(int argc, const char *argv[])
         UWARNLOG("No Dali server list specified - assuming local");
         daliServers = ".";
     }
-    Owned<IGroup> serverGroup = createIGroup(daliServers, DALI_SERVER_PORT);
+    Owned<IGroup> serverGroup = createIGroupRetry(daliServers, DALI_SERVER_PORT);
     try
     {
         initClientProcess(serverGroup, DCR_EclCCServer);

+ 1 - 1
ecl/eclscheduler/eclscheduler.cpp

@@ -219,7 +219,7 @@ int main(int argc, const char *argv[])
         OWARNLOG("No Dali server list specified - assuming local");
         daliServers = ".";
     }
-    Owned<IGroup> serverGroup = createIGroup(daliServers, DALI_SERVER_PORT);
+    Owned<IGroup> serverGroup = createIGroupRetry(daliServers, DALI_SERVER_PORT);
     try
     {
         initClientProcess(serverGroup, DCR_EclScheduler);

+ 1 - 1
esp/services/ws_roxie/ws_roxieService.cpp

@@ -71,7 +71,7 @@ void CRoxieEx::init(IPropertyTree *cfg, const char *process, const char *service
         {
             throw MakeStringException(-1, "Please specify daliServers in the config file");
         }
-        Owned<IGroup> serverGroup = createIGroup(daliServers_.str(), DALI_SERVER_PORT);
+        Owned<IGroup> serverGroup = createIGroupRetry(daliServers_.str(), DALI_SERVER_PORT);
         initClientProcess(serverGroup, DCR_EspServer);
     }
 }

+ 22 - 0
system/mp/mpbase.cpp

@@ -19,6 +19,7 @@
 #include "platform.h"
 #include "jlib.hpp"
 #include "jlog.hpp"
+#include "jtime.hpp"
 
 #include "mpbase.hpp"
 
@@ -707,6 +708,27 @@ IGroup *createIGroup(const char *endpointlist,unsigned short defport)
     return CGroup::fromText(endpointlist,defport);
 }
 
+IGroup *createIGroupRetry(const char *endpointlist,unsigned short defport, unsigned timeout)
+{
+    CTimeMon t(timeout);
+    while (!t.timedout())
+    {
+        try
+        {
+            return createIGroup(endpointlist, defport);
+        }
+        catch (IException *e)
+        {
+            VStringBuffer errMsg("Failed to resolve group for: %s", endpointlist);
+            EXCLOG(e, errMsg.str());
+            e->Release();
+        }
+        // on resolve failure, pause for a short time to avoid spinning too fast
+        Sleep(5);
+    }
+    throw makeStringExceptionV(0, "Timedout trying to resolve group: %s", endpointlist);
+}
+
 IGroup *deserializeIGroup(MemoryBuffer &src)
 {
     return CGroup::deserialize(src);

+ 2 - 0
system/mp/mpbase.hpp

@@ -134,6 +134,8 @@ extern mp_decl IGroup *createIGroup(rank_t num,INode **);
 extern mp_decl IGroup *createIGroup(rank_t num,const SocketEndpoint *); 
 extern mp_decl IGroup *createIGroup(SocketEndpointArray &);
 extern mp_decl IGroup *createIGroup(const char *endpointlist,unsigned short defport=0); // takes socketendpointlist or result of toText
+constexpr unsigned defaultGroupResolveTimeout = 10*1000*60;
+extern mp_decl IGroup *createIGroupRetry(const char *endpointlist,unsigned short defport, unsigned timeout = defaultGroupResolveTimeout);
 extern mp_decl IGroup *deserializeIGroup(MemoryBuffer &src); 
 
 extern mp_decl INode *queryNullNode(); 

+ 1 - 1
thorlcr/master/thmastermain.cpp

@@ -647,7 +647,7 @@ int main( int argc, const char *argv[]  )
         LOG(MCdebugProgress, thorJob, "Build %s", BUILD_TAG);
         globals->setProp("@logURL", logUrl.str());
 
-        Owned<IGroup> serverGroup = createIGroup(daliServer.str(), DALI_SERVER_PORT);
+        Owned<IGroup> serverGroup = createIGroupRetry(daliServer.str(), DALI_SERVER_PORT);
 
         unsigned retry = 0;
         for (;;) {