浏览代码

HPCC-9264 - Avoid possible deadlocks if error gathering lookup RHS

If an exception occurred whilst gathering the RHS, the exception
could be lost by the one of the broadcast or processing threads.
The exception was tucked away waiting for the offending thread to
be joined, which never heappened.
Handle exception, retain and abort the broadcast.

Signed-off-by: Jake Smith <jake.smith@lexisnexis.com>
Jake Smith 12 年之前
父节点
当前提交
6441fe2fc8
共有 3 个文件被更改,包括 134 次插入35 次删除
  1. 2 0
      system/jlib/jthread.cpp
  2. 123 32
      thorlcr/activities/lookupjoin/thlookupjoinslave.cpp
  3. 9 3
      thorlcr/graph/thgraph.cpp

+ 2 - 0
system/jlib/jthread.cpp

@@ -547,6 +547,8 @@ void CThreadedPersistent::main()
         }
         }
         catch (IException *e)
         catch (IException *e)
         {
         {
+            VStringBuffer errMsg("CThreadedPersistent (%s)", athread.getName());
+            EXCLOG(e, errMsg.str());
             exception.setown(e);
             exception.setown(e);
             joinSem.signal(); // leave in running state, signal to join to handle
             joinSem.signal(); // leave in running state, signal to join to handle
             continue;
             continue;

+ 123 - 32
thorlcr/activities/lookupjoin/thlookupjoinslave.cpp

@@ -25,6 +25,7 @@
 #include "thbufdef.hpp"
 #include "thbufdef.hpp"
 #include "jbuff.hpp"
 #include "jbuff.hpp"
 #include "jset.hpp"
 #include "jset.hpp"
+#include "jisem.hpp"
 
 
 #include "thorxmlwrite.hpp"
 #include "thorxmlwrite.hpp"
 
 
@@ -86,7 +87,7 @@ class CBroadcaster : public CSimpleInterface
     mptag_t mpTag;
     mptag_t mpTag;
     unsigned myNode, slaves;
     unsigned myNode, slaves;
     IBCastReceive *recvInterface;
     IBCastReceive *recvInterface;
-    Semaphore allDoneSem;
+    InterruptableSemaphore allDoneSem;
     CriticalSection allDoneLock, bcastOtherCrit;
     CriticalSection allDoneLock, bcastOtherCrit;
     bool allDone, allDoneWaiting, allRequestStop, stopping, stopRecv;
     bool allDone, allDoneWaiting, allRequestStop, stopping, stopRecv;
     Owned<IBitSet> slavesDone, slavesStopping;
     Owned<IBitSet> slavesDone, slavesStopping;
@@ -95,51 +96,96 @@ class CBroadcaster : public CSimpleInterface
     {
     {
         CBroadcaster &broadcaster;
         CBroadcaster &broadcaster;
         CThreadedPersistent threaded;
         CThreadedPersistent threaded;
+        bool aborted;
     public:
     public:
         CRecv(CBroadcaster &_broadcaster) : threaded("CBroadcaster::CRecv", this), broadcaster(_broadcaster)
         CRecv(CBroadcaster &_broadcaster) : threaded("CBroadcaster::CRecv", this), broadcaster(_broadcaster)
         {
         {
+            aborted = false;
         }
         }
-        void start() { threaded.start(); }
-        void stop()
+        void start()
         {
         {
+            aborted = false;
+            threaded.start();
+        }
+        void abort(bool join)
+        {
+            if (aborted)
+                return;
+            aborted = true;
             broadcaster.cancelReceive();
             broadcaster.cancelReceive();
-            threaded.join();
+            if (join)
+                threaded.join();
         }
         }
         void wait()
         void wait()
         {
         {
             threaded.join();
             threaded.join();
         }
         }
     // IThreaded
     // IThreaded
-        virtual void main() { broadcaster.recvLoop(); }
+        virtual void main()
+        {
+            try
+            {
+                broadcaster.recvLoop();
+            }
+            catch (IException *e)
+            {
+                EXCLOG(e, "CRecv");
+                abort(false);
+                broadcaster.cancel(e);
+                e->Release();
+            }
+        }
     } receiver;
     } receiver;
     class CSend : implements IThreaded
     class CSend : implements IThreaded
     {
     {
         CBroadcaster &broadcaster;
         CBroadcaster &broadcaster;
         CThreadedPersistent threaded;
         CThreadedPersistent threaded;
         SimpleInterThreadQueueOf<CSendItem, true> broadcastQueue;
         SimpleInterThreadQueueOf<CSendItem, true> broadcastQueue;
+        Owned<IException> exception;
+        bool aborted;
+        void clearQueue()
+        {
+            loop
+            {
+                Owned<CSendItem> sendItem = broadcastQueue.dequeueNow();
+                if (NULL == sendItem)
+                    break;
+            }
+        }
     public:
     public:
         CSend(CBroadcaster &_broadcaster) : threaded("CBroadcaster::CSend", this), broadcaster(_broadcaster)
         CSend(CBroadcaster &_broadcaster) : threaded("CBroadcaster::CSend", this), broadcaster(_broadcaster)
         {
         {
+            aborted = false;
         }
         }
         ~CSend()
         ~CSend()
         {
         {
-            stop();
+            clearQueue();
         }
         }
         void addBlock(CSendItem *sendItem)
         void addBlock(CSendItem *sendItem)
         {
         {
+            if (exception)
+            {
+                if (sendItem)
+                    sendItem->Release();
+                throw exception.getClear();
+            }
             broadcastQueue.enqueue(sendItem); // will block if queue full
             broadcastQueue.enqueue(sendItem); // will block if queue full
         }
         }
-        void start() { threaded.start(); }
-        void stop()
+        void start()
         {
         {
+            aborted = false;
+            exception.clear();
+            threaded.start();
+        }
+        void abort(bool join)
+        {
+            if (aborted)
+                return;
+            aborted = true;
             broadcastQueue.stop();
             broadcastQueue.stop();
-            loop
-            {
-                Owned<CSendItem> sendItem = broadcastQueue.dequeueNow();
-                if (NULL == sendItem)
-                    break;
-            }
-            threaded.join();
+            clearQueue();
+            if (join)
+                threaded.join();
         }
         }
         void wait()
         void wait()
         {
         {
@@ -150,12 +196,22 @@ class CBroadcaster : public CSimpleInterface
     // IThreaded
     // IThreaded
         virtual void main()
         virtual void main()
         {
         {
-            while (!broadcaster.activity.queryAbortSoon())
+            try
             {
             {
-                Owned<CSendItem> sendItem = broadcastQueue.dequeue();
-                if (NULL == sendItem)
-                    break;
-                broadcaster.broadcastToOthers(sendItem);
+                while (!broadcaster.activity.queryAbortSoon())
+                {
+                    Owned<CSendItem> sendItem = broadcastQueue.dequeue();
+                    if (NULL == sendItem)
+                        break;
+                    broadcaster.broadcastToOthers(sendItem);
+                }
+            }
+            catch (IException *e)
+            {
+                EXCLOG(e, "CSend");
+                exception.setown(e);
+                abort(false);
+                broadcaster.cancel(e);
             }
             }
             ActPrintLog(&broadcaster.activity, "Sender stopped");
             ActPrintLog(&broadcaster.activity, "Sender stopped");
         }
         }
@@ -345,13 +401,19 @@ public:
         receiver.wait(); // terminates when received stop from all others
         receiver.wait(); // terminates when received stop from all others
         sender.wait(); // terminates when any remaining packets, including final stop packets have been re-broadcast
         sender.wait(); // terminates when any remaining packets, including final stop packets have been re-broadcast
     }
     }
-    void cancel()
+    void cancel(IException *e=NULL)
     {
     {
         allDoneWaiting = false;
         allDoneWaiting = false;
         allDone = true;
         allDone = true;
-        receiver.stop();
-        sender.stop();
-        allDoneSem.signal();
+        receiver.abort(true);
+        sender.abort(true);
+        if (e)
+        {
+            allDoneSem.interrupt(LINK(e));
+            activity.fireException(LINK(e));
+        }
+        else
+            allDoneSem.signal();
     }
     }
     bool send(CSendItem *sendItem)
     bool send(CSendItem *sendItem)
     {
     {
@@ -468,6 +530,7 @@ class CLookupJoinActivity : public CSlaveActivity, public CThorDataLink, impleme
         CLookupJoinActivity &owner;
         CLookupJoinActivity &owner;
         bool stopped;
         bool stopped;
         SimpleInterThreadQueueOf<CSendItem, true> blockQueue;
         SimpleInterThreadQueueOf<CSendItem, true> blockQueue;
+        Owned<IException> exception;
     public:
     public:
         CRowProcessor(CLookupJoinActivity &_owner) : threaded("CRowProcessor", this), owner(_owner)
         CRowProcessor(CLookupJoinActivity &_owner) : threaded("CRowProcessor", this), owner(_owner)
         {
         {
@@ -485,23 +548,50 @@ class CLookupJoinActivity : public CSlaveActivity, public CThorDataLink, impleme
             }
             }
             wait();
             wait();
         }
         }
-        void start() { threaded.start(); }
+        void start()
+        {
+            stopped = false;
+            exception.clear();
+            threaded.start();
+        }
+        void abort()
+        {
+            if (!stopped)
+            {
+                stopped = true;
+                blockQueue.enqueue(NULL);
+            }
+        }
         void wait() { threaded.join(); }
         void wait() { threaded.join(); }
         void addBlock(CSendItem *sendItem)
         void addBlock(CSendItem *sendItem)
         {
         {
+            if (exception)
+            {
+                if (sendItem)
+                    sendItem->Release();
+                throw exception.getClear();
+            }
             blockQueue.enqueue(sendItem); // will block if queue full
             blockQueue.enqueue(sendItem); // will block if queue full
         }
         }
     // IThreaded
     // IThreaded
         virtual void main()
         virtual void main()
         {
         {
-            while (!stopped)
+            try
             {
             {
-                Owned<CSendItem> sendItem = blockQueue.dequeue();
-                if (NULL == sendItem)
-                    break;
-                MemoryBuffer expandedMb;
-                ThorExpand(sendItem->queryMsg(), expandedMb);
-                owner.processRHSRows(sendItem->queryOrigin(), expandedMb);
+                while (!stopped)
+                {
+                    Owned<CSendItem> sendItem = blockQueue.dequeue();
+                    if (stopped || (NULL == sendItem))
+                        break;
+                    MemoryBuffer expandedMb;
+                    ThorExpand(sendItem->queryMsg(), expandedMb);
+                    owner.processRHSRows(sendItem->queryOrigin(), expandedMb);
+                }
+            }
+            catch (IException *e)
+            {
+                exception.setown(e);
+                EXCLOG(e, "CRowProcessor");
             }
             }
         }
         }
     } rowProcessor;
     } rowProcessor;
@@ -743,6 +833,7 @@ public:
         gotOtherROs.signal();
         gotOtherROs.signal();
         cancelReceiveMsg(RANK_ALL, mpTag);
         cancelReceiveMsg(RANK_ALL, mpTag);
         broadcaster.cancel();
         broadcaster.cancel();
+        rowProcessor.abort();
     }
     }
     virtual void stop()
     virtual void stop()
     {
     {

+ 9 - 3
thorlcr/graph/thgraph.cpp

@@ -2798,9 +2798,15 @@ IRowInterfaces *CActivityBase::getRowInterfaces()
 bool CActivityBase::receiveMsg(CMessageBuffer &mb, const rank_t rank, const mptag_t mpTag, rank_t *sender, unsigned timeout)
 bool CActivityBase::receiveMsg(CMessageBuffer &mb, const rank_t rank, const mptag_t mpTag, rank_t *sender, unsigned timeout)
 {
 {
     BooleanOnOff onOff(receiving);
     BooleanOnOff onOff(receiving);
-    if (cancelledReceive)
-        return false;
-    return container.queryJob().queryJobComm().recv(mb, rank, mpTag, sender, timeout);
+    CTimeMon t(timeout);
+    unsigned remaining = timeout;
+    // check 'cancelledReceive' every 10 secs
+    while (!cancelledReceive && ((MP_WAIT_FOREVER==timeout) || !t.timedout(&remaining)))
+    {
+        if (container.queryJob().queryJobComm().recv(mb, rank, mpTag, sender, remaining>10000?10000:remaining))
+            return true;
+    }
+    return false;
 }
 }
 
 
 void CActivityBase::cancelReceiveMsg(const rank_t rank, const mptag_t mpTag)
 void CActivityBase::cancelReceiveMsg(const rank_t rank, const mptag_t mpTag)