
HPCC-22802 Heuristic to disable remote index streaming inside CQ's

Signed-off-by: Jake Smith <jake.smith@lexisnexisrisk.com>
Jake Smith committed 5 years ago
parent commit 7ba1514a6e
4 changed files with 201 additions and 99 deletions
  1. +176 -92   thorlcr/activities/indexread/thindexreadslave.cpp
  2. +14 -6     thorlcr/graph/thgraph.cpp
  3. +10 -0     thorlcr/graph/thgraph.hpp
  4. +1 -1      thorlcr/thorutil/thormisc.hpp

+ 176 - 92
thorlcr/activities/indexread/thindexreadslave.cpp

@@ -77,6 +77,7 @@ protected:
     bool rowLimitSkips = false;
     rowcount_t keyedProcessed = 0;
     rowcount_t rowLimit = RCMAX;
+    bool useRemoteStreaming = false;
 
 
     class TransformCallback : implements IThorIndexCallback , public CSimpleInterface
@@ -163,117 +164,118 @@ public:
             while (p<partDescs.ordinality()) // will process all parts if localMerge
             {
                 IPartDescriptor &part = partDescs.item(p++);
-
-                Owned<ITranslator> translator = getTranslators(part);
-                IOutputMetaData *actualFormat = translator ? &translator->queryActualFormat() : expectedFormat;
-                bool tryRemoteStream = actualFormat->queryTypeInfo()->canInterpret() && actualFormat->queryTypeInfo()->canSerialize() &&
-                                       projectedFormat->queryTypeInfo()->canInterpret() && projectedFormat->queryTypeInfo()->canSerialize();
-                bool usesBlobs = 0 != (helper->getFlags() & TIRusesblob);
-
                 unsigned crc=0;
                 part.getCrc(crc);
 
-                /* If part can potentially be remotely streamed, 1st check if any part is local,
-                 * then try to remote stream, and otherwise failover to legacy remote access
-                 */
-                if (tryRemoteStream && !usesBlobs && !localMerge)
+                if (useRemoteStreaming)
                 {
-                    std::vector<unsigned> remoteCandidates;
-                    for (unsigned copy=0; copy<part.numCopies(); copy++)
+                    Owned<ITranslator> translator = getTranslators(part);
+                    IOutputMetaData *actualFormat = translator ? &translator->queryActualFormat() : expectedFormat;
+                    bool tryRemoteStream = actualFormat->queryTypeInfo()->canInterpret() && actualFormat->queryTypeInfo()->canSerialize() &&
+                                           projectedFormat->queryTypeInfo()->canInterpret() && projectedFormat->queryTypeInfo()->canSerialize();
+
+                    /* If part can potentially be remotely streamed, 1st check if any part is local,
+                     * then try to remote stream, and otherwise failover to legacy remote access
+                     */
+                    if (tryRemoteStream)
                     {
-                        RemoteFilename rfn;
-                        part.getFilename(copy, rfn);
-                        if (!isRemoteReadCandidate(*this, rfn))
+                        std::vector<unsigned> remoteCandidates;
+                        for (unsigned copy=0; copy<part.numCopies(); copy++)
                         {
-                            StringBuffer path;
-                            rfn.getPath(path);
-                            Owned<IFile> iFile = createIFile(path);
-                            try
+                            RemoteFilename rfn;
+                            part.getFilename(copy, rfn);
+                            if (!isRemoteReadCandidate(*this, rfn))
                             {
-                                if (iFile->exists())
+                                StringBuffer path;
+                                rfn.getPath(path);
+                                Owned<IFile> iFile = createIFile(path);
+                                try
                                 {
-                                    remoteCandidates.clear();
-                                    break;
+                                    if (iFile->exists())
+                                    {
+                                        remoteCandidates.clear();
+                                        break;
+                                    }
+                                }
+                                catch (IException *e)
+                                {
+                                    ActPrintLog(e, "getNextInput()");
+                                    e->Release();
                                 }
                             }
-                            catch (IException *e)
-                            {
-                                ActPrintLog(e, "getNextInput()");
-                                e->Release();
-                            }
+                            else
+                                remoteCandidates.push_back(copy);
                         }
-                        else
-                            remoteCandidates.push_back(copy);
-                    }
-                    Owned<IException> remoteReadException;
-                    StringBuffer remoteReadExceptionPath;
-                    for (unsigned &copy : remoteCandidates) // only if no local part found above
-                    {
-                        RemoteFilename rfn;
-                        part.getFilename(copy, rfn);
-                        StringBuffer path;
-                        rfn.getPath(path);
+                        Owned<IException> remoteReadException;
+                        StringBuffer remoteReadExceptionPath;
+                        for (unsigned &copy : remoteCandidates) // only if no local part found above
+                        {
+                            RemoteFilename rfn;
+                            part.getFilename(copy, rfn);
+                            StringBuffer path;
+                            rfn.getPath(path);
 
-                        // Open a stream from remote file, having passed actual, expected, projected, and filters to it
-                        SocketEndpoint ep(rfn.queryEndpoint());
-                        setDafsEndpointPort(ep);
+                            // Open a stream from remote file, having passed actual, expected, projected, and filters to it
+                            SocketEndpoint ep(rfn.queryEndpoint());
+                            setDafsEndpointPort(ep);
 
-                        IConstArrayOf<IFieldFilter> fieldFilters;  // These refer to the expected layout
-                        struct CIndexReadContext : implements IIndexReadContext
-                        {
-                            IConstArrayOf<IFieldFilter> &fieldFilters;
-                            CIndexReadContext(IConstArrayOf<IFieldFilter> &_fieldFilters) : fieldFilters(_fieldFilters)
-                            {
-                            }
-                            virtual void append(IKeySegmentMonitor *segment) override { throwUnexpected(); }
-                            virtual void append(FFoption option, const IFieldFilter * filter) override
-                            {
-                                fieldFilters.append(*filter);
-                            }
-                        } context(fieldFilters);
-                        helper->createSegmentMonitors(&context);
-
-                        RowFilter actualFilter;
-                        Owned<const IKeyTranslator> keyedTranslator = createKeyTranslator(actualFormat->queryRecordAccessor(true), expectedFormat->queryRecordAccessor(true));
-                        if (keyedTranslator && keyedTranslator->needsTranslate())
-                            keyedTranslator->translate(actualFilter, fieldFilters);
-                        else
-                            actualFilter.appendFilters(fieldFilters);
-
-                        StringBuffer lPath;
-                        rfn.getLocalPath(lPath);
-                        Owned<IIndexLookup> indexLookup = createRemoteFilteredKey(ep, lPath, crc, actualFormat, projectedFormat, actualFilter, remoteLimit);
-                        if (indexLookup)
-                        {
-                            try
+                            IConstArrayOf<IFieldFilter> fieldFilters;  // These refer to the expected layout
+                            struct CIndexReadContext : implements IIndexReadContext
                             {
-                                indexLookup->ensureAvailable();
-                            }
-                            catch (IException *e)
+                                IConstArrayOf<IFieldFilter> &fieldFilters;
+                                CIndexReadContext(IConstArrayOf<IFieldFilter> &_fieldFilters) : fieldFilters(_fieldFilters)
+                                {
+                                }
+                                virtual void append(IKeySegmentMonitor *segment) override { throwUnexpected(); }
+                                virtual void append(FFoption option, const IFieldFilter * filter) override
+                                {
+                                    fieldFilters.append(*filter);
+                                }
+                            } context(fieldFilters);
+                            helper->createSegmentMonitors(&context);
+
+                            RowFilter actualFilter;
+                            Owned<const IKeyTranslator> keyedTranslator = createKeyTranslator(actualFormat->queryRecordAccessor(true), expectedFormat->queryRecordAccessor(true));
+                            if (keyedTranslator && keyedTranslator->needsTranslate())
+                                keyedTranslator->translate(actualFilter, fieldFilters);
+                            else
+                                actualFilter.appendFilters(fieldFilters);
+
+                            StringBuffer lPath;
+                            rfn.getLocalPath(lPath);
+                            Owned<IIndexLookup> indexLookup = createRemoteFilteredKey(ep, lPath, crc, actualFormat, projectedFormat, actualFilter, remoteLimit);
+                            if (indexLookup)
                             {
-#ifdef _DEBUG
-                                EXCLOG(e, nullptr);
-#endif
-                                if (remoteReadException)
-                                    e->Release(); // only record 1st
-                                else
+                                try
+                                {
+                                    indexLookup->ensureAvailable();
+                                }
+                                catch (IException *e)
                                 {
-                                    remoteReadException.setown(e);
-                                    remoteReadExceptionPath.set(path);
+    #ifdef _DEBUG
+                                    EXCLOG(e, nullptr);
+    #endif
+                                    if (remoteReadException)
+                                        e->Release(); // only record 1st
+                                    else
+                                    {
+                                        remoteReadException.setown(e);
+                                        remoteReadExceptionPath.set(path);
+                                    }
+                                    continue; // try next copy and ultimately failover to local when no more copies
                                 }
-                                continue; // try next copy and ultimately failover to local when no more copies
+                                ActPrintLog("[part=%d]: reading remote dafilesrv index '%s' (logical file = %s)", partNum, path.str(), logicalFilename.get());
+                                partNum = p;
+                                return indexLookup.getClear();
                             }
-                            ActPrintLog("[part=%d]: reading remote dafilesrv index '%s' (logical file = %s)", partNum, path.str(), logicalFilename.get());
-                            partNum = p;
-                            return indexLookup.getClear();
                         }
-                    }
-                    if (remoteReadException)
-                    {
-                        VStringBuffer msg("Remote streaming failure, failing over to direct read for: '%s'. ", remoteReadExceptionPath.str());
-                        remoteReadException->errorMessage(msg);
-                        Owned<IThorException> e2 = MakeActivityWarning(this, TE_RemoteReadFailure, "%s", msg.str());
-                        fireException(e2);
+                        if (remoteReadException)
+                        {
+                            VStringBuffer msg("Remote streaming failure, failing over to direct read for: '%s'. ", remoteReadExceptionPath.str());
+                            remoteReadException->errorMessage(msg);
+                            Owned<IThorException> e2 = MakeActivityWarning(this, TE_RemoteReadFailure, "%s", msg.str());
+                            fireException(e2);
+                        }
                     }
                 }
 
@@ -558,6 +560,88 @@ public:
         statsArr = _statsArr.getArray();
         lastSeeks = lastScans = 0;
         localMerge = (localKey && partDescs.ordinality()>1) || seekGEOffset;
+
+        if (parts)
+        {
+            IPartDescriptor &part0 = partDescs.item(0);
+            IFileDescriptor &fileDesc = part0.queryOwner();
+
+            if ((0 == (helper->getFlags() & TIRusesblob)) && !localMerge)
+            {
+                if (!inChildQuery())
+                    useRemoteStreaming = true;
+                else
+                {
+                    /*
+                     * If in a CQ, it is counterproductive to use an index read stream per CQ execution if the index
+                     * involved is relatively small.
+                     * Because, if the index is small and direct reading (and caching) key node pages, it is likely
+                     * that repeated executions will not read any (or few) new key pages (i.e. cache hit).
+                     *
+                     * Example: small 1-way key being remotely read by the whole cluster.
+                     * If it is small it will fit (or mostly fit) in node key cache, and thus mostly read from memory vs over the network etc.
+                     *
+                     */
+
+                    // # data parts excluding TLK if present
+                    unsigned totalNumDataParts = fileDesc.numParts();
+                    if ((totalNumDataParts>1) && !fileDesc.queryProperties().getPropBool("@local"))
+                        totalNumDataParts--; // TLK
+
+                    offset_t logicalFileSize = fileDesc.queryProperties().getPropInt64("@size"); // NB: size is compressed size
+                    if (!logicalFileSize) // not sure when/if this should ever be missing, but..
+                    {
+                        IWARNLOG("Missing @size in meta data for index file '%s'", logicalFilename.get());
+                        // estimate size based on physical size of 1st part
+                        RemoteFilename rfn;
+                        part0.getFilename(0, rfn);
+                        StringBuffer path;
+                        rfn.getPath(path);
+                        Owned<IFile> iFile = createIFile(path);
+                        offset_t partSize = iFile->size();
+                        logicalFileSize = partSize * totalNumDataParts;
+                    }
+
+                    memsize_t keyCacheSize = queryJob().getKeyNodeCacheSize() + queryJob().getKeyLeafCacheSize();
+                    memsize_t minRemoteCQIndexSizeMb = getOptInt64(THOROPT_MIN_REMOTE_CQ_INDEX_SIZE_MB);
+                    if (minRemoteCQIndexSizeMb)
+                    {
+                        // anything larger is streamed, anything smaller is read directly
+                        if (logicalFileSize > (minRemoteCQIndexSizeMb * 0x100000))
+                            useRemoteStreaming = true;
+                    }
+                    else // no min. size to stream set, so use a heuristic
+                    {
+                        /*
+                         * Rough heuristic.
+                         *
+                         * If (([average compressed part size] * [# parts handling] * [compressionMultiple] * [cacheSizeFitPercentage%]) > [keyCacheSize])
+                         *     then useRemoteStreaming = true
+                         *
+                         * i.e. if the [cacheSizeFitPercentage] % of total size of the compressed index data this slave is handling multiplied
+                         * by a rough compression multiplier [compressionMultiple] is larger than the cache size [keyCacheSize], then use streaming.
+                         * If not (useRemoteStreaming=false), direct read (and use the cache).
+                         *
+                         * The cacheSizeFitPercentage (25%) is used, so that the index has to be significantly bigger than the cache to use streaming,
+                         * because it is still worth directly reading on relatively small indexes, even if 1:4 cache hits are achieved.
+                         *
+                         */
+
+                        static const unsigned compressionMultiple = 10; // v. rough approx. of compression ratio (actual compression ratio/uncompressed size unknown)
+                        static const unsigned cacheSizeFitPercentage = 25; // if this much (%) of amount I'm handling fits into cache
+
+                        offset_t avgPartSize = logicalFileSize / totalNumDataParts;
+
+                        // NB: The # parts this slave is dealing with (partDescs.ordinality()) is equal to all data parts (totalNumDataParts) when in a CQ.
+                        offset_t myIndexPartSizeTotal = avgPartSize * partDescs.ordinality() * compressionMultiple;
+
+                        offset_t myIndexPartSizeHitShare = myIndexPartSizeTotal * cacheSizeFitPercentage / 100;
+                        if (myIndexPartSizeHitShare >= keyCacheSize) // e.g. if 25% of my handled index data is larger than cache
+                            useRemoteStreaming = true;
+                    }
+                }
+            }
+        }
     }
     // IThorDataLink
     virtual void start() override
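
For reference, a minimal standalone sketch of the heuristic the new init-time code applies (constants and variable names are taken from the diff above; the sample figures and the main() wrapper are hypothetical, not part of the commit):

    // Sketch: decide whether a child-query index read should use remote streaming.
    // Mirrors the heuristic branch added above; not the committed implementation.
    #include <cstdint>
    #include <cstdio>

    using offset_t = uint64_t;
    using memsize_t = size_t;

    static bool shouldStream(offset_t logicalFileSize, unsigned totalNumDataParts,
                             unsigned partsHandled, memsize_t keyCacheSize)
    {
        static const unsigned compressionMultiple = 10;    // rough compressed->uncompressed multiplier
        static const unsigned cacheSizeFitPercentage = 25; // stream only if well beyond the cache size

        offset_t avgPartSize = logicalFileSize / totalNumDataParts;
        offset_t myIndexPartSizeTotal = avgPartSize * partsHandled * compressionMultiple;
        offset_t myIndexPartSizeHitShare = myIndexPartSizeTotal * cacheSizeFitPercentage / 100;
        return myIndexPartSizeHitShare >= keyCacheSize;
    }

    int main()
    {
        // Hypothetical example: 2048 MB compressed index, 8 data parts, slave handles all 8 in a CQ,
        // 400 MB combined node+leaf key cache.
        bool stream = shouldStream(2048ULL * 0x100000, 8, 8, 400 * 0x100000);
        printf("useRemoteStreaming=%d\n", (int)stream); // prints 1: the index far exceeds the cache
        return 0;
    }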

+ 14 - 6
thorlcr/graph/thgraph.cpp

@@ -430,6 +430,11 @@ IThorGraphDependencyIterator *CGraphElementBase::getDependsIterator() const
     return new ArrayIIteratorOf<const CGraphDependencyArray, CGraphDependency, IThorGraphDependencyIterator>(dependsOn);
 }
 
+bool CGraphElementBase::inChildQuery() const
+{
+    return (nullptr != queryOwner().queryOwner()) && !queryOwner().isGlobal();
+}
+
 void CGraphElementBase::reset()
 {
     alreadyUpdated = false;
@@ -2729,12 +2734,15 @@ void CJobBase::startJob()
     setPerformanceMonitorHook(perfmonhook);
     PrintMemoryStatusLog();
     logDiskSpace();
-    unsigned keyNodeCacheMB = (unsigned)getWorkUnitValueInt("keyNodeCacheMB", DEFAULT_KEYNODECACHEMB * queryJobChannels());
-    unsigned keyLeafCacheMB = (unsigned)getWorkUnitValueInt("keyLeafCacheMB", DEFAULT_KEYLEAFCACHEMB * queryJobChannels());
-    unsigned keyBlobCacheMB = (unsigned)getWorkUnitValueInt("keyBlobCacheMB", DEFAULT_KEYBLOBCACHEMB * queryJobChannels());
-    setNodeCacheMem(keyNodeCacheMB * 0x100000);
-    setLeafCacheMem(keyLeafCacheMB * 0x100000);
-    setBlobCacheMem(keyBlobCacheMB * 0x100000);
+    unsigned keyNodeCacheMB = getWorkUnitValueInt("keyNodeCacheMB", DEFAULT_KEYNODECACHEMB * queryJobChannels());
+    unsigned keyLeafCacheMB = getWorkUnitValueInt("keyLeafCacheMB", DEFAULT_KEYLEAFCACHEMB * queryJobChannels());
+    unsigned keyBlobCacheMB = getWorkUnitValueInt("keyBlobCacheMB", DEFAULT_KEYBLOBCACHEMB * queryJobChannels());
+    keyNodeCacheBytes = ((memsize_t)0x100000) * keyNodeCacheMB;
+    keyLeafCacheBytes = ((memsize_t)0x100000) * keyLeafCacheMB;
+    keyBlobCacheBytes = ((memsize_t)0x100000) * keyBlobCacheMB;
+    setNodeCacheMem(keyNodeCacheBytes);
+    setLeafCacheMem(keyLeafCacheBytes);
+    setBlobCacheMem(keyBlobCacheBytes);
     PROGLOG("Key node caching setting: node=%u MB, leaf=%u MB, blob=%u MB", keyNodeCacheMB, keyLeafCacheMB, keyBlobCacheMB);
 
     unsigned keyFileCacheLimit = (unsigned)getWorkUnitValueInt("keyFileCacheLimit", 0);
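
The stored keyNodeCacheBytes/keyLeafCacheBytes/keyBlobCacheBytes values back the new getKey*CacheSize() getters added to thgraph.hpp below; as a side effect, doing the MB-to-bytes scaling in memsize_t also avoids 32-bit overflow for cache settings of 4096 MB or more on 64-bit builds. A minimal standalone illustration, using a hypothetical 5000 MB setting:

    // Sketch: narrow (32-bit) vs widened MB-to-bytes scaling; not code from the commit.
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        typedef size_t memsize_t;                              // stand-in for the platform typedef
        unsigned cacheMB = 5000;                               // hypothetical setting >= 4096 MB

        unsigned  narrowBytes = cacheMB * 0x100000;            // 32-bit product wraps around
        memsize_t wideBytes = ((memsize_t)0x100000) * cacheMB; // widened first, as in the new code

        printf("narrow=%u wide=%zu\n", narrowBytes, wideBytes);
        return 0;
    }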

+ 10 - 0
thorlcr/graph/thgraph.hpp

@@ -327,6 +327,7 @@ public:
     bool isPrepared() const { return prepared; }
 
     CGraphBase &queryOwner() const { return *owner; }
+    bool inChildQuery() const;
     CGraphBase *queryResultsGraph() const { return resultsGraph; }
     IGraphTempHandler *queryTempHandler(bool assert=true) const;
     CJobBase &queryJob() const;
@@ -832,6 +833,10 @@ protected:
     bool jobEnded = false;
     bool failOnLeaks = false;
     unsigned maxLfnBlockTimeMins = DEFAULT_MAXLFN_BLOCKTIME_MINS;
+    memsize_t keyNodeCacheBytes = 0;
+    memsize_t keyLeafCacheBytes = 0;
+    memsize_t keyBlobCacheBytes = 0;
+
 
     class CThorPluginCtx : public SimplePluginCtx
     {
@@ -863,6 +868,10 @@ public:
     inline unsigned queryJobSlaveChannelNum(unsigned slaveNum) const { dbgassertex(slaveNum && slaveNum<=querySlaves()); return jobSlaveChannelNum[slaveNum-1]; }
     ICommunicator &queryNodeComm() const { return ::queryNodeComm(); }
     const rank_t &queryMyNodeRank() const { return myNodeRank; }
+    memsize_t getKeyNodeCacheSize() const { return keyNodeCacheBytes; }
+    memsize_t getKeyLeafCacheSize() const { return keyLeafCacheBytes; }
+    memsize_t getKeyBlobCacheSize() const { return keyBlobCacheBytes; }
+
     void init();
     void setXGMML(IPropertyTree *_xgmml) { xgmml.set(_xgmml); }
     IPropertyTree *queryXGMML() { return xgmml; }
@@ -1084,6 +1093,7 @@ public:
     void cancelReceiveMsg(const rank_t rank, const mptag_t mpTag);
     bool firstNode() const { return 1 == container.queryJobChannel().queryMyRank(); }
     bool lastNode() const { return container.queryJob().querySlaves() == container.queryJobChannel().queryMyRank(); }
+    bool inChildQuery() const { return container.inChildQuery(); }
     unsigned queryMaxCores() const { return container.queryMaxCores(); }
     IThorRowInterfaces *getRowInterfaces();
     IEngineRowAllocator *getRowAllocator(IOutputMetaData * meta, roxiemem::RoxieHeapFlags flags=roxiemem::RHFnone, byte seq=0) const;

+ 1 - 1
thorlcr/thorutil/thormisc.hpp

@@ -98,7 +98,7 @@
 #define THOROPT_ACTINIT_WAITTIME_MINS "actInitWaitTimeMins"     // max time to wait for slave activity initialization message from master
 #define THOROPT_MAXLFN_BLOCKTIME_MINS "maxLfnBlockTimeMins"     // max time permitted to be blocked on a DFS logical file operation.
 #define THOROPT_VALIDATE_FILE_TYPE    "validateFileType"        // validate file type compatibility, e.g. if on fire error if XML reading CSV    (default = true)
-
+#define THOROPT_MIN_REMOTE_CQ_INDEX_SIZE_MB "minRemoteCQIndexSizeMb" // minimum size of index file to enable server side handling                (default = 0, meaning use heuristic to determine)
 
 #define INITIAL_SELFJOIN_MATCH_WARNING_LEVEL 20000  // max of row matches before selfjoin emits warning
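
THOROPT_MIN_REMOTE_CQ_INDEX_SIZE_MB is read via getOptInt64() in the thindexreadslave.cpp change above; when non-zero it bypasses the heuristic and applies an explicit size threshold. A minimal standalone sketch of that threshold check (the 512 MB option value and 900 MB index size are hypothetical):

    // Sketch: explicit-threshold path taken when minRemoteCQIndexSizeMb is set; not the committed code.
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint64_t minRemoteCQIndexSizeMb = 512;                // hypothetical non-zero option value
        uint64_t logicalFileSize = 900ULL * 0x100000;         // hypothetical 900 MB compressed index size
        bool useRemoteStreaming = logicalFileSize > (minRemoteCQIndexSizeMb * 0x100000);
        printf("useRemoteStreaming=%d\n", (int)useRemoteStreaming); // 900 MB > 512 MB -> 1
        return 0;
    }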