Browse Source

HPCC-12982 Add a statistic of the number of key duplicates for an index build

Signed-off-by: Shamser Ahmed <shamser.ahmed@lexisnexis.co.uk>
Shamser Ahmed 7 years ago
parent
commit
3a97ec1432

+ 3 - 0
ecl/hthor/hthor.cpp

@@ -1128,6 +1128,8 @@ void CHThorIndexWriteActivity::execute()
             reccount++;
         }
         builder->finish(metadata, &fileCrc);
+        duplicateKeyCount = builder->getDuplicateCount();
+        cummulativeDuplicateKeyCount += duplicateKeyCount;
         out->flush();
         out.clear();
     }
@@ -1187,6 +1189,7 @@ void CHThorIndexWriteActivity::execute()
     properties.setProp("@owner", agent.queryWorkUnit()->queryUser());
     properties.setProp("@workunit", agent.queryWorkUnit()->queryWuid());
     properties.setProp("@job", agent.queryWorkUnit()->queryJobName());
+    properties.setPropInt64("@duplicateKeyCount",duplicateKeyCount);
     char const * rececl = helper.queryRecordECL();
     if(rececl && *rececl)
         properties.setProp("ECL", rececl);

+ 10 - 1
ecl/hthor/hthor.ipp

@@ -374,16 +374,25 @@ class CHThorIndexWriteActivity : public CHThorActivityBase
     Owned<IFile> file;
     bool incomplete;
     offset_t sizeLimit;
+    unsigned __int64 duplicateKeyCount = 0;
+    unsigned __int64 cummulativeDuplicateKeyCount = 0;
 
     void close();
     void buildUserMetadata(Owned<IPropertyTree> & metadata);
     void buildLayoutMetadata(Owned<IPropertyTree> & metadata);
+    virtual void updateProgress(IStatisticGatherer &progress) const override
+    {
+        CHThorActivityBase::updateProgress(progress);
+        StatsActivityScope scope(progress, activityId);
+        progress.addStatistic(StNumDuplicateKeys, cummulativeDuplicateKeyCount);
+    }
+
 public:
     IMPLEMENT_SINKACTIVITY;
 
     CHThorIndexWriteActivity(IAgentContext &agent, unsigned _activityId, unsigned _subgraphId, IHThorIndexWriteArg &_arg, ThorActivityKind _kind);
     ~CHThorIndexWriteActivity();
-    virtual void execute();
+    virtual void execute() override;
 };
 
 class IPipeWriteOwner

+ 11 - 1
roxie/ccd/ccdserver.cpp

@@ -370,6 +370,7 @@ static const StatisticsMapping diskStatistics(&actStatistics, {StNumServerCacheH
 static const StatisticsMapping soapStatistics(&actStatistics, { StTimeSoapcall });
 static const StatisticsMapping groupStatistics(&actStatistics, { StNumGroups, StNumGroupMax });
 static const StatisticsMapping sortStatistics(&actStatistics, { StTimeSortElapsed });
+static const StatisticsMapping indexWriteStatistics(&actStatistics, { StNumDuplicateKeys });
 
 //=================================================================================
 
@@ -11920,6 +11921,8 @@ class CRoxieServerIndexWriteActivity : public CRoxieServerInternalSinkActivity,
     unsigned __int64 reccount;
     unsigned int fileCrc;
     StringBuffer filename;
+    unsigned __int64 duplicateKeyCount = 0;
+    unsigned __int64 cummulativeDuplicateKeyCount = 0;
 
     void updateWorkUnitResult()
     {
@@ -12120,6 +12123,8 @@ public:
                 }
                 reccount++;
             }
+            duplicateKeyCount = builder->getDuplicateCount();
+            cummulativeDuplicateKeyCount += duplicateKeyCount;
             builder->finish(metadata, &fileCrc);
         }
     }
@@ -12138,6 +12143,7 @@ public:
 
     virtual void reset()
     {
+        noteStatistic(StNumDuplicateKeys, cummulativeDuplicateKeyCount);
         CRoxieServerActivity::reset();
         writer.clear();
     }
@@ -12191,6 +12197,7 @@ public:
         properties.setProp("@kind", "key");
         properties.setPropInt64("@size", indexFileSize);
         properties.setPropInt64("@recordCount", reccount);
+        properties.setPropInt64("@duplicateKeyCount", duplicateKeyCount);
         WorkunitUpdate workUnit = ctx->updateWorkUnit();
         if (workUnit)
         {
@@ -12262,7 +12269,10 @@ public:
     {
         return true;
     }
-
+    virtual const StatisticsMapping &queryStatsMapping() const
+    {
+        return indexWriteStatistics; // Overridden by anyone that needs more
+    }
 };
 
 IRoxieServerActivityFactory *createRoxieServerIndexWriteActivityFactory(unsigned _id, unsigned _subgraphId, IQueryFactory &_queryFactory, HelperFactory *_helperFactory, ThorActivityKind _kind, IPropertyTree &_graphNode, bool _isRoot)

+ 9 - 0
system/jhtree/keybuild.cpp

@@ -335,6 +335,7 @@ class CKeyBuilder : public CKeyBuilderBase, implements IKeyBuilder
 private:
     CWriteNode *activeNode;
     CBlobWriteNode *activeBlobNode;
+    unsigned __int64 duplicateCount;
 
 public:
     IMPLEMENT_IINTERFACE;
@@ -345,6 +346,7 @@ public:
         doCrc = true;
         activeNode = NULL;
         activeBlobNode = NULL;
+        duplicateCount = 0;
     }
 
 public:
@@ -402,6 +404,11 @@ public:
             activeNode = new CWriteNode(nextPos, keyHdr, true);
             nextPos += keyHdr->getNodeSize();
         }
+        else
+        {
+            if (memcmp(keyData,activeNode->getLastKeyValue(),keyedSize)==0)
+                ++duplicateCount;
+        }
         if (!activeNode->add(pos, keyData, recsize, sequence))
         {
             assertex(NULL != activeNode->getLastKeyValue()); // empty and doesn't fit!
@@ -454,6 +461,8 @@ public:
         return head;
     }
 
+    unsigned __int64 getDuplicateCount() { return duplicateCount; };
+
 protected:
     void writeMetadata(char const * data, size32_t size)
     {

+ 1 - 0
system/jhtree/keybuild.hpp

@@ -98,6 +98,7 @@ interface IKeyBuilder : public IInterface
     virtual void processKeyData(const char *keyData, offset_t pos, size32_t recsize) = 0;
     virtual void addLeafInfo(CNodeInfo *info) = 0;
     virtual unsigned __int64 createBlob(size32_t size, const char * _ptr) = 0;
+    virtual unsigned __int64 getDuplicateCount() = 0;
 };
 
 extern jhtree_decl IKeyBuilder *createKeyBuilder(IFileIOStream *_out, unsigned flags, unsigned rawSize, unsigned nodeSize, unsigned keyFieldSize, unsigned __int64 startSequence);

+ 1 - 0
system/jlib/jstatcodes.h

@@ -206,6 +206,7 @@ enum StatisticKind
     StNumTransformExprs,
     StNumUniqueAnalyseExprs,
     StNumUniqueTransformExprs,
+    StNumDuplicateKeys,                 // When generating index, number of duplicate keys found
     StMax,
 
     //For any quantity there is potentially the following variants.

+ 1 - 0
system/jlib/jstats.cpp

@@ -829,6 +829,7 @@ static const StatisticMeta statsMetaData[StMax] = {
     { NUMSTAT(TransformExprs) },
     { NUMSTAT(UniqueAnalyseExprs) },
     { NUMSTAT(UniqueTransformExprs) },
+    { NUMSTAT(DuplicateKeys) },
 };
 
 

+ 9 - 0
thorlcr/activities/indexwrite/thindexwrite.cpp

@@ -30,6 +30,8 @@
 class IndexWriteActivityMaster : public CMasterActivity
 {
     rowcount_t recordsProcessed;
+    unsigned __int64 duplicateKeyCount = 0;
+    unsigned __int64 cummulativeDuplicateKeyCount = 0;
     Owned<IFileDescriptor> fileDesc;
     bool buildTlk, isLocal, singlePartKey;
     StringArray clusters;
@@ -242,11 +244,13 @@ public:
         IHThorIndexWriteArg *helper = (IHThorIndexWriteArg *)queryHelper();
         updateActivityResult(container.queryJob().queryWorkUnit(), helper->getFlags(), helper->getSequence(), fileName, recordsProcessed);
 
+        cummulativeDuplicateKeyCount += duplicateKeyCount;
         // MORE - add in the extra entry somehow
         if (fileName.get())
         {
             IPropertyTree &props = fileDesc->queryProperties();
             props.setPropInt64("@recordCount", recordsProcessed);
+            props.setPropInt64("@duplicateKeyCount", duplicateKeyCount);
             props.setProp("@kind", "key");
             if (0 != (helper->getFlags() & TIWexpires))
                 setExpiryTime(props, helper->getExpiryDays());
@@ -272,8 +276,11 @@ public:
         if (mb.length()) // if 0 implies aborted out from this slave.
         {
             rowcount_t r;
+            unsigned __int64 slaveDuplicateKeyCount;
             mb.read(r);
+            mb.read(slaveDuplicateKeyCount);
             recordsProcessed += r;
+            duplicateKeyCount += slaveDuplicateKeyCount;
             if (!singlePartKey || 0 == slaveIdx)
             {
                 IPartDescriptor *partDesc = fileDesc->queryPart(slaveIdx);
@@ -324,6 +331,7 @@ public:
                 checkSuperFileOwnership(*file);
             }
         }
+        duplicateKeyCount = 0;
     }
     virtual void deserializeStats(unsigned node, MemoryBuffer &mb)
     {
@@ -335,6 +343,7 @@ public:
     virtual void getActivityStats(IStatisticGatherer & stats)
     {
         CMasterActivity::getActivityStats(stats);
+        stats.addStatistic(StNumDuplicateKeys, cummulativeDuplicateKeyCount);
         if (publishReplicatedDone)
         {
             replicateProgress->processInfo();

+ 5 - 0
thorlcr/activities/indexwrite/thindexwriteslave.cpp

@@ -51,6 +51,7 @@ class IndexWriteSlaveActivity : public ProcessSlaveActivity, public ILookAheadSt
     unsigned __int64 totalCount;
 
     size32_t maxDiskRecordSize, lastRowSize, firstRowSize;
+    unsigned __int64 duplicateKeyCount;
     MemoryBuffer rowBuff;
     OwnedConstThorRow lastRow, firstRow;
     bool needFirstRow, enableTlkPart0, receivingTag2;
@@ -86,6 +87,7 @@ public:
         refactor = false;
         enableTlkPart0 = (0 != container.queryJob().getWorkUnitValueInt("enableTlkPart0", globals->getPropBool("@enableTlkPart0", true)));
         reInit = (0 != (TIWvarfilename & helper->getFlags()));
+        duplicateKeyCount = 0;
     }
     virtual void init(MemoryBuffer &data, MemoryBuffer &slaveData) override
     {
@@ -363,6 +365,7 @@ public:
                     close(*partDesc, partCrc);
                     throw;
                 }
+                duplicateKeyCount = builder->getDuplicateCount();
                 close(*partDesc, partCrc);
                 stop();
             }
@@ -421,6 +424,7 @@ public:
                     close(*partDesc, partCrc);
                     throw;
                 }
+                duplicateKeyCount = builder->getDuplicateCount();
                 close(*partDesc, partCrc);
                 stop();
 
@@ -558,6 +562,7 @@ public:
             return;
         rowcount_t _processed = processed & THORDATALINK_COUNT_MASK;
         mb.append(_processed);
+        mb.append(duplicateKeyCount);
         if (!singlePartKey || firstNode())
         {
             StringBuffer partFname;