Browse Source

HPCC-21254 Option -fcheckDuplicateThreshold=<n> to help spot inefficient queries

Signed-off-by: Gavin Halliday <gavin.halliday@lexisnexis.com>
Gavin Halliday 6 years ago
parent
commit
1757a86e10

+ 1 - 0
ecl/eclcc/eclcc.hpp

@@ -131,6 +131,7 @@ const char * const helpText[] = {
     "?!  -fapplyInstantEclTransformations  Limit non-file outputs with a CHOOSEN",
     "?!  -fapplyInstantEclTransformationsLimit  Number of rows to limit outputs to",
     "?!  -fcheckAsserts          Check ASSERT() statements",
+    "?!  -fcheckDuplicateThreshold=n Warn if SEQUENTIAL or workflow may duplicate more than n%% activities",
     "?!  -fexportDependencies    Generate information about inter-definition dependencies",
     "?!  -fmaxCompileThreads     Number of compiler instances to compile the c++",
     "?!  -fmaxErrors             Maximum number of errors to report",

+ 105 - 0
ecl/hql/hqltrans.cpp

@@ -4918,6 +4918,111 @@ bool containsExternalParameter(IHqlExpression * expr, IHqlExpression * params)
 
 //------------------------------------------------------------------------------------------------
 
+//Count the number of unique expressions in a tree - it does not have to be accurate, so based on
+//QuickHqlTransformer rather than NewHqlTransformer for speed (to reduce info allocations).
+static HqlTransformerInfo expressionCountAnalyserInfo("ExpressionCountAnalyser");
+class ExpressionCountAnalyser : public QuickHqlTransformer
+{
+public:
+    ExpressionCountAnalyser(bool _onlyActivities, bool _processColon, bool _processSequential) :
+        QuickHqlTransformer(expressionCountAnalyserInfo, NULL),
+        onlyActivities(_onlyActivities), processColon(_processColon), processSequential(_processSequential)
+    {
+    }
+
+    virtual void doAnalyse(IHqlExpression * expr);
+
+    unsigned __int64 getNumExprs() const { return numExpressions; }
+
+protected:
+    HqlExprCopyArray expanded;
+    bool onlyActivities;
+    bool processColon;
+    bool processSequential;
+    unsigned __int64 numExpressions = 0;
+};
+
+void ExpressionCountAnalyser::doAnalyse(IHqlExpression * expr)
+{
+    //This will only be called if it has not been visited already.
+    IHqlExpression * body = expr->queryBody();
+    node_operator op = body->getOperator();
+    switch (op)
+    {
+    case no_colon:
+    {
+        if (processColon)
+        {
+            if (!expanded.contains(*body))
+            {
+                expanded.append(*body);
+                //Enter a new nested transform so that any expressions previously matched will not be short circuited by analyse()
+                TransformMutexBlock nested;
+                QuickHqlTransformer::doAnalyse(body);
+            }
+            return;
+        }
+        break;
+    }
+    case no_sequential:
+    {
+        if (processSequential)
+        {
+            if (expr->numChildren())
+            {
+                analyse(expr->queryChild(0));
+                ForEachChildFrom(i, expr, 1)
+                {
+                    //Ensure any expressions in subsequent SEQUENTIAL arguments will not be commoned up
+                    TransformMutexBlock nested;
+                    analyse(expr->queryChild(i));
+                }
+            }
+            return;
+        }
+        break;
+    }
+    case no_record:
+    case no_field:
+        return;
+    case no_select:
+        if (!isNewSelector(body))
+            return;
+        break;
+    }
+
+    if (onlyActivities)
+    {
+        if (body->isDataset() || body->isAction())
+        {
+            if (op != no_assign)
+                numExpressions++;
+        }
+    }
+    else
+        numExpressions++;
+
+    QuickHqlTransformer::doAnalyse(body);
+}
+
+
+unsigned getExpressionCount(const HqlExprArray & exprs, bool onlyActivities, bool processColon, bool processSequential)
+{
+    ExpressionCountAnalyser analyser(onlyActivities, processColon, processSequential);
+    analyser.analyseArray(exprs);
+    return analyser.getNumExprs();
+}
+
+unsigned getExpressionCount(IHqlExpression * expr, bool onlyActivities, bool processColon, bool processSequential)
+{
+    ExpressionCountAnalyser analyser(onlyActivities, processColon, processSequential);
+    analyser.analyse(expr);
+    return analyser.getNumExprs();
+}
+
+//------------------------------------------------------------------------------------------------
+
+
 /*
 
 Some notes on selector ambiguity.

+ 2 - 0
ecl/hql/hqltrans.ipp

@@ -1242,6 +1242,8 @@ extern HQL_API IHqlExpression * quickFullReplaceExpressions(IHqlExpression * exp
 extern HQL_API void dbglogTransformStats(bool reset);
 extern HQL_API void gatherTransformStats(IStatisticTarget & target);
 extern HQL_API void clearTransformStats();
+extern HQL_API unsigned getExpressionCount(IHqlExpression * expr, bool onlyActivities, bool processColon, bool processSequential);
+extern HQL_API unsigned getExpressionCount(const HqlExprArray & exprs, bool onlyActivities, bool processColon, bool processSequential);
 
 #ifdef OPTIMIZE_TRANSFORM_ALLOCATOR
 size32_t beginTransformerAllocator();

+ 1 - 0
ecl/hqlcpp/hqlcerrors.hpp

@@ -273,6 +273,7 @@
 #define HQLWRN_OutputScalarInsideChildQuery     4547
 #define HQLWRN_GlobalDatasetFromChildQuery      4548
 #define HQLWRN_NestedSequentialUseOrdered       4214
+#define HQLWRN_ExpressionsDuplicated            4549
 
 //Temporary errors
 #define HQLERR_OrderOnVarlengthStrings          4601

+ 2 - 0
ecl/hqlcpp/hqlcpp.cpp

@@ -1815,6 +1815,8 @@ void HqlCppTranslator::cacheOptions()
         DebugOption(options.transformNestedSequential, "transformNestedSequential", true),
         DebugOption(options.forceAllProjectedDiskSerialized, "internalForceAllProjectedDiskSerialized", false),  // Delete in 8.0 once new code has been proved in anger
         DebugOption(options.newIndexReadMapping, "newIndexReadMapping", false), // Not yet enabled due to problems with merging mapped fields and roxie/thor integration
+        DebugOption(options.checkDuplicateThreshold, "checkDuplicateThreshold", 0), // If non zero, create a warning if duplicates > this percentage increase
+        DebugOption(options.checkDuplicateMinActivities, "checkDuplicateMinActivities", 100),
     };
 
     //get options values from workunit

+ 5 - 1
ecl/hqlcpp/hqlcpp.ipp

@@ -613,8 +613,10 @@ struct HqlCppOptions
     unsigned            searchDistanceThreshold;
     unsigned            generateActivityThreshold;    // Record activities which take more than this value (in ms) to generate (0 disables)
     cycle_t             generateActivityThresholdCycles;
-   int                 defaultNumPersistInstances;
+    int                 defaultNumPersistInstances;
     unsigned            reportDFSinfo;
+    unsigned            checkDuplicateThreshold;
+    unsigned            checkDuplicateMinActivities;
     CompilerType        targetCompiler;
     DBZaction           divideByZeroAction;
     bool                peephole;
@@ -1899,6 +1901,8 @@ protected:
 
     void ensureSerialized(BuildCtx & ctx, const CHqlBoundTarget & variable);
 
+    void checkWorkflowDuplication(HqlExprArray & exprs);
+
     void doBuildExprRowDiff(BuildCtx & ctx, const CHqlBoundTarget & target, IHqlExpression * expr, IHqlExpression * leftSelector, IHqlExpression * rightRecord, IHqlExpression * rightSelector, StringBuffer & selectorText, bool isCount);
     void doBuildExprRowDiff(BuildCtx & ctx, IHqlExpression * expr, CHqlBoundExpr & tgt);
 

+ 19 - 0
ecl/hqlcpp/hqlttcpp.cpp

@@ -13896,6 +13896,21 @@ IHqlExpression * substituteClusterSize(unsigned numNodes, IHqlExpression * expr,
     return transformer.transformRoot(expr);
 }
 
+void HqlCppTranslator::checkWorkflowDuplication(HqlExprArray & exprs)
+{
+    const unsigned minActivities = options.checkDuplicateMinActivities;
+    const unsigned threshold = 100 + options.checkDuplicateThreshold;
+    unsigned __int64 rawCount = getExpressionCount(exprs, true, false, false);
+    if (rawCount < minActivities)
+        return;
+
+    unsigned __int64 sequentialCount = getExpressionCount(exprs, true, false, true);
+    unsigned __int64 workflowCount = getExpressionCount(exprs, true, true, false);
+    if (workflowCount * 100 > rawCount * threshold)
+        reportWarning(CategoryEfficiency, HQLWRN_ExpressionsDuplicated, "Warning workflow actions may significantly increase the graph size (estimate %u%%)", (unsigned)(workflowCount * 100 /rawCount) - 100);
+    if (sequentialCount * 100 > rawCount * threshold)
+        reportWarning(CategoryEfficiency, HQLWRN_ExpressionsDuplicated, "Warning sequential actions may significantly increase the graph size (estimate ~%u%%)", (unsigned)(sequentialCount * 100 /rawCount) - 100);
+}
 
 
 void HqlCppTranslator::substituteClusterSize(HqlExprArray & exprs)
@@ -13941,6 +13956,10 @@ void HqlCppTranslator::normalizeGraphForGeneration(HqlExprArray & exprs, HqlQuer
         expandDelayedFunctionCalls(&queryErrorProcessor(), exprs);
     }
 
+    //This option defaults off - it checks if use of workflow or sequential is likely to increase query size significantly
+    if (options.checkDuplicateThreshold != 0)
+        checkWorkflowDuplication(exprs);
+
     {
         traceExpressions("before normalize", exprs);
         normalizeHqlTree(*this, exprs);