Pārlūkot izejas kodu

Merge pull request #13356 from ghalliday/issue23512

HPCC-23512 Warn if the input files to MERGE do not match the sort order

Reviewed-By: Shamser Ahmed <shamser.ahmed@lexisnexis.co.uk>
Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 5 gadi atpakaļ
vecāks
revīzija
65a2e67b27

+ 4 - 0
ecl/hql/hqlerrors.hpp

@@ -516,6 +516,10 @@
 #define HQLERR_UnnamedOutputInLibrary           3158
 #define HQLERR_AlienUseData                     3159
 #define HQLERR_LibraryParamNoFunctions          3160
+#define HQLWRN_MergeInputUnordered              3161
+#define HQLWRN_MergeInputPartiallyOrdered       3162
+#define HQLWRN_MergeInputIncompatible           3163
+#define HQLWRN_MergeInputLastMissing            3164
 
 #define HQLERR_DedupFieldNotFound_Text          "Field removed from dedup could not be found"
 #define HQLERR_CycleWithModuleDefinition_Text   "Module definition contains an illegal cycle/recursive definition %s"

+ 48 - 0
ecl/hqlcpp/hqlttcpp.cpp

@@ -12277,6 +12277,54 @@ IHqlExpression * HqlTreeNormalizer::transformMerge(IHqlExpression * expr)
     HqlExprArray children;
     transformChildren(expr, children);
 
+    if (!expr->hasAttribute(_implicitSorted_Atom))
+    {
+        //Check to see if the input datasets appear to be sorted by the merge sort criteria.  This must be done on the transformed
+        //expressions, otherwise there will be differences in the non-normalized form of fields etc.  It cannot be done earlier
+        //in the parser because many expressions will not have been substituted at that point.
+        bool hasLocal = isLocalActivity(expr);
+        bool isLocal = hasLocal || !translator.targetThor();
+        IHqlExpression * sortOrder = queryAttribute(sortedAtom, children);
+        ForEachItemIn(i, children)
+        {
+            IHqlExpression * cur = &children.item(i);
+            if (cur->isDataset())
+            {
+                Owned<IHqlExpression> dsOrder = getExistingSortOrder(cur, isLocal, true);
+                if (dsOrder)
+                {
+                    ForEachChild(iSort, sortOrder)
+                    {
+                        IHqlExpression * expected = sortOrder->queryChild(iSort);
+                        IHqlExpression * actual = dsOrder->queryChild(iSort);
+                        if (!actual || expected->queryBody() != actual->queryBody())
+                        {
+                            if (!actual || actual->isAttribute())
+                            {
+                                //Give a different warning if the last element of the sort order is missing - because it is possible
+                                //each of the inputs only has a single value for the other components and the last item is used
+                                //to control which order the streams are merged in.  Very obscure, but happens in regression suite.
+                                if (iSort +1 != sortOrder->numChildren())
+                                    translator.reportWarning(CategoryMistake, SeverityWarning, cur, HQLWRN_MergeInputPartiallyOrdered, "MERGE() argument %u appears to only be sorted by %u component(s)", i+1, iSort);
+                                else
+                                    translator.reportWarning(CategoryMistake, SeverityWarning, cur, HQLWRN_MergeInputLastMissing, "MERGE() argument %u appears to not be sorted by the last component of the merge order", i+1);
+                            }
+
+                            else
+                            {
+                                EclIR::dump_ir(expected, actual);
+                                translator.reportWarning(CategoryMistake, SeverityWarning, cur, HQLWRN_MergeInputIncompatible, "MERGE() argument %u appears to have a different sort order for component #%u", i+1, iSort+1);
+                            }
+                            break;
+                        }
+                    }
+                }
+                else
+                    translator.reportWarning(CategoryMistake, SeverityWarning, cur, HQLWRN_MergeInputUnordered, "MERGE() argument %u does not appear to be sorted", i+1);
+            }
+        }
+    }
+
     HqlExprArray args;
     reorderAttributesToEnd(args, children);
     return expr->clone(args);

+ 1 - 0
testing/regress/ecl/setup/setupsearchindex.ecl

@@ -18,6 +18,7 @@
 //class=textsearch
 
 #option ('checkAsserts',false);
+#onwarning (3164, ignore); // The merge of tokens has single entries from some of the datasets, and the merge criteria control the order between the input datasets
 
 import $.SetupText;
 import $.Options;

+ 4 - 3
testing/regress/ecl/setup/setuptext.ecl

@@ -137,7 +137,7 @@ convertDocumentStreamToTokens(dataset(inputDocumentRecord) inFile) := FUNCTION
 
     pattern emptyLine := ws*;
 
-    doProcess2 := parse(splitFile, text, emptyLine, createMatchPara(left), whole);
+    doProcess2 := sorted(parse(splitFile, text, emptyLine, createMatchPara(left), whole), doc, dpos);
 
     RETURN merge(sorted(doProcess1, doc, dpos), doProcess2, sorted(doc, dpos));
 END;
@@ -170,10 +170,11 @@ processSentanceAndParagraphMarkers(dataset(parseRecord) extractedWords, set of s
         SELF.wip := IF(isOpen, 1, 0);
         SELF := [];
     END;
-    implicitStarts := sorted(normalize(singlePerDoc, count(spanTags), createSpanTag(LEFT.doc, 0, true, COUNTER)), doc, dpos, kind);
-    implicitEnds := normalize(singlePerDoc, count(spanTags), createSpanTag(LEFT.doc, LEFT.maxDocPos+1, false, count(spanTags)+1-COUNTER));
+    implicitStarts := sorted(normalize(singlePerDoc, count(spanTags), createSpanTag(LEFT.doc, 0, true, COUNTER)), doc, dpos, wordKindSortOrder(kind, wip, original));
+    implicitEnds := sorted(normalize(singlePerDoc, count(spanTags), createSpanTag(LEFT.doc, LEFT.maxDocPos+1, false, count(spanTags)+1-COUNTER)), doc, dpos, wordKindSortOrder(kind, wip, original));
 
     //Combine non tags, with end,begin for sentance,paragraph and implicit begin sentance, end sentance etc. for whole document
+    //Each dataset can only have one entry for each (doc,dpos), but they should be merged in wordKindSortOrder() order
     cleaned := MERGE(implicitStarts, markerOpen, withoutMarkers, markerClose, implicitEnds, sorted(doc, dpos, wordKindSortOrder(kind, wip, original)));
     RETURN cleaned;
 END;

+ 1 - 0
testing/regress/ecl/setup/setupwordindex.ecl

@@ -16,6 +16,7 @@
 ############################################################################## */
 
 #option ('checkAsserts',false);
+#onwarning (3164, ignore); // The merge of tokens has single entries from some of the datasets, and the merge criteria control the order between the input datasets
 
 import $.SetupText;
 

+ 2 - 0
testing/regress/ecl/textsearch1.ecl

@@ -27,6 +27,8 @@ useLocal := #IFDEFINED(root.useLocal, false);
 //--- end of version configuration ---
 
 #option ('checkAsserts',false);
+#onwarning (3164, ignore);
+
 import $.Common.TextSearch;
 import $.Common.TextSearchQueries;
 

+ 2 - 0
testing/regress/ecl/textsearch1_thorlocal.ecl

@@ -25,6 +25,8 @@ useLocal := #IFDEFINED(root.useLocal, false);
 //--- end of version configuration ---
 
 #option ('checkAsserts',false);
+#onwarning (3164, ignore);
+
 import $.Common.TextSearch;
 import $.Common.TextSearchQueries;
 import $.Setup;

+ 2 - 0
testing/regress/ecl/textsearch2.ecl

@@ -25,6 +25,8 @@ multiPart := #IFDEFINED(root.multiPart, false);
 //--- end of version configuration ---
 
 #option ('checkAsserts',false);
+#onwarning (3164, ignore);
+
 import $.Common.TextSearch;
 import $.Common.TextSearchQueries;
 

+ 2 - 0
testing/regress/ecl/textsearch3.ecl

@@ -25,6 +25,8 @@ multiPart := #IFDEFINED(root.multiPart, false);
 //--- end of version configuration ---
 
 #option ('checkAsserts',false);
+#onwarning (3164, ignore);
+
 import $.Common.TextSearch;
 import $.Common.TextSearchQueries;
 

+ 2 - 0
testing/regress/ecl/textsearch4.ecl

@@ -26,6 +26,8 @@ multiPart := #IFDEFINED(root.multiPart, false);
 //--- end of version configuration ---
 
 #option ('checkAsserts',false);
+#onwarning (3164, ignore);
+
 import $.Setup.TS;
 import $.Common.TextSearch;
 import $.Common.TextSearchQueries;