소스 검색

Merge pull request #4959 from ghalliday/issue10147

HPCC-10147 Initial implementation of JOIN(..., GROUP(x,y))

Reviewed-By: Jamie Noss <james.noss@lexisnexis.com>
Reviewed-By: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 11 년 전
부모
커밋
21f8f0db2a

+ 1 - 0
ecl/hql/hqlerrors.hpp

@@ -421,6 +421,7 @@
 #define HQLERR_CannotAccessShared   2390
 #define HQLERR_CannotAccessShared   2390
 #define ERR_PluginNoScripting       2391
 #define ERR_PluginNoScripting       2391
 #define ERR_ZERO_SIZE_VIRTUAL       2392
 #define ERR_ZERO_SIZE_VIRTUAL       2392
+#define ERR_BAD_JOINGROUP_FIELD     2393
 
 
 #define ERR_ASSERTION_FAILS         100000
 #define ERR_ASSERTION_FAILS         100000
 
 

+ 9 - 0
ecl/hql/hqlgram.y

@@ -10147,6 +10147,15 @@ JoinFlag
                             $$.setExpr(createAttribute(unorderedAtom));
                             $$.setExpr(createAttribute(unorderedAtom));
                             $$.setPosition($1);
                             $$.setPosition($1);
                         }
                         }
+    | GROUP '(' startSortOrder heterogeneous_expr_list ')' endSortOrder
+                        {
+                            HqlExprArray args;
+                            $4.unwindCommaList(args);
+                            OwnedHqlExpr sortlist = createSortList(args);
+                            OwnedHqlExpr groupAttr = createExprAttribute(groupAtom, sortlist.getClear());
+                            OwnedHqlExpr impliedAttr = createComma(createAttribute(lookupAtom), createAttribute(manyAtom));
+                            $$.setExpr(createComma(groupAttr.getClear(), impliedAttr.getClear()), $1);
+                        }
     ;
     ;
 
 
 
 

+ 31 - 0
ecl/hql/hqlgram2.cpp

@@ -7790,6 +7790,37 @@ void HqlGram::checkJoinFlags(const attribute &err, IHqlExpression * join)
             reportWarning(ERR_BAD_JOINFLAG, err.pos, "Filtered RIGHT prevents a keyed join being used.  Consider including the filter in the join condition.");
             reportWarning(ERR_BAD_JOINFLAG, err.pos, "Filtered RIGHT prevents a keyed join being used.  Consider including the filter in the join condition.");
     }
     }
 
 
+    IHqlExpression * group = join->queryAttribute(groupAtom);
+    if (group)
+    {
+        //Check that each of the fields mentioned in the group are projected into the output.
+        OwnedHqlExpr left = createSelector(no_left, join->queryChild(0), querySelSeq(join));
+        OwnedHqlExpr right = createSelector(no_right, join->queryChild(1), querySelSeq(join));
+        NewProjectMapper2 mapper;
+        mapper.setMapping(join->queryChild(3));
+        IHqlExpression * sortlist = group->queryChild(0);
+        ForEachChild(i, sortlist)
+        {
+            IHqlExpression * cur = sortlist->queryChild(i);
+            if (cur->usesSelector(right))
+            {
+                StringBuffer s;
+                getExprECL(cur, s);
+                reportError(ERR_BAD_JOINGROUP_FIELD, err, "GROUP expression '%s' cannot include fields from RIGHT", s.str());
+            }
+            else
+            {
+                bool matchedAll = true;
+                OwnedHqlExpr mapped = mapper.collapseFields(cur, left, queryActiveTableSelector(), left, &matchedAll);
+                if (!matchedAll)
+                {
+                    StringBuffer s;
+                    getExprECL(cur, s);
+                    reportError(ERR_BAD_JOINGROUP_FIELD, err, "GROUP expression '%s' is not included in the JOIN output", s.str());
+                }
+            }
+        }
+    }
 }
 }
 
 
 
 

+ 9 - 1
ecl/hql/hqlmeta.cpp

@@ -2235,6 +2235,12 @@ void calculateDatasetMeta(CHqlMetaInfo & meta, IHqlExpression * expr)
                 }
                 }
                 else
                 else
                     meta.removeAllKeepGrouping();
                     meta.removeAllKeepGrouping();
+
+                //The grouping fields could be mapped using the transform to provide more information, but it is
+                //unlikely to provide scope for other optimizations, and it will soon be replaced with the expanded
+                //implementation which will track and map the information.
+                if (expr->queryAttribute(groupAtom))
+                    meta.setUnknownGrouping();
             }
             }
             else if (isLocal)
             else if (isLocal)
             {
             {
@@ -3098,7 +3104,9 @@ ITypeInfo * calculateDatasetType(node_operator op, const HqlExprArray & parms)
             bool isKeyedJoin = !isAllJoin && !isLookupJoin && !isSmartJoin && (queryAttribute(keyedAtom, parms) || isKey(&parms.item(1)));
             bool isKeyedJoin = !isAllJoin && !isLookupJoin && !isSmartJoin && (queryAttribute(keyedAtom, parms) || isKey(&parms.item(1)));
 
 
             recordArg = 3;
             recordArg = 3;
-            if (isKeyedJoin || isAllJoin || isLookupJoin)
+            if (queryAttribute(groupAtom, parms))
+                nowGrouped = true;
+            else if (isKeyedJoin || isAllJoin || isLookupJoin)
                 nowGrouped = isGrouped(datasetType);
                 nowGrouped = isGrouped(datasetType);
             else
             else
                 nowGrouped = false;
                 nowGrouped = false;

+ 95 - 2
ecl/hqlcpp/hqlttcpp.cpp

@@ -1939,7 +1939,10 @@ IHqlExpression * ThorHqlTransformer::createTransformed(IHqlExpression * expr)
     case no_selfjoin:
     case no_selfjoin:
     case no_denormalize:
     case no_denormalize:
     case no_denormalizegroup:
     case no_denormalizegroup:
-        normalized = normalizeJoinOrDenormalize(transformed);
+        if (transformed->hasAttribute(groupAtom))
+            normalized = normalizeJoinAndGroup(transformed);
+        else
+            normalized = normalizeJoinOrDenormalize(transformed);
         break;
         break;
     case no_cosort:
     case no_cosort:
     case no_sort:
     case no_sort:
@@ -2595,6 +2598,96 @@ static IHqlExpression * createDistributedInput(IHqlExpression * ds, const HqlExp
     return createDistributedInput(ds, sortlist, internal);
     return createDistributedInput(ds, sortlist, internal);
 }
 }
 
 
+/*
+
+Perform the following transformation:
+
+R := JOIN(l, r, LEFT.key = RIGHT.key AND fuzzy(LEFT,RIGHT), t(LEFT,RIGHT), GROUP(LEFT.id1, LEFT.id2), ATMOST(optional))
+
+DL := DISTRIBUTE(l, HASH(key));
+DR := DISTRIBUTE(r, HASH(key));
+SL := SORT(DL, id1, id2, LOCAL);    // Later replace this with a LEFTSORT() attribute on the join (so can optimize self join)
+//If it is a self join, the right input is SL rather than DR
+J := JOIN(SL, DR, LEFT.key = RIGHT.key AND fuzzy(LEFT,RIGHT), t(LEFT,RIGHT), LOOKUP MANY, LOCAL);
+DJ := DISTRIBUTE(J, HASH(leftid1, leftid2), MERGE(leftid1, leftid2));
+R := GROUP(DJ, leftid1, leftid2, LOCAL);
+
+*/
+
+IHqlExpression * ThorHqlTransformer::normalizeJoinAndGroup(IHqlExpression * expr)
+{
+    IHqlExpression * oldLeft = expr->queryChild(0);
+    IHqlExpression * oldRight = expr->queryChild(1);
+    LinkedHqlExpr newLeft = oldLeft;
+    LinkedHqlExpr newRight = oldRight;
+    IHqlExpression * groupOrder = queryAttributeChild(expr, groupAtom, 0);
+    node_operator op = expr->getOperator();
+
+    bool hasLocal = isLocalActivity(expr);
+    bool alwaysLocal = !translator.targetThor();
+    if (!hasLocal && !alwaysLocal)
+    {
+        JoinSortInfo joinInfo;
+        joinInfo.findJoinSortOrders(expr, false);
+
+        OwnedHqlExpr leftList = createValueSafe(no_sortlist, makeSortListType(NULL), joinInfo.queryLeftReq());
+        OwnedHqlExpr mappedLeftList = replaceSelector(leftList, queryActiveTableSelector(), newLeft->queryNormalizedSelector());
+        OwnedHqlExpr hashLeft = createValue(no_hash32, makeIntType(4, false), mappedLeftList.getClear());
+        newLeft.setown(createDataset(no_distribute, LINK(newLeft), LINK(hashLeft)));
+
+        if (oldRight == oldLeft)
+            newRight.set(newLeft);
+        else if (op != no_selfjoin)
+        {
+            OwnedHqlExpr rightList = createValueSafe(no_sortlist, makeSortListType(NULL), joinInfo.queryRightReq());
+            OwnedHqlExpr mappedRightList = replaceSelector(rightList, queryActiveTableSelector(), newRight->queryNormalizedSelector());
+            OwnedHqlExpr hashRight = createValue(no_hash32, makeIntType(4, false), mappedRightList.getClear());
+            newRight.setown(createDataset(no_distribute, LINK(newRight), LINK(hashRight)));
+        }
+    }
+
+    OwnedHqlExpr newLocalAttr = alwaysLocal ? NULL : createLocalAttribute();
+
+    //Sort the left hand dataset into grouping order.
+    assertex(groupOrder);
+    OwnedHqlExpr left = createSelector(no_left, expr->queryChild(0), querySelSeq(expr));
+    OwnedHqlExpr leftSortOrder = replaceSelector(groupOrder, left, newLeft);
+    newLeft.setown(createDatasetF(no_sort, newLeft.getClear(), LINK(leftSortOrder), LINK(newLocalAttr), NULL));
+
+    if (oldRight == oldLeft)
+        newRight.set(newLeft);
+
+    //Now create the modified join
+    HqlExprArray joinArgs;
+    joinArgs.append(*LINK(newLeft));
+    joinArgs.append(*LINK(newRight));
+    unwindChildren(joinArgs, expr, 2);
+    removeProperty(joinArgs, groupAtom);
+    if (!hasLocal && !alwaysLocal)
+        joinArgs.append(*createLocalAttribute());
+    OwnedHqlExpr newJoin = expr->clone(joinArgs);
+
+    //Now need to map the fields from the input dataset to the join output
+    NewProjectMapper2 mapper;
+    mapper.setMapping(newJoin->queryChild(3));
+    bool matchedAll = true;
+    OwnedHqlExpr mappedOrder = mapper.collapseFields(groupOrder, left, newJoin->queryNormalizedSelector(), left, &matchedAll);
+    assertex(matchedAll); // This is checked in the parser, so shouldn't be triggered here.
+
+    //Distribute the result
+    LinkedHqlExpr distributed = newJoin;
+    if (!hasLocal && !alwaysLocal)
+    {
+        OwnedHqlExpr hashOut = createValue(no_hash32, makeIntType(4, false), LINK(mappedOrder));
+        OwnedHqlExpr mergeOut = createExprAttribute(mergeAtom, LINK(mappedOrder));
+        distributed.setown(createDatasetF(no_distribute, LINK(newJoin), hashOut.getClear(), mergeOut.getClear(), NULL));
+    }
+
+    //And finally group it.
+    return createDatasetF(no_group, LINK(distributed), LINK(mappedOrder), LINK(newLocalAttr), NULL);
+}
+
+
 IHqlExpression * ThorHqlTransformer::normalizeJoinOrDenormalize(IHqlExpression * expr)
 IHqlExpression * ThorHqlTransformer::normalizeJoinOrDenormalize(IHqlExpression * expr)
 {
 {
     IHqlExpression * leftDs = expr->queryChild(0);
     IHqlExpression * leftDs = expr->queryChild(0);
@@ -4892,7 +4985,7 @@ void GlobalAttributeInfo::extractStoredInfo(IHqlExpression * expr, IHqlExpressio
         extraOutputAttr.setown(createComma(LINK(expr->queryAttribute(expireAtom)), LINK(expr->queryAttribute(clusterAtom))));
         extraOutputAttr.setown(createComma(LINK(expr->queryAttribute(expireAtom)), LINK(expr->queryAttribute(clusterAtom))));
         numPersistInstances = multiplePersistInstances ? -1 : 0;
         numPersistInstances = multiplePersistInstances ? -1 : 0;
         if (expr->hasAttribute(multipleAtom))
         if (expr->hasAttribute(multipleAtom))
-            numPersistInstances = getIntValue(queryAttributeChild(expr, multipleAtom, 0), -1);
+            numPersistInstances = (int)getIntValue(queryAttributeChild(expr, multipleAtom, 0), -1);
         else if (expr->hasAttribute(singleAtom))
         else if (expr->hasAttribute(singleAtom))
             numPersistInstances = 0;
             numPersistInstances = 0;
 
 

+ 1 - 0
ecl/hqlcpp/hqlttcpp.ipp

@@ -196,6 +196,7 @@ protected:
     IHqlExpression * normalizeDedup(IHqlExpression * expr);
     IHqlExpression * normalizeDedup(IHqlExpression * expr);
 //  IHqlExpression * normalizeIndexBuild(IHqlExpression * expr);
 //  IHqlExpression * normalizeIndexBuild(IHqlExpression * expr);
     IHqlExpression * normalizeGroup(IHqlExpression * expr);
     IHqlExpression * normalizeGroup(IHqlExpression * expr);
+    IHqlExpression * normalizeJoinAndGroup(IHqlExpression * expr);
     IHqlExpression * normalizeJoinOrDenormalize(IHqlExpression * expr);
     IHqlExpression * normalizeJoinOrDenormalize(IHqlExpression * expr);
     IHqlExpression * normalizeTableToAggregate(IHqlExpression * expr, bool canOptimizeCasts);
     IHqlExpression * normalizeTableToAggregate(IHqlExpression * expr, bool canOptimizeCasts);
     IHqlExpression * normalizeTableGrouping(IHqlExpression * expr);
     IHqlExpression * normalizeTableGrouping(IHqlExpression * expr);

+ 34 - 0
ecl/regress/groupjoin1_err.ecl

@@ -0,0 +1,34 @@
+
+namesRec := RECORD
+    string name;
+    string addr;
+END;
+
+nameDataset := DATASET([
+    { 'Smith', 'Apple street' },
+    { 'Smith', 'Banana road' },
+    { 'Smith', 'Date street' },
+    { 'Jones', 'Banana road' },
+    { 'Jones', 'Cherry street' },
+    { 'Jones', 'Date street' },
+    { 'Bloggs', 'Cherry steet' },
+    { 'Bloggs', 'Eggplant ave' }
+    ], namesRec);
+    
+outRec := RECORD
+    string lname;
+    string rname;
+END;    
+
+outRec makeOut(namesRec l, namesRec r) := TRANSFORM
+    SELF.lname := l.name;
+    SELF.rname := r.name;
+END;
+
+//Error: RIGHT.name isn't legal because you can only group on fields from LEFT
+//Error: LEFT.addr isn't legal because it isn't projected into the output
+j := JOIN(nameDataset, nameDataset, LEFT.addr = RIGHT.addr AND LEFT.name != RIGHT.name, makeOut(LEFT, RIGHT), GROUP(LEFT.name, RIGHT.name, LEFT.addr));
+
+r := TABLE(j, { lname, rname, unsigned cnt := COUNT(GROUP) });
+output(ungroup(r));
+output(r(cnt > 1));

+ 34 - 0
ecl/regress/groupjoin2_err.ecl

@@ -0,0 +1,34 @@
+
+namesRec := RECORD
+    string name;
+    string addr;
+END;
+
+nameDataset := DATASET([
+    { 'Smith', 'Apple street' },
+    { 'Smith', 'Banana road' },
+    { 'Smith', 'Date street' },
+    { 'Jones', 'Banana road' },
+    { 'Jones', 'Cherry street' },
+    { 'Jones', 'Date street' },
+    { 'Bloggs', 'Cherry steet' },
+    { 'Bloggs', 'Eggplant ave' }
+    ], namesRec);
+    
+outRec := RECORD
+    string lname;
+    string rname;
+END;    
+
+outRec makeOut(namesRec l, namesRec r) := TRANSFORM
+    SELF.lname := l.name;
+    SELF.rname := r.name;
+END;
+
+// Error: The expressions referred to in the GROUP are members of LEFT, not the output of the transform.
+// This syntax might be preferable, but would require a new parser to allow the dynamic scoping of the names
+j := JOIN(nameDataset, nameDataset, LEFT.addr = RIGHT.addr AND LEFT.name != RIGHT.name, makeOut(LEFT, RIGHT), GROUP(lname, rname));
+
+r := TABLE(j, { lname, rname, unsigned cnt := COUNT(GROUP) });
+output(ungroup(r));
+output(r(cnt > 1));

+ 36 - 0
testing/ecl/groupjoin1.ecl

@@ -0,0 +1,36 @@
+
+namesRec := RECORD
+    string name;
+    string addr;
+END;
+
+nameDataset := DATASET([
+    { 'Smith', 'Apple street' },
+    { 'Smith', 'Banana road' },
+    { 'Smith', 'Date street' },
+    { 'Jones', 'Banana road' },
+    { 'Jones', 'Cherry street' },
+    { 'Jones', 'Date street' },
+    { 'Bloggs', 'Cherry street' },
+    { 'Bloggs', 'Eggplant ave' }
+    ], namesRec);
+    
+outRec := RECORD
+    string lname;
+    string rname;
+END;    
+
+outRec makeOut(namesRec l, namesRec r) := TRANSFORM
+    SELF.lname := l.name;
+    SELF.rname := r.name;
+END;
+
+j := JOIN(nameDataset, nameDataset, LEFT.addr = RIGHT.addr AND LEFT.name != RIGHT.name, makeOut(LEFT, RIGHT), GROUP(LEFT.name));
+
+s := SORT(j, rname);    // sort the groups.
+gr2 := GROUP(s, lname, rname, local);
+r := TABLE(gr2, { lname, rname, unsigned cnt := COUNT(GROUP) });
+
+sr := sort(NOFOLD(r), lname, rname);
+output(sr);
+output(sr(cnt > 1));

+ 10 - 0
testing/ecl/key/groupjoin1.xml

@@ -0,0 +1,10 @@
+<Dataset name='Result 1'>
+ <Row><lname>Bloggs</lname><rname>Jones</rname><cnt>1</cnt></Row>
+ <Row><lname>Jones</lname><rname>Bloggs</rname><cnt>1</cnt></Row>
+ <Row><lname>Jones</lname><rname>Smith</rname><cnt>2</cnt></Row>
+ <Row><lname>Smith</lname><rname>Jones</rname><cnt>2</cnt></Row>
+</Dataset>
+<Dataset name='Result 2'>
+ <Row><lname>Jones</lname><rname>Smith</rname><cnt>2</cnt></Row>
+ <Row><lname>Smith</lname><rname>Jones</rname><cnt>2</cnt></Row>
+</Dataset>