瀏覽代碼

HPCC-14402 Splitting strings efficiently

Implementation of new REGEXFINDSET builtin function.

Usage: REGEXFINDSET(regular expression, string) returns the set of matching sub-strings

Example code to return words in string
str := 'this hello, this is a a test hello this';
r := REGEXFINDSET('[[:alpha:]]+', str);

UNICODE str2 := U'this hello, this is a a test hello this';
r2 := REGEXFINDSET(U'[[:alpha:]]+', str2);

Signed-off-by: Shamser Ahmed <shamser.ahmed@lexisnexis.co.uk>
Shamser Ahmed 9 年之前
父節點
當前提交
232fd03b15

+ 2 - 1
ecl/hql/hqlattr.cpp

@@ -190,6 +190,7 @@ unsigned getOperatorMetaFlags(node_operator op)
     case no_matchunicode:
     case no_matchutf8:
     case no_regex_find:
+    case no_regex_findset:
     case no_regex_replace:
     case no_toxml:
     case no_tojson:
@@ -630,7 +631,7 @@ unsigned getOperatorMetaFlags(node_operator op)
     case no_unused30: case no_unused31: case no_unused32: case no_unused33: case no_unused34: case no_unused35: case no_unused36: case no_unused37: case no_unused38:
     case no_unused40: case no_unused41: case no_unused42: case no_unused43: case no_unused44: case no_unused45: case no_unused46: case no_unused47: case no_unused48: case no_unused49:
     case no_unused50: case no_unused52:
-    case no_unused80: case no_unused83:
+    case no_unused80:
     case no_unused102:
     case no_is_null:
     case no_position:

+ 2 - 1
ecl/hql/hqlexpr.cpp

@@ -1145,6 +1145,7 @@ const char *getOpString(node_operator op)
     case no_realformat: return "REALFORMAT";
     case no_intformat: return "INTFORMAT";
     case no_regex_find: return "REGEXFIND";
+    case no_regex_findset: return "REGEXFINDSET";
     case no_regex_replace: return "REGEXREPLACE";
 
     case no_current_date: return "current_date";
@@ -1529,7 +1530,6 @@ const char *getOpString(node_operator op)
     case no_unused50: case no_unused52:
     case no_unused80:
     case no_unused81:
-    case no_unused83:
     case no_unused102:
         return "unused";
     /* if fail, use "hqltest -internal" to find out why. */
@@ -1779,6 +1779,7 @@ int getPrecedence(node_operator op)
     case no_intformat:
     case no_realformat:
     case no_regex_find:
+    case no_regex_findset:
     case no_regex_replace:
     case no_fromunicode:
     case no_tounicode:

+ 1 - 1
ecl/hql/hqlexpr.hpp

@@ -538,7 +538,7 @@ enum _node_operator {
         no_outputscalar,
         no_matchunicode,
         no_pat_validate,
-   no_unused83,
+        no_regex_findset,
         no_existsgroup,
         no_pat_use,
         no_unused13,

+ 35 - 0
ecl/hql/hqlfold.cpp

@@ -2154,6 +2154,41 @@ IHqlExpression * foldConstantOperator(IHqlExpression * expr, unsigned foldOption
             }
             break;
         }
+    case no_regex_findset:
+        {
+            IValue * v0 = expr->queryChild(0)->queryValue();
+            IValue * v1 = expr->queryChild(1)->queryValue();
+            if (v0 && v1)
+            {
+                bool isAllResult;
+                size32_t resultBytes;
+                rtlDataAttr matchResults;
+
+                if(isUnicodeType(v0->queryType()))
+                {
+                    size32_t plen = v0->queryType()->getStringLen();
+                    OwnedMalloc<UChar> pattern (plen+1);
+                    v0->getUCharStringValue(plen+1, pattern.get()); //plen+1 so get null-terminated
+                    size32_t slen = v1->queryType()->getStringLen();
+                    OwnedMalloc<UChar> search (slen);
+                    v1->getUCharStringValue(slen, search);
+                    ICompiledUStrRegExpr * compiled = rtlCreateCompiledUStrRegExpr(pattern, !expr->hasAttribute(noCaseAtom));
+                    compiled->getMatchSet(isAllResult, resultBytes, matchResults.refdata(), slen, search.get());
+                    rtlDestroyCompiledUStrRegExpr(compiled);
+                }
+                else
+                {
+                    StringBuffer pattern, search;
+                    v0->getStringValue(pattern);
+                    v1->getStringValue(search);
+                    rtlCompiledStrRegex compiled;
+                    compiled.setPattern(pattern.str(), !expr->hasAttribute(noCaseAtom));
+                    compiled->getMatchSet(isAllResult, resultBytes, matchResults.refdata(), search.length(), search.str());
+                }
+                return convertSetToExpression(isAllResult, resultBytes, matchResults.getdata(), expr->queryType());
+            }
+            break;
+        }
     case no_regex_replace:
         {
             IValue * t0 = expr->queryChild(0)->queryValue();

+ 18 - 0
ecl/hql/hqlgram.y

@@ -374,6 +374,7 @@ static void eclsyntaxerror(HqlGram * parser, const char * s, short yystate, int
   RECORDOF
   RECOVERY
   REGEXFIND
+  REGEXFINDSET
   REGEXREPLACE
   REGROUP
   REJECTED
@@ -5986,6 +5987,23 @@ primexpr1
                             parser->normalizeExpression($7, type_int, false);
                             $$.setExpr(createValue(no_regex_find, subType.getLink(), $3.getExpr(), $5.getExpr(), $7.getExpr(), $8.getExpr()));
                         }
+    | REGEXFINDSET '(' expression ',' expression regexOpt ')'
+                        {
+                            parser->normalizeExpression($3, type_stringorunicode, false);
+                            Owned<ITypeInfo> retType;
+                            if(isUnicodeType($3.queryExprType()))
+                            {
+                                parser->normalizeExpression($5, type_unicode, false);
+                                retType.setown(makeUnicodeType(UNKNOWN_LENGTH, $3.queryExprType()->queryLocale()));
+                            }
+                            else
+                            {
+                                parser->normalizeExpression($5, type_string, false);
+                                retType.setown(makeStringType(UNKNOWN_LENGTH));
+                            }
+
+                            $$.setExpr(createValue(no_regex_findset, makeSetType(retType.getLink()), $3.getExpr(), $5.getExpr(), $6.getExpr()), $1);
+                        }
     | REGEXREPLACE '(' expression ',' expression ',' expression regexOpt ')'
                         {
                             parser->normalizeExpression($3, type_stringorunicode, false);

+ 1 - 0
ecl/hql/hqlgram2.cpp

@@ -10708,6 +10708,7 @@ static void getTokenText(StringBuffer & msg, int token)
     case RECOVERY: msg.append("RECOVERY"); break;
     case REGEXFIND: msg.append("REGEXFIND"); break;
     case REGEXREPLACE: msg.append("REGEXREPLACE"); break;
+    case REGEXFINDSET: msg.append("REGEXFINDSET"); break;
     case REGROUP: msg.append("REGROUP"); break;
     case REJECTED: msg.append("REJECTED"); break;
     case RELATIONSHIP: msg.append("RELATIONSHIP"); break;

+ 1 - 1
ecl/hql/hqlir.cpp

@@ -457,7 +457,7 @@ const char * getOperatorIRText(node_operator op)
     EXPAND_CASE(no,outputscalar);
     EXPAND_CASE(no,matchunicode);
     EXPAND_CASE(no,pat_validate);
-    EXPAND_CASE(no,unused83);
+    EXPAND_CASE(no,regex_findset);
     EXPAND_CASE(no,existsgroup);
     EXPAND_CASE(no,pat_use);
     EXPAND_CASE(no,unused13);

+ 1 - 0
ecl/hql/hqllex.l

@@ -871,6 +871,7 @@ RECORD              { RETURNHARD(RECORD); }
 RECORDOF            { RETURNSYM(RECORDOF); }
 RECOVERY            { RETURNSYM(RECOVERY); }
 REGEXFIND           { RETURNSYM(REGEXFIND); }
+REGEXFINDSET        { RETURNSYM(REGEXFINDSET); }
 REGEXREPLACE        { RETURNSYM(REGEXREPLACE); }
 REGROUP             { RETURNSYM(REGROUP); }
 REJECTED            { RETURNSYM(REJECTED); }

+ 38 - 0
ecl/hql/hqlutil.cpp

@@ -9011,3 +9011,41 @@ IHqlExpression * queryTransformAssignValue(IHqlExpression * transform, IHqlExpre
         return value->queryChild(1);
     return NULL;
 }
+
+//-------------------------------------------------------------------------------------------------
+
+// Builds a no_list (or no_all) expression from a serialized set: repeated <size32_t length><payload> items.
+IHqlExpression * convertSetToExpression(bool isAll, size32_t len, const void * ptr, ITypeInfo * setType)
+{
+    if (isAll)
+        return createValue(no_all, LINK(setType));
+
+    HqlExprArray items;
+    const byte * cur = (const byte *) ptr;
+    const byte * const end = cur + len;
+    ITypeInfo * elementType = setType->queryChildType();
+    switch (elementType->getTypeCode())
+    {
+    case type_unicode: // length prefix counts UChars
+        while (cur < end)
+        {
+            const size32_t numUChars = *((const size32_t *) cur);
+            const byte * payload = cur + sizeof(size32_t);
+            items.append(*createConstant(createUnicodeValue(numUChars, payload, LINK(elementType))));
+            cur = payload + numUChars*sizeof(UChar);
+        }
+        break;
+    case type_string: // length prefix counts bytes
+        while (cur < end)
+        {
+            const size32_t numChars = *((const size32_t *) cur);
+            const byte * payload = cur + sizeof(size32_t);
+            items.append(*createConstant(createStringValue((const char *) payload, (unsigned) numChars)));
+            cur = payload + numChars;
+        }
+        break;
+    default:
+        UNIMPLEMENTED;
+    }
+    return createValue(no_list, LINK(setType), items);
+}

+ 1 - 0
ecl/hql/hqlutil.hpp

@@ -684,6 +684,7 @@ extern HQL_API StringBuffer & appendLocation(StringBuffer & s, IHqlExpression *
 extern HQL_API bool userPreventsSort(IHqlExpression * noSortAttr, node_operator side);
 extern HQL_API IHqlExpression * queryTransformAssign(IHqlExpression * transform, IHqlExpression * searchField);
 extern HQL_API IHqlExpression * queryTransformAssignValue(IHqlExpression * transform, IHqlExpression * searchField);
+extern HQL_API IHqlExpression * convertSetToExpression(bool isAll, size32_t len, const void * ptr, ITypeInfo * setType);
 
 extern HQL_API bool isCommonSubstringRange(IHqlExpression * expr);
 extern HQL_API bool isFileOutput(IHqlExpression * expr);

+ 4 - 0
ecl/hqlcpp/hqlcatom.cpp

@@ -528,6 +528,8 @@ IIdAtom * regexNewUStrFindId;
 IIdAtom * regexNewUStrFoundId;
 IIdAtom * regexNewUStrFoundXId;
 IIdAtom * regexNewUStrReplaceXId;
+IIdAtom * regexMatchSetId;
+IIdAtom * regexUStrMatchSetId;
 IIdAtom * regexReplaceXId;
 IIdAtom * releaseRowId;
 IIdAtom * releaseRowsetId;
@@ -1192,6 +1194,8 @@ MODULE_INIT(INIT_PRIORITY_HQLATOM-1)
     MAKEID(regexNewUStrFound);
     MAKEID(regexNewUStrFoundX);
     MAKEID(regexNewUStrReplaceX);
+    MAKEID(regexMatchSet);
+    MAKEID(regexUStrMatchSet);
     MAKEID(regexReplaceX);
     MAKEID(releaseRow);
     MAKEID(releaseRowset);

+ 2 - 0
ecl/hqlcpp/hqlcatom.hpp

@@ -528,6 +528,8 @@ extern IIdAtom * regexNewUStrFindId;
 extern IIdAtom * regexNewUStrFoundId;
 extern IIdAtom * regexNewUStrFoundXId;
 extern IIdAtom * regexNewUStrReplaceXId;
+extern IIdAtom * regexMatchSetId;
+extern IIdAtom * regexUStrMatchSetId;
 extern IIdAtom * regexReplaceXId;
 extern IIdAtom * releaseRowId;
 extern IIdAtom * releaseRowsetId;

+ 3 - 0
ecl/hqlcpp/hqlcpp.cpp

@@ -2937,6 +2937,9 @@ void HqlCppTranslator::buildExpr(BuildCtx & ctx, IHqlExpression * expr, CHqlBoun
     case no_regex_replace:
         doBuildExprRegexFindReplace(ctx, expr, tgt);
         return;
+    case no_regex_findset:
+        doBuildExprRegexFindSet(ctx, expr, tgt);
+        return;
     case no_skip:
     case no_assert:
         {

+ 1 - 0
ecl/hqlcpp/hqlcpp.ipp

@@ -1345,6 +1345,7 @@ public:
     void doBuildExprRank(BuildCtx & ctx, IHqlExpression * expr, CHqlBoundExpr & tgt);
     void doBuildExprRanked(BuildCtx & ctx, IHqlExpression * expr, CHqlBoundExpr & tgt);
     void doBuildExprRegexFindReplace(BuildCtx & ctx, IHqlExpression * expr, CHqlBoundExpr & bound);
+    void doBuildExprRegexFindSet(BuildCtx & ctx, IHqlExpression * expr, CHqlBoundExpr & bound);
     void doBuildExprRound(BuildCtx & ctx, IHqlExpression * expr, CHqlBoundExpr & tgt);
     void doBuildExprSelect(BuildCtx & ctx, IHqlExpression * expr, CHqlBoundExpr & tgt);
     void doBuildExprSizeof(BuildCtx & ctx, IHqlExpression * expr, CHqlBoundExpr & tgt);

+ 3 - 1
ecl/hqlcpp/hqlcppsys.ecl

@@ -554,13 +554,15 @@ const char * cppSystemText[]  = {
     "   boolean regexNewStrFound() : method,pure,entrypoint='found';"
     "   string regexNewStrFoundX(unsigned4 idx) : method,pure,entrypoint='getMatchX';"
     "   string regexNewStrReplaceX(const string _search, const string _replace) : method,pure,entrypoint='replace';"
+    "   set of string regexMatchSet(const string _search) : method,pure,entrypoint='getMatchSet';"
 
     "   regexNewSetUStrPattern(const varunicode _pattern, boolean isCaseSensitive) : omethod,entrypoint='setPattern';"
     "   regexNewUStrFind(boolean _compiled, const unicode _search) : omethod,entrypoint='find';"
     "   boolean regexNewUStrFound() : method,pure,entrypoint='found';"
     "   unicode regexNewUStrFoundX(unsigned4 idx) : method,pure,entrypoint='getMatchX';"
     "   unicode regexNewUStrReplaceX(const unicode _search, const unicode _replace) : method,pure,entrypoint='replace';"
-    
+    "   set of unicode regexUStrMatchSet(const unicode _search) : method,pure,entrypoint='getMatchSet';"
+
     //clibrary functions that are called from the code generation
     "   free(data1 src) : eclrtl,library='eclrtl',entrypoint='rtlFree';",
     "   integer4 memcmp(const data1 target, const data1 src, unsigned4 len) : sys,pure,entrypoint='memcmp';",

+ 21 - 0
ecl/hqlcpp/hqlhtcpp.cpp

@@ -17800,6 +17800,27 @@ void HqlCppTranslator::doBuildAssignRegexFindReplace(BuildCtx & ctx, const CHqlB
     doBuildNewRegexFindReplace(ctx, &target, expr, NULL);
 }
 
+// Generates the call to the compiled regex's match-set function (string or unicode variant) for expr.
+void HqlCppTranslator::doBuildExprRegexFindSet(BuildCtx & ctx, IHqlExpression * expr, CHqlBoundExpr & bound)
+{
+    CHqlBoundExpr cached;
+    if (ctx.getMatchExpr(expr, cached))
+    {
+        bound.set(cached); // reuse the match the context already holds for expr
+        return;
+    }
+
+    IHqlExpression * search = expr->queryChild(1);
+    const bool isUnicode = isUnicodeType(search->queryType());
+    const bool isCaseSensitive = !expr->hasAttribute(noCaseAtom);
+    IHqlExpression * compiled = doBuildRegexCompileInstance(ctx, expr->queryChild(0), isUnicode, isCaseSensitive);
+
+    HqlExprArray args;
+    args.append(*LINK(compiled));
+    args.append(*LINK(search));
+    OwnedHqlExpr call = bindFunctionCall(isUnicode ? regexUStrMatchSetId : regexMatchSetId, args);
+    buildExprOrAssign(ctx, NULL, call, &bound);
+}
 
 //---------------------------------------------------------------------------
 //-- no_null [DATASET] --

+ 62 - 1
rtl/eclrtl/eclrtl.cpp

@@ -4856,7 +4856,7 @@ public:
                 sample[_len] = (char)NULL;
                 matched = boost::regex_search(sample, subs, *regEx);
             }
-            else 
+            else
             {
                 matched = boost::regex_search(_str + _from, _str + _len, subs, *regEx);
             }
@@ -4961,6 +4961,34 @@ public:
         CStrRegExprFindInstance * findInst = new CStrRegExprFindInstance(&regEx, str, from, len, needToKeepSearchString);
         return findInst;
     }
+
+    // Returns every match of the pattern in _search as a serialized set of <size32_t length><chars> items.
+    void getMatchSet(bool  & __isAllResult, size32_t & __resultBytes, void * & __result, size32_t _srcLen, const char * _search)
+    {
+        typedef boost::regex_iterator<const char *> MatchIterator;
+        const char * const searchEnd = _search + _srcLen;
+        MatchIterator noMoreMatches; // a default-constructed iterator marks the end of the sequence
+        rtlRowBuilder builder;
+        size32_t used = 0;
+        for (MatchIterator iter(_search, searchEnd, regEx); iter != noMoreMatches; ++iter)
+        {
+            const boost::sub_match<const char *> & whole = (*iter)[0];
+            if (whole.first == searchEnd)
+                break; // a match starting at the end of the input is not added
+
+            const size32_t matchLen = whole.second - whole.first;
+            // item layout: length prefix, then the matched characters
+            builder.ensureAvailable(used + matchLen + sizeof(size32_t));
+            byte * item = builder.getbytes() + used;
+            * (size32_t *) item = matchLen;
+            rtlStrToStr(matchLen, item + sizeof(size32_t), matchLen, whole.first);
+            used += matchLen + sizeof(size32_t);
+        }
+        __isAllResult = false;
+        __resultBytes = used;
+        __result = builder.detachdata();
+    }
+
 };
 
 //---------------------------------------------------------------------------
@@ -5021,6 +5049,7 @@ public:
             int32_t start = n ? matcher->start(n, uerr) : matcher->start(uerr);
             int32_t end = n ? matcher->end(n, uerr) : matcher->end(uerr);
             outlen = end - start;
+
             out = (UChar *)rtlMalloc(outlen*2);
             sample.extract(start, outlen, out);
         }
@@ -5115,6 +5144,38 @@ public:
         CUStrRegExprFindInstance * findInst = new CUStrRegExprFindInstance(matcher, str, from, len);
         return findInst;
     }
+
+    // Returns every match of the pattern in _search as a serialized set of <size32_t count><UChars> items.
+    void getMatchSet(bool  & __isAllResult, size32_t & __resultBytes, void * & __result, size32_t _srcLen, const UChar * _search)
+    {
+        UnicodeString input;
+        input.setTo(_search, _srcLen);
+        matcher->reset(input);
+
+        rtlRowBuilder builder;
+        size32_t used = 0;
+        while (matcher->find())
+        {
+            UErrorCode status = U_ZERO_ERROR;
+            const int32_t from = matcher->start(status);
+            assertex(status<=U_ZERO_ERROR);
+            if (from==_srcLen)
+                break; // a match starting at the end of the input is not added
+            const int32_t to = matcher->end(status);
+            assertex(status<=U_ZERO_ERROR);
+            const int32_t numUChars = to - from;
+            // item layout: UChar count prefix, then the matched UChars
+            builder.ensureAvailable(used+numUChars*sizeof(UChar)+sizeof(size32_t));
+            byte * item = builder.getbytes()+used;
+            * (size32_t *) item = numUChars;
+            input.extract(from, numUChars, (UChar *) (item+sizeof(size32_t)));
+            used += numUChars*sizeof(UChar) + sizeof(size32_t);
+        }
+        __isAllResult = false;
+        __resultBytes = used;
+        __result = builder.detachdata();
+    }
+
 };
 
 //---------------------------------------------------------------------------

+ 2 - 2
rtl/eclrtl/eclrtl.hpp

@@ -80,6 +80,7 @@ interface ICompiledStrRegExpr
 {
     virtual void replace(size32_t & outlen, char * & out, size32_t slen, char const * str, size32_t rlen, char const * replace) const = 0;
     virtual IStrRegExprFindInstance * find(const char * str, size32_t from, size32_t len, bool needToKeepSearchString) const = 0;
+    virtual void getMatchSet(bool  & __isAllResult, size32_t & __resultBytes, void * & __result, size32_t _srcLen, const char * _search) = 0;
 };
 
 // RegEx Compiler for unicode strings
@@ -93,6 +94,7 @@ interface ICompiledUStrRegExpr
 {
     virtual void replace(size32_t & outlen, UChar * & out, size32_t slen, UChar const * str, size32_t rlen, UChar const * replace) const = 0;
     virtual IUStrRegExprFindInstance * find(const UChar * str, size32_t from, size32_t len) const = 0;
+    virtual void getMatchSet(bool  & __isAllResult, size32_t & __resultBytes, void * & __result, size32_t _srcLen, const UChar * _search) = 0;
 };
 
 //-----------------------------------------------------------------------------
@@ -675,12 +677,10 @@ ECLRTL_API ICompiledStrRegExpr * rtlCreateCompiledStrRegExpr(const char * regExp
 ECLRTL_API void rtlDestroyCompiledStrRegExpr(ICompiledStrRegExpr * compiled);
 ECLRTL_API void rtlDestroyStrRegExprFindInstance(IStrRegExprFindInstance * compiled);
 
-
 ECLRTL_API ICompiledUStrRegExpr * rtlCreateCompiledUStrRegExpr(const UChar * regExpr, bool isCaseSensitive);
 ECLRTL_API void rtlDestroyCompiledUStrRegExpr(ICompiledUStrRegExpr * compiled);
 ECLRTL_API void rtlDestroyUStrRegExprFindInstance(IUStrRegExprFindInstance * compiled);
 
-
 ECLRTL_API void rtlCreateRange(size32_t & outlen, char * & out, unsigned fieldLen, unsigned compareLen, size32_t len, const char * str, byte fill, byte pad);
 ECLRTL_API void rtlCreateRangeLow(size32_t & outlen, char * & out, unsigned fieldLen, unsigned compareLen, size32_t len, const char * str);
 ECLRTL_API void rtlCreateRangeHigh(size32_t & outlen, char * & out, unsigned fieldLen, unsigned compareLen, size32_t len, const char * str);

+ 28 - 0
testing/regress/ecl/key/regexfindset.xml

@@ -0,0 +1,28 @@
+<Dataset name='Result 1'>
+ <Row><Result_1><Item>add-subtract</Item><Item>123.90</Item><Item>or</Item><Item>123</Item></Result_1></Row>
+</Dataset>
+<Dataset name='Result 2'>
+ <Row><Result_2><Item>add-subtract</Item><Item>123.90</Item><Item>or</Item><Item>123</Item></Result_2></Row>
+</Dataset>
+<Dataset name='Result 3'>
+ <Row><Result_3><Item>add-subtract</Item><Item>123.90</Item><Item>or</Item><Item>123</Item></Result_3></Row>
+</Dataset>
+<Dataset name='Result 4'>
+ <Row><Result_4><Item>add-subtract</Item><Item>123.90</Item><Item>or</Item><Item>123</Item></Result_4></Row>
+</Dataset>
+<Dataset name='Result 5'>
+ <Row><parsed><Item>subtract</Item><Item>35</Item><Item>from</Item><Item>50</Item></parsed></Row>
+ <Row><parsed><Item>divide</Item><Item>100</Item><Item>by</Item><Item>25</Item><Item>and</Item><Item>add</Item><Item>5</Item></parsed></Row>
+</Dataset>
+<Dataset name='Result 6'>
+ <Row><parsed><Item>subtract</Item><Item>35</Item><Item>from</Item><Item>50</Item></parsed></Row>
+ <Row><parsed><Item>divide</Item><Item>100</Item><Item>by</Item><Item>25</Item><Item>and</Item><Item>add</Item><Item>5</Item></parsed></Row>
+</Dataset>
+<Dataset name='Result 7'>
+ <Row><parsed><Item>subtract</Item><Item>35</Item><Item>from</Item><Item>50</Item></parsed></Row>
+ <Row><parsed><Item>divide</Item><Item>100</Item><Item>by</Item><Item>25</Item><Item>and</Item><Item>add</Item><Item>5</Item></parsed></Row>
+</Dataset>
+<Dataset name='Result 8'>
+ <Row><parsed><Item>subtract</Item><Item>35</Item><Item>from</Item><Item>50</Item></parsed></Row>
+ <Row><parsed><Item>divide</Item><Item>100</Item><Item>by</Item><Item>25</Item><Item>and</Item><Item>add</Item><Item>5</Item></parsed></Row>
+</Dataset>

+ 80 - 0
testing/regress/ecl/regexfindset.ecl

@@ -0,0 +1,80 @@
+/*##############################################################################
+
+    HPCC SYSTEMS software Copyright (C) 2016 HPCC Systems®.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+/* Regression test for REGEXFINDSET: STRING and UNICODE inputs, as scalars and inside PROJECT transforms. */
+/* STRING input: pattern wrapped in NOFOLD, then the same pattern as a plain constant */
+str := 'add-subtract 123.90, or 123';
+REGEXFINDSET(NOFOLD('(([[:alpha:]]|(-))+)|([[:digit:]]+([.][[:digit:]]+)?)'), str);
+
+REGEXFINDSET('(([[:alpha:]]|(-))+)|([[:digit:]]+([.][[:digit:]]+)?)', str);
+
+/* UNICODE input: pattern wrapped in NOFOLD, then the same pattern as a plain constant */
+ustr := U'add-subtract 123.90, or 123';
+REGEXFINDSET(NOFOLD(U'(([[:alpha:]]|(-))+)|([[:digit:]]+([.][[:digit:]]+)?)'), ustr);
+
+REGEXFINDSET(U'(([[:alpha:]]|(-))+)|([[:digit:]]+([.][[:digit:]]+)?)', ustr);
+
+/* STRING input inside a PROJECT transform: inline dataset, then the same dataset wrapped in NOFOLD */
+RecLayout := RECORD
+    STRING50 rawstr;
+END;
+ParsedLayout := RECORD
+    SET OF STRING parsed;
+END;
+
+ParsedLayout parseThem(recLayout L) := TRANSFORM
+    SELF.parsed := REGEXFINDSET('(([[:alpha:]]|(-))+)|([[:digit:]]+([.][[:digit:]]+)?)', l.rawstr);
+END;
+
+recTable := dataset([
+    {'subtract 35 from 50'},
+    {'divide 100 by 25 and add 5'}
+    ], RecLayout);
+
+PROJECT(recTable, parseThem(LEFT));
+
+recTable2 := NOFOLD(dataset([
+    {'subtract 35 from 50'},
+    {'divide 100 by 25 and add 5'}
+    ], RecLayout));
+
+PROJECT(recTable2, parseThem(LEFT));
+/* UNICODE input inside a PROJECT transform: inline dataset, then the same dataset wrapped in NOFOLD */
+URecLayout := RECORD
+    UNICODE50 rawstr;
+END;
+UParsedLayout := RECORD
+    SET OF UNICODE parsed;
+END;
+
+UParsedLayout uparseThem(URecLayout L) := TRANSFORM
+    SELF.parsed := REGEXFINDSET(U'(([[:alpha:]]|(-))+)|([[:digit:]]+([.][[:digit:]]+)?)', l.rawstr);
+END;
+
+urecTable := dataset([
+    {U'subtract 35 from 50'},
+    {U'divide 100 by 25 and add 5'}
+    ], URecLayout);
+
+PROJECT(urecTable, uparseThem(LEFT));
+
+urecTable2 := NOFOLD(dataset([
+    {U'subtract 35 from 50'},
+    {U'divide 100 by 25 and add 5'}
+    ], URecLayout));
+
+PROJECT(urecTable2, uparseThem(LEFT));