12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022 |
- /*##############################################################################
- HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ############################################################################## */
- #ifndef U_OVERRIDE_CXX_ALLOCATION
- #define U_OVERRIDE_CXX_ALLOCATION 0 // Enabling this forces all allocation of ICU objects to ICU's heap, but is incompatible with jmemleak
- #endif
- #include "jliball.hpp"
- #include "hqlexpr.hpp"
- #include "hqlcerrors.hpp"
- #include "hqlnlp.ipp"
- #include "hqlhtcpp.ipp"
- #include "hqlcatom.hpp"
- #include "thorralgo.ipp"
- #include "hqlfold.hpp"
- #include "hqlcse.ipp"
- #include "hqlccommon.hpp"
- #include "hqlutil.hpp"
- #include "hqltcppc.hpp"
- #include "hqlcpputil.hpp"
- #define DEFAULT_NLP_DETAIL 1
- #define DEFAULT_PATTERN_MAX_LENGTH 4096
- #ifdef __64BIT__
- #define __DEFINED_64BIT__ true
- #else
- #define __DEFINED_64BIT__ false
- #endif
- //---------------------------------------------------------------------------
- ValidateKind getValidateKind(IHqlExpression * expr)
- {
- switch (expr->getOperator())
- {
- case no_matchtext: return ValidateIsString;
- case no_matchunicode: return ValidateIsUnicode;
- }
- ForEachChild(idx, expr)
- {
- ValidateKind kind = getValidateKind(expr->queryChild(idx));
- if (kind != ValidateIsEither)
- return kind;
- }
- return ValidateIsEither;
- }
- //---------------------------------------------------------------------------
- IHqlExpression * optimizeParse(IHqlExpression * parseExpr)
- {
- return LINK(parseExpr);
- }
- //===========================================================================
- MatchReference::MatchReference(IHqlExpression * expr)
- {
- expand(expr, true);
- }
- bool MatchReference::equals(const MatchReference & other) const
- {
- if (names.ordinality() != other.names.ordinality())
- return false;
- ForEachItemIn(idx, names)
- {
- if (&names.item(idx) != &other.names.item(idx))
- return false;
- if (&indices.item(idx) != &other.indices.item(idx))
- return false;
- }
- return true;
- }
- void MatchReference::expand(IHqlExpression * expr, bool isLast)
- {
- switch (expr->getOperator())
- {
- case no_pat_select:
- expand(expr->queryChild(0), false);
- expand(expr->queryChild(1), true);
- break;
- case no_pat_index:
- names.append(*LINK(expr->queryChild(0)));
- indices.append(*LINK(expr->queryChild(1)));
- break;
- default:
- names.append(*LINK(expr));
- indices.append(*createConstant((int)UNKNOWN_INSTANCE));
- break;
- }
- }
- void MatchReference::getPath(StringBuffer & path)
- {
- ForEachItemIn(idx, names)
- {
- if (idx)
- path.append(".");
- path.append(lower(queryPatternName(&names.item(idx))));
- }
- }
- StringBuffer & MatchReference::getDebugText(StringBuffer & out, RegexIdAllocator & idAllocator)
- {
- ForEachItemIn(i1, names)
- {
- IHqlExpression & curName = names.item(i1);
- IAtom * name = lower(queryPatternName(&curName));
- if (i1)
- out.append("/");
- out.append(str(name));
- out.append("{").append(idAllocator.queryID(curName.queryChild(0), name)).append("}");
- unsigned inst = (unsigned)indices.item(i1).queryValue()->getIntValue();
- if (inst != UNKNOWN_INSTANCE)
- out.append("[").append(inst).append("]");
- }
- return out;
- }
- void MatchReference::compileMatched(RegexIdAllocator & idAllocator, UnsignedArray & ids, UnsignedArray & indexValues)
- {
- ForEachItemIn(idx, names)
- {
- IHqlExpression & curName = names.item(idx);
- ids.append(idAllocator.queryID(curName.queryChild(0), lower(queryPatternName(&curName))));
- indexValues.append((unsigned)indices.item(idx).queryValue()->getIntValue());
- }
- }
- //---------------------------------------------------------------------------
- NlpParseContext::NlpParseContext(IHqlExpression * _expr, IWorkUnit * _wu, const HqlCppOptions & options)
- {
- workunit = _wu;
- expr.set(_expr);
- IHqlExpression * search = expr->queryChild(1);
- switch (search->queryType()->getTypeCode())
- {
- case type_unicode:
- case type_varunicode:
- info.type = type_unicode;
- break;
- case type_utf8:
- info.type = type_utf8;
- break;
- default:
- info.type = type_string;
- break;
- }
- info.charSize = (info.type == type_unicode) ? sizeof(UChar) : sizeof(char);
- IHqlExpression * sepAttr = expr->queryAttribute(separatorAtom);
- if (sepAttr)
- info.separator.set(sepAttr->queryChild(0));
- info.caseSensitive = !expr->hasAttribute(noCaseAtom); // default true.
- info.dfaRepeatMax = options.dfaRepeatMax;
- info.dfaRepeatMaxScore = options.dfaRepeatMaxScore;
- info.uidBase = getUniqueId();
- allMatched = false;
- }
- void NlpParseContext::addAllMatched()
- {
- allMatched = true;
- }
- unsigned NlpParseContext::addMatchReference(IHqlExpression * matchPathExpr)
- {
- if (!matchPathExpr) matchPathExpr = expr->queryChild(2);
- Owned<MatchReference> ref = new MatchReference(matchPathExpr);
- ForEachItemIn(idx, matches)
- {
- if (ref->equals(matches.item(idx)))
- return idx;
- }
- matches.append(*ref.getClear());
- extractMatchedSymbols(matchPathExpr);
- return matches.ordinality()-1;
- }
- IHqlExpression * NlpParseContext::queryValidateExpr(IHqlExpression * expr) const
- {
- IHqlExpression * unicodeTest = expr->queryChild(2);
- switch (info.type)
- {
- case type_unicode:
- case type_utf8: // I think this is best
- if (unicodeTest && !unicodeTest->isAttribute())
- return unicodeTest;
- break;
- }
- return expr->queryChild(1);
- }
- void NlpParseContext::buildValidators(HqlCppTranslator & translator, BuildCtx & classctx)
- {
- if (validators.ordinality())
- {
- BuildCtx helperctx(classctx);
- IHqlStmt * helperClassStmt = translator.beginNestedClass(helperctx, "helper", "INlpHelper");
- BuildCtx funcctx(helperctx);
- funcctx.addQuotedFunction("virtual IValidator * queryValidator(unsigned i) override");
- BuildCtx casectx(funcctx);
- casectx.addQuotedCompoundLiteral("switch (i)");
- ForEachItemIn(idx, validators)
- {
- StringBuffer member;
- translator.getUniqueId(member.append("val"));
- LinkedHqlExpr validateExpr = queryValidateExpr(&validators.item(idx));
- ValidateKind kind = getValidateKind(validateExpr);
- BuildCtx validatorctx(helperctx);
- IHqlStmt * classStmt = translator.beginNestedClass(validatorctx, member, (kind != ValidateIsUnicode) ? "IStringValidator" : "IUnicodeValidator");
- {
- MemberFunction func(translator, validatorctx);
- CHqlBoundExpr boundMatched;
- if (kind != ValidateIsUnicode)
- {
- func.start("virtual bool isValid(unsigned len, const char * data) override");
- boundMatched.length.setown(createVariable("len", LINK(sizetType)));
- boundMatched.expr.setown(createVariable("data", makeReferenceModifier(LINK(unknownStringType))));
- func.ctx.associateExpr(activeMatchTextExpr, boundMatched);
- }
- else
- {
- func.start("virtual bool isValid(unsigned len, const UChar * data) override");
- boundMatched.length.setown(createVariable("len", LINK(sizetType)));
- boundMatched.expr.setown(createVariable("data", makeReferenceModifier(LINK(unknownUnicodeType))));
- func.ctx.associateExpr(activeMatchUnicodeExpr, boundMatched);
- }
- func.ctx.associateExpr(activeNlpMarkerExpr, activeNlpMarkerExpr);
- func.ctx.associateExpr(activeValidateMarkerExpr, activeValidateMarkerExpr);
- translator.bindTableCursor(func.ctx, queryNlpParsePseudoTable(), queryNlpParsePseudoTable());
- if (translator.queryOptions().spotCSE)
- validateExpr.setown(spotScalarCSE(validateExpr, NULL, translator.queryOptions().spotCseInIfDatasetConditions));
- translator.buildReturn(func.ctx, validateExpr);
- }
- translator.endNestedClass(classStmt);
- StringBuffer s;
- s.append("case ").append(idx).append(": return &").append(member).append(";");
- casectx.addQuoted(s);
- }
- funcctx.addReturn(queryQuotedNullExpr());
- translator.endNestedClass(helperClassStmt);
- classctx.addQuotedLiteral("virtual INlpHelper * queryHelper() override { return &helper; }");
- }
- }
- void NlpParseContext::buildProductions(HqlCppTranslator & translator, BuildCtx & classctx, BuildCtx & startctx)
- {
- if (!productions.ordinality())
- return;
- {
- BuildCtx metactx(classctx);
- metactx.addQuotedFunction("virtual IOutputMetaData * queryProductionMeta(unsigned id) override");
- BuildCtx metacasectx(metactx);
- metacasectx.addQuotedCompoundLiteral("switch (id)");
- StringBuffer s;
- ForEachItemIn(i, productions)
- {
- IHqlExpression & cur = productions.item(i);
- MetaInstance meta(translator, cur.queryChild(1)->queryRecord(), false);
- translator.buildMetaInfo(meta);
- s.clear().append("case ").append(getIntValue(cur.queryChild(0)));
- s.append(": return &").append(meta.queryInstanceObject()).append(";");
- metacasectx.addQuoted(s);
- }
- metactx.addQuotedLiteral("return 0;");
- }
- {
- OwnedHqlExpr callback = createVariable("input", makeBoolType());
- MemberFunction func(translator, startctx, "virtual size32_t executeProduction(ARowBuilder & crSelf, unsigned id, IProductionCallback * input) override");
- func.ctx.associateExpr(activeProductionMarkerExpr, callback);
- BuildCtx prodcasectx(func.ctx);
- prodcasectx.addQuotedCompoundLiteral("switch (id)");
- StringBuffer s, subname, proto;
- ForEachItemIn(i, productions)
- {
- IHqlExpression & cur = productions.item(i);
- IHqlExpression * transform = cur.queryChild(1);
- if (transform->getOperator() == no_record)
- continue;
- subname.clear().append("executeProduction").append(i+1);
-
- s.clear().append("case ").append(getIntValue(cur.queryChild(0)));
- s.append(": return ").append(subname).append("(crSelf, input);");
- prodcasectx.addQuoted(s);
- {
- proto.clear().append("size32_t ").append(subname).append("(ARowBuilder & crSelf, IProductionCallback * input)");
- MemberFunction validateFunc(translator, startctx, proto, MFdynamicproto);
- translator.ensureRowAllocated(validateFunc.ctx, "crSelf");
- validateFunc.ctx.associateExpr(activeProductionMarkerExpr, callback);
- OwnedHqlExpr newTransform = LINK(transform);
- OwnedHqlExpr dataset = createDataset(no_anon, LINK(transform->queryRecord()));
- translator.buildTransformBody(validateFunc.ctx, newTransform, NULL, NULL, dataset, NULL);
- }
- }
- func.ctx.addQuotedLiteral("return (size32_t)-1;");
- }
- }
- void NlpParseContext::extractMatchedSymbols(IHqlExpression * expr)
- {
- switch (expr->getOperator())
- {
- case no_pat_select:
- extractMatchedSymbols(expr->queryChild(0));
- extractMatchedSymbols(expr->queryChild(1));
- break;
- case no_pat_index:
- expr = expr->queryChild(0);
- //fall through
- default:
- {
- assertex(expr->getOperator() == no_pat_instance);
- if (!matchedSymbols.contains(*expr))
- matchedSymbols.append(*LINK(expr));
- break;
- }
- }
- }
- bool NlpParseContext::isMatched(IHqlExpression * expr, IAtom * name)
- {
- if (allMatched)
- return true;
- ForEachItemIn(i, matchedSymbols)
- {
- IHqlExpression & cur = matchedSymbols.item(i);
- if ((cur.queryChild(0)->queryBody() == expr->queryBody()) &&
- (cur.queryChild(1)->queryName() == name))
- return true;
- }
- return false;
- }
- void NlpParseContext::doExtractValidates(IHqlExpression * expr)
- {
- if (expr->queryTransformExtra())
- return;
- expr->setTransformExtraUnlinked(expr);
- switch (expr->getOperator())
- {
- case no_pat_validate:
- validators.append(*LINK(expr));
- doExtractValidates(expr->queryChild(0));
- break;
- default:
- ForEachChild(idx, expr)
- doExtractValidates(expr->queryChild(idx));
- break;
- }
- }
- void NlpParseContext::extractValidates(IHqlExpression * expr)
- {
- lockTransformMutex();
- doExtractValidates(expr);
- unlockTransformMutex();
- }
- bool NlpParseContext::isValidMatch(MatchReference & match, unsigned depth, IHqlExpression * pattern)
- {
- //MORE: This should check whether already visited, especially once we allow recursive patterns!
- if (&match.names.item(depth) == pattern)
- {
- if (depth+1 == match.names.ordinality())
- return true;
- if (isValidMatch(match, depth+1, pattern))
- return true;
- }
- ForEachChild(idx, pattern)
- if (isValidMatch(match, depth, pattern->queryChild(idx)))
- return true;
- return false;
- }
- void NlpParseContext::checkValidMatches()
- {
- ForEachItemIn(idx, matches)
- {
- if (!isValidMatch(matches.item(idx), 0, expr->queryChild(2)))
- {
- StringBuffer path;
- matches.item(idx).getPath(path);
- throwError1(HQLERR_BadMatchedPath, path.str());
- }
- }
- }
- void NlpParseContext::compileMatched(NlpAlgorithm & parser)
- {
- parser.matchInfo->setFormat(info.inputFormat());
- ForEachItemIn(idx, matches)
- {
- UnsignedArray ids;
- UnsignedArray indexValues;
- matches.item(idx).compileMatched(idAllocator, ids, indexValues);
- parser.matchInfo->addResult(ids, indexValues);
- }
- }
- static void getOptions(IHqlExpression * expr, INlpParseAlgorithm::MatchAction & matchAction, INlpParseAlgorithm::ScanAction & scanAction)
- {
- matchAction = INlpParseAlgorithm::NlpMatchAll;
- scanAction = INlpParseAlgorithm::NlpScanNext;
- if (expr->hasAttribute(firstAtom)) matchAction = INlpParseAlgorithm::NlpMatchFirst;
- if (expr->hasAttribute(allAtom)) matchAction = INlpParseAlgorithm::NlpMatchAll;
- if (expr->hasAttribute(noScanAtom)) scanAction = INlpParseAlgorithm::NlpScanNone;
- if (expr->hasAttribute(scanAtom)) scanAction = INlpParseAlgorithm::NlpScanNext;
- if (expr->hasAttribute(scanAllAtom)) scanAction = INlpParseAlgorithm::NlpScanAll;
- if (expr->hasAttribute(wholeAtom)) scanAction = INlpParseAlgorithm::NlpScanWhole;
- }
- void NlpParseContext::getDebugText(StringBuffer & s, unsigned detail)
- {
- INlpParseAlgorithm::MatchAction matchAction;
- INlpParseAlgorithm::ScanAction scanAction;
- getOptions(expr, matchAction, scanAction);
- s.newline().append("Options: ").append("Match(");
- switch (matchAction)
- {
- case INlpParseAlgorithm::NlpMatchFirst: s.append("First"); break;
- case INlpParseAlgorithm::NlpMatchAll: s.append("All"); break;
- }
- s.append(") Scan(");
- switch (scanAction)
- {
- case INlpParseAlgorithm::NlpScanWhole: s.append("Whole"); break;
- case INlpParseAlgorithm::NlpScanNone: s.append("None"); break;
- case INlpParseAlgorithm::NlpScanNext: s.append("Next"); break;
- case INlpParseAlgorithm::NlpScanAll: s.append("All"); break;
- }
- s.append(")").newline();
- s.append("Matches:").newline();
- ForEachItemIn(idx, matches)
- matches.item(idx).getDebugText(s.append("\t"), idAllocator).newline();
- }
- void NlpParseContext::setParserOptions(INlpParseAlgorithm & parser)
- {
- INlpParseAlgorithm::MatchAction matchAction;
- INlpParseAlgorithm::ScanAction scanAction;
- getOptions(expr, matchAction, scanAction);
- IHqlExpression * keep = expr->queryAttribute(keepAtom);
- unsigned keepLimit = keep ? (unsigned)keep->queryChild(0)->queryValue()->getIntValue() : 0;
- IHqlExpression * atmost = expr->queryAttribute(atmostAtom);
- unsigned atmostLimit = atmost ? (unsigned)atmost->queryChild(0)->queryValue()->getIntValue() : 0;
- IHqlExpression * maxLength = expr->queryAttribute(maxLengthAtom);
- size32_t maxLengthValue = maxLength ? (unsigned)maxLength->queryChild(0)->queryValue()->getIntValue() : DEFAULT_PATTERN_MAX_LENGTH;
- parser.setOptions(matchAction, scanAction, info.inputFormat(), keepLimit, atmostLimit);
- parser.setChoose(expr->hasAttribute(minAtom), expr->hasAttribute(maxAtom), expr->hasAttribute(bestAtom), !expr->hasAttribute(manyAtom));
- parser.setJoin(expr->hasAttribute(notMatchedAtom), expr->hasAttribute(notMatchedOnlyAtom));
- parser.setLimit(maxLengthValue);
- }
- //---------------------------------------------------------------------------
- void HqlCppTranslator::doBuildParseTransform(BuildCtx & classctx, IHqlExpression * expr)
- {
- MemberFunction func(*this, classctx, "virtual size32_t transform(ARowBuilder & crSelf, const void * _left, IMatchedResults * matched, IMatchWalker * walker) override");
- ensureRowAllocated(func.ctx, "crSelf");
- func.ctx.addQuotedLiteral("const unsigned char * left = (const unsigned char *) _left;");
- func.ctx.associateExpr(activeNlpMarkerExpr, activeNlpMarkerExpr);
- bindTableCursor(func.ctx, queryNlpParsePseudoTable(), queryNlpParsePseudoTable());
- // Bind left to "left" and right to RIGHT
- IHqlExpression * dataset = expr->queryChild(0);
- IHqlExpression * transform = expr->queryChild(4);
- BoundRow * selfCursor = bindSelf(func.ctx, expr, "crSelf");
- if (transform->getOperator() == no_newtransform)
- bindTableCursor(func.ctx, dataset, "left");
- else
- bindTableCursor(func.ctx, dataset, "left", no_left, querySelSeq(expr));
- associateSkipReturnMarker(func.ctx, queryZero(), selfCursor);
- doTransform(func.ctx, transform, selfCursor);
- buildReturnRecordSize(func.ctx, selfCursor);
- }
- void HqlCppTranslator::doBuildParseSearchText(BuildCtx & classctx, IHqlExpression * dataset, IHqlExpression * search, type_t searchTypeCode, ITypeInfo * transferType)
- {
- bool needToFree = true;
- {
- MemberFunction func(*this, classctx);
- if (searchTypeCode == type_unicode)
- {
- func.start("virtual void getSearchText(size32_t & retLen, char * & _retText, const void * _self) override");
- func.ctx.addQuotedLiteral("UChar * & retText = *(UChar * *)&_retText;"); // don't ask.
- }
- else
- func.start("virtual void getSearchText(size32_t & retLen, char * & retText, const void * _self) override");
- func.ctx.addQuotedLiteral("const unsigned char * self = (const unsigned char *) _self;");
- bindTableCursor(func.ctx, dataset, "self");
- Owned<ITypeInfo> retType;
- switch (searchTypeCode)
- {
- case type_unicode:
- retType.setown(makeUnicodeType(UNKNOWN_LENGTH, 0));
- break;
- case type_utf8:
- retType.setown(makeUtf8Type(UNKNOWN_LENGTH, 0));
- break;
- default:
- retType.setown(makeStringType(UNKNOWN_LENGTH, NULL, NULL));
- break;
- }
- OwnedHqlExpr castSearch = ensureExprType(search, retType);
- castSearch.setown(foldHqlExpression(castSearch));
- retType.setown(makeReferenceModifier(retType.getClear()));
- switch (castSearch->getOperator())
- {
- case no_select:
- case no_constant:
- //Not strictly true - could be conditional
- //also misses lots of cases - but I doubt anyone will ever complain...
- needToFree = false;
- break;
- }
- OwnedHqlExpr retLen = createVariable("retLen", LINK(sizetType));
- OwnedHqlExpr tempLen;
- if (transferType)
- tempLen.setown(func.ctx.getTempDeclare(sizetType, NULL));
- CHqlBoundTarget target;
- target.length.set(tempLen ? tempLen : retLen);
- target.expr.setown(createVariable("retText", LINK(retType)));
- if (needToFree)
- {
- buildExprAssign(func.ctx, target, castSearch);
- }
- else
- {
- CHqlBoundExpr bound;
- buildExpr(func.ctx, castSearch, bound);
- OwnedHqlExpr len = getBoundLength(bound);
- func.ctx.addAssign(target.length, len);
- OwnedHqlExpr transferred = createValue(no_cast, LINK(retType), LINK(bound.expr));
- func.ctx.addAssign(target.expr, transferred);
- }
- if (tempLen)
- {
- OwnedHqlExpr source = target.getTranslatedExpr();
- OwnedHqlExpr transferred = createValue(no_typetransfer, LINK(transferType), LINK(source));
- OwnedHqlExpr length = createValue(no_charlen, LINK(sizetType), LINK(transferred));
- buildAssignToTemp(func.ctx, retLen, length);
- }
- }
- doBuildBoolFunction(classctx, "searchTextNeedsFree", needToFree);
- }
- void HqlCppTranslator::doBuildParseSearchText(BuildCtx & classctx, IHqlExpression * expr)
- {
- doBuildParseSearchText(classctx, expr->queryChild(0), expr->queryChild(1), nlpParse->searchType(), NULL);
- }
- void HqlCppTranslator::doBuildParseExtra(BuildCtx & classctx, IHqlExpression * expr)
- {
- StringBuffer flags;
- if (expr->hasAttribute(groupAtom)) flags.append("|PFgroup");
- if (expr->hasAttribute(parallelAtom)) flags.append("|PFparallel");
- if (flags.length())
- doBuildUnsignedFunction(classctx, "getFlags", flags.str()+1);
- }
- void HqlCppTranslator::doBuildParseValidators(BuildCtx & classctx, IHqlExpression * expr)
- {
- nlpParse->extractValidates(expr->queryChild(2));
- nlpParse->buildValidators(*this, classctx);
- }
- void HqlCppTranslator::doBuildParseCompiled(BuildCtx & classctx, MemoryBuffer & buffer)
- {
- if (buffer.length() > 1000000)
- WARNING1(CategoryEfficiency, HQLWRN_ParseVeryLargeDefinition, buffer.length());
- BuildCtx funcctx(classctx);
- MemoryBuffer compressed;
- compressToBuffer(compressed, buffer.length(), buffer.toByteArray());
- unsigned buffLen = compressed.length();
- CHqlBoundExpr bound;
- StringBuffer s;
- OwnedHqlExpr srcData = addDataLiteral((const char *)compressed.toByteArray(), buffLen);
- OwnedHqlExpr retData = createVariable("retData", makePointerType(makeVoidType()));
- funcctx.addQuotedFunction("virtual void queryCompiled(IResourceContext *ctx, size32_t & retLen, const void * & retData) override");
- funcctx.addQuotedF("//uncompressed size = %d", buffer.length());
- buildExpr(funcctx, srcData, bound);
- funcctx.addQuoted(s.append("retLen = ").append(buffLen).append(";"));
- funcctx.addAssign(retData, srcData);
- }
- void HqlCppTranslator::gatherExplicitMatched(IHqlExpression * expr)
- {
- ForEachChild(idx, expr)
- {
- IHqlExpression * cur = expr->queryChild(idx);
- if (cur->getOperator() == no_matched)
- {
- IHqlExpression * arg = cur->queryChild(0);
- if (arg->getOperator() == no_all)
- nlpParse->addAllMatched();
- else
- nlpParse->addMatchReference(arg);
- }
- }
- }
- ABoundActivity * HqlCppTranslator::doBuildActivityParse(BuildCtx & ctx, IHqlExpression * _expr)
- {
- Owned<ABoundActivity> boundDataset = buildCachedActivity(ctx, _expr->queryChild(0));
- cycle_t startCycles = get_cycles_now();
- OwnedHqlExpr expr = optimizeParse(_expr);
- Owned<ActivityInstance> instance = new ActivityInstance(*this, ctx, TAKparse, expr, "Parse");
- buildActivityFramework(instance);
-
- buildInstancePrefix(instance);
- //This will become conditional on the flags....
- cycle_t startPrepareCycles = get_cycles_now();
- if (expr->hasAttribute(tomitaAtom))
- nlpParse = createTomitaContext(expr, code->workunit, options);
- else
- {
- //In 64bit the engines have enough stack space to use the stack-based regex implementation
- byte algorithm = __DEFINED_64BIT__ ? NLPAregexStack : NLPAregexHeap;
- switch (options.regexVersion)
- {
- case 1:
- algorithm = NLPAregexStack;
- break;
- case 2:
- algorithm = NLPAregexHeap;
- break;
- }
- IHqlExpression * algorithmHint = queryHintChild(expr, algorithmAtom, 0);
- if (matchesConstantString(algorithmHint, "stack", true))
- algorithm = NLPAregexStack;
- else if (matchesConstantString(algorithmHint, "heap", true))
- algorithm = NLPAregexHeap;
- nlpParse = createRegexContext(expr, code->workunit, options, algorithm);
- }
- gatherExplicitMatched(expr);
- doBuildParseTransform(instance->startctx, expr); // also gathers all the MATCHED() definitions.
- doBuildParseSearchText(instance->startctx, expr);
- doBuildParseValidators(instance->nestedctx, expr);
- doBuildParseExtra(instance->startctx, expr);
- if (options.timeTransforms)
- noteFinishedTiming("compile:PARSE:prepare", startPrepareCycles);
-
- MemoryBuffer buffer;
- cycle_t startCompileCycles = get_cycles_now();
- nlpParse->compileSearchPattern();
- nlpParse->queryParser()->serialize(buffer);
- if (nlpParse->isGrammarAmbiguous())
- WARNING1(CategoryEfficiency, HQLWRN_GrammarIsAmbiguous, instance->activityId);
- doBuildParseCompiled(instance->classctx, buffer);
- if (options.timeTransforms)
- noteFinishedTiming("compile:PARSE:compile", startCompileCycles);
- nlpParse->buildProductions(*this, instance->classctx, instance->startctx);
- #if 0
- StringBuffer text;
- getSystemTraceInfo(text, PerfMonProcMem);
- wu()->setDebugValue("maxMemory", text.str(), true);
- #endif
- if ((options.debugNlp != 0) && !options.obfuscateOutput)
- {
- BuildCtx subctx(instance->classctx);
- subctx.addQuotedLiteral("#if 0\nHuman readable form of the grammar");
- StringBuffer s;
- nlpParse->getDebugText(s, options.debugNlp);
- subctx.addQuoted(s);
- subctx.addQuotedLiteral("#endif");
- if (options.debugNlpAsHint)
- {
- StringBuffer hintText;
- hintText.append("<Hint type=\"activity\" id=\"").append(instance->activityId).append("\">").newline();
- encodeXML(s.str(), hintText, 0, s.length(), false);
- hintText.append("</Hint>");
- code->addHint(hintText.str(), ctxCallback);
- }
- }
- ::Release(nlpParse);
- nlpParse = NULL;
- buildInstanceSuffix(instance);
- buildConnectInputOutput(ctx, instance, boundDataset, 0, 0);
- if (options.timeTransforms)
- noteFinishedTiming("compile:PARSE", startCycles);
- return instance->getBoundActivity();
- }
- //---------------------------------------------------------------------------
- void getCheckRange(IHqlExpression * range, unsigned & minLength, unsigned & maxLength, unsigned charSize)
- {
- minLength = 0;
- maxLength = PATTERN_UNLIMITED_LENGTH;
- switch (range->getOperator())
- {
- case no_constant:
- minLength = maxLength = (unsigned)range->queryValue()->getIntValue();
- break;
- case no_rangefrom:
- minLength = (unsigned)range->queryChild(0)->queryValue()->getIntValue();
- break;
- case no_rangeto:
- maxLength = (unsigned)range->queryChild(0)->queryValue()->getIntValue();
- break;
- case no_range:
- minLength = (unsigned)range->queryChild(0)->queryValue()->getIntValue();
- maxLength = (unsigned)range->queryChild(1)->queryValue()->getIntValue();
- break;
- }
- minLength *= charSize;
- if (maxLength < PATTERN_UNLIMITED_LENGTH / charSize)
- maxLength *= charSize;
- else
- maxLength = PATTERN_UNLIMITED_LENGTH;
- }
- void HqlCppTranslator::doBuildMatched(BuildCtx & ctx, const CHqlBoundTarget * target, IHqlExpression * expr, CHqlBoundExpr * bound)
- {
- if (!nlpParse)
- throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
- if (!ctx.queryMatchExpr(activeNlpMarkerExpr))
- {
- CHqlBoundExpr match;
- if (!buildExprInCorrectContext(ctx, expr, match, false))
- throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
- if (target)
- assign(ctx, *target, match);
- else
- bound->set(match);
- return;
- }
- IHqlExpression * patternExpr = queryRealChild(expr, 0);
- if (ctx.queryMatchExpr(activeValidateMarkerExpr))
- {
- CHqlBoundExpr match;
- switch (expr->getOperator())
- {
- case no_matchtext:
- if (!ctx.getMatchExpr(activeMatchTextExpr, match))
- throwError(HQLERR_MatchTextNotUnicode);
- if (patternExpr)
- throwError1(HQLERR_NoArgumentsInValidator, "MATCHTEXT");
- break;
- case no_matchunicode:
- if (!ctx.getMatchExpr(activeMatchUnicodeExpr, match))
- throwError(HQLERR_MatchUnicodeNotText);
- if (patternExpr)
- throwError1(HQLERR_NoArgumentsInValidator, "MATCHUNICODE");
- break;
- case no_matchutf8:
- if (!ctx.getMatchExpr(activeMatchUtf8Expr, match))
- throwError(HQLERR_MatchUtf8NotText);
- if (patternExpr)
- throwError1(HQLERR_NoArgumentsInValidator, "MATCHUTF8");
- break;
- default:
- throwError(HQLERR_MatchTextOrUnicode);
- }
- if (target)
- assign(ctx, *target, match);
- else
- bound->set(match);
- return;
- }
- unsigned matchedIndex = nlpParse->addMatchReference(patternExpr);
- IIdAtom * func;
- switch (expr->getOperator())
- {
- case no_matched: func = getMatchedId; break;
- case no_matchtext: func = getMatchTextId; break;
- case no_matchunicode: func = getMatchUnicodeId; break;
- case no_matchlength: func = getMatchLengthId; break;
- case no_matchposition: func = getMatchPositionId; break;
- case no_matchutf8: func = getMatchUtf8Id; break;
- default: UNIMPLEMENTED;
- }
- HqlExprArray args;
- args.append(*createQuoted("matched", makeVoidType()));
- args.append(*createConstant((__int64)matchedIndex));
- OwnedHqlExpr call = bindFunctionCall(func, args);
- buildExprOrAssign(ctx, target, call, bound);
- }
- IReferenceSelector * HqlCppTranslator::doBuildRowMatchRow(BuildCtx & ctx, IHqlExpression * expr, bool isNew)
- {
- if (!nlpParse)
- throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
- if (!ctx.queryMatchExpr(activeNlpMarkerExpr))
- throwError(HQLERR_AccessMatchAttrInChildQuery);
- unsigned matchedIndex = nlpParse->addMatchReference(expr->queryChild(1));
- HqlExprArray args;
- args.append(*createQuoted("matched", makeVoidType()));
- args.append(*createConstant((__int64)matchedIndex));
- OwnedHqlExpr call = bindTranslatedFunctionCall(getMatchRowId, args);
- IHqlExpression * record = expr->queryRecord();
- StringBuffer rowName;
- getUniqueId(rowName.append("row"));
- OwnedHqlExpr row = createVariable(rowName, makeConstantModifier(makeRowReferenceType(record)));
- ctx.addDeclare(row);
- ctx.addAssign(row, call);
- BoundRow * cursor = bindRow(ctx, expr, row);
- if (expr->queryChild(1))
- cursor->setConditional(true);
- return createReferenceSelector(cursor);
- }
- void HqlCppTranslator::doBuildMatchAttr(BuildCtx & ctx, const CHqlBoundTarget * target, IHqlExpression * expr, CHqlBoundExpr * bound)
- {
- if (!nlpParse)
- throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
- if (!ctx.queryMatchExpr(activeNlpMarkerExpr) && !ctx.queryMatchExpr(activeProductionMarkerExpr))
- {
- CHqlBoundExpr match;
- if (!buildExprInCorrectContext(ctx, expr, match, false))
- throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
- if (target)
- assign(ctx, *target, match);
- else
- bound->set(match);
- return;
- }
- HqlExprAssociation * marker = ctx.queryMatchExpr(activeProductionMarkerExpr);
- ITypeInfo * exprType = expr->queryType();
- if (marker)
- {
- HqlExprArray args;
- args.append(*LINK(marker->queryExpr()));
- args.append(*LINK(expr->queryChild(0)));
- IIdAtom * name;
- switch (exprType->getTypeCode())
- {
- case type_string:
- name = getProductionTextId;
- break;
- case type_unicode:
- name = getProductionUnicodeId;
- break;
- case type_utf8:
- name = getProductionUtf8Id;
- break;
- default:
- throwUnexpectedType(exprType);
- }
- OwnedHqlExpr call = bindFunctionCall(name, args);
- doBuildCall(ctx, target, call, bound);
- }
- else
- {
- node_operator op;
- switch (exprType->getTypeCode())
- {
- case type_string:
- op = no_matchtext;
- break;
- case type_unicode:
- op = no_matchunicode;
- break;
- case type_utf8:
- op = no_matchutf8;
- break;
- default:
- throwUnexpectedType(exprType);
- }
- OwnedHqlExpr newExpr = createValue(op, LINK(exprType));
- doBuildMatched(ctx, target, expr, bound);
- }
- }
- IReferenceSelector * HqlCppTranslator::doBuildRowMatchAttr(BuildCtx & ctx, IHqlExpression * expr)
- {
- if (!ctx.queryMatchExpr(activeNlpMarkerExpr) && !ctx.queryMatchExpr(activeProductionMarkerExpr))
- throwError(HQLERR_AccessMatchAttrInChildQuery);
- HqlExprArray args;
- IIdAtom * name;
- HqlExprAssociation * marker = ctx.queryMatchExpr(activeProductionMarkerExpr);
- if (marker)
- {
- name = getProductionResultId;
- args.append(*LINK(marker->queryExpr()));
- args.append(*LINK(expr->queryChild(1)));
- }
- else
- {
- name = getRootResultId;
- args.append(*createQuoted("matched", makeVoidType()));
- }
- OwnedHqlExpr call = bindTranslatedFunctionCall(name, args);
- IHqlExpression * record = expr->queryRecord();
- StringBuffer rowName;
- getUniqueId(rowName.append("row"));
- OwnedITypeInfo rowType = makeConstantModifier(makeRowReferenceType(record));
- rowType.setown(makeAttributeModifier(LINK(rowType), getLinkCountedAttr()));
- OwnedHqlExpr row = createVariable(rowName, rowType.getClear());
- ctx.addDeclare(row);
- ctx.addAssign(row, call);
- BoundRow * cursor = bindRow(ctx, expr, row);
- return createReferenceSelector(cursor);
- }
/*
Some special nodes are processed as follows:

x (a before b) y        : x -> a -> checkNext(b) -> y
x (a after b) y         : x -> checkPrev(b) -> a -> y
x (a in b) y            : x -> check(a, b) -> y
x (a{2,3}) y            : x -> repeat(a) -> y
x (a+) y                : x -> a +-> y
                               ^-/

Optimization issues:
o   Need information about all named elements that are referenced by MATCHED.
o   The match ids are based on the logical named expressions, so if the IHqlExpression tree is rebuilt, the matched references need patching.
o   Don't want to re-apply the same thing twice - so delay expanding named symbols.
*/
|