hqlnlp.cpp 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef U_OVERRIDE_CXX_ALLOCATION
  14. #define U_OVERRIDE_CXX_ALLOCATION 0 // Enabling this forces all allocation of ICU objects to ICU's heap, but is incompatible with jmemleak
  15. #endif
  16. #include "jliball.hpp"
  17. #include "hqlexpr.hpp"
  18. #include "hqlcerrors.hpp"
  19. #include "hqlnlp.ipp"
  20. #include "hqlhtcpp.ipp"
  21. #include "hqlcatom.hpp"
  22. #include "thorralgo.ipp"
  23. #include "hqlfold.hpp"
  24. #include "hqlcse.ipp"
  25. #include "hqlccommon.hpp"
  26. #include "hqlutil.hpp"
  27. #include "hqltcppc.hpp"
  28. #include "hqlcpputil.hpp"
  29. #define DEFAULT_NLP_DETAIL 1
  30. #define DEFAULT_PATTERN_MAX_LENGTH 4096
  31. #ifdef __64BIT__
  32. #define __DEFINED_64BIT__ true
  33. #else
  34. #define __DEFINED_64BIT__ false
  35. #endif
  36. //---------------------------------------------------------------------------
  37. ValidateKind getValidateKind(IHqlExpression * expr)
  38. {
  39. switch (expr->getOperator())
  40. {
  41. case no_matchtext: return ValidateIsString;
  42. case no_matchunicode: return ValidateIsUnicode;
  43. }
  44. ForEachChild(idx, expr)
  45. {
  46. ValidateKind kind = getValidateKind(expr->queryChild(idx));
  47. if (kind != ValidateIsEither)
  48. return kind;
  49. }
  50. return ValidateIsEither;
  51. }
  52. //---------------------------------------------------------------------------
  53. IHqlExpression * optimizeParse(IHqlExpression * parseExpr)
  54. {
  55. return LINK(parseExpr);
  56. }
  57. //===========================================================================
  58. MatchReference::MatchReference(IHqlExpression * expr)
  59. {
  60. expand(expr, true);
  61. }
  62. bool MatchReference::equals(const MatchReference & other) const
  63. {
  64. if (names.ordinality() != other.names.ordinality())
  65. return false;
  66. ForEachItemIn(idx, names)
  67. {
  68. if (&names.item(idx) != &other.names.item(idx))
  69. return false;
  70. if (&indices.item(idx) != &other.indices.item(idx))
  71. return false;
  72. }
  73. return true;
  74. }
  75. void MatchReference::expand(IHqlExpression * expr, bool isLast)
  76. {
  77. switch (expr->getOperator())
  78. {
  79. case no_pat_select:
  80. expand(expr->queryChild(0), false);
  81. expand(expr->queryChild(1), true);
  82. break;
  83. case no_pat_index:
  84. names.append(*LINK(expr->queryChild(0)));
  85. indices.append(*LINK(expr->queryChild(1)));
  86. break;
  87. default:
  88. names.append(*LINK(expr));
  89. indices.append(*createConstant((int)UNKNOWN_INSTANCE));
  90. break;
  91. }
  92. }
  93. void MatchReference::getPath(StringBuffer & path)
  94. {
  95. ForEachItemIn(idx, names)
  96. {
  97. if (idx)
  98. path.append(".");
  99. path.append(lower(queryPatternName(&names.item(idx))));
  100. }
  101. }
  102. StringBuffer & MatchReference::getDebugText(StringBuffer & out, RegexIdAllocator & idAllocator)
  103. {
  104. ForEachItemIn(i1, names)
  105. {
  106. IHqlExpression & curName = names.item(i1);
  107. IAtom * name = lower(queryPatternName(&curName));
  108. if (i1)
  109. out.append("/");
  110. out.append(str(name));
  111. out.append("{").append(idAllocator.queryID(curName.queryChild(0), name)).append("}");
  112. unsigned inst = (unsigned)indices.item(i1).queryValue()->getIntValue();
  113. if (inst != UNKNOWN_INSTANCE)
  114. out.append("[").append(inst).append("]");
  115. }
  116. return out;
  117. }
  118. void MatchReference::compileMatched(RegexIdAllocator & idAllocator, UnsignedArray & ids, UnsignedArray & indexValues)
  119. {
  120. ForEachItemIn(idx, names)
  121. {
  122. IHqlExpression & curName = names.item(idx);
  123. ids.append(idAllocator.queryID(curName.queryChild(0), lower(queryPatternName(&curName))));
  124. indexValues.append((unsigned)indices.item(idx).queryValue()->getIntValue());
  125. }
  126. }
  127. //---------------------------------------------------------------------------
  128. NlpParseContext::NlpParseContext(IHqlExpression * _expr, IWorkUnit * _wu, const HqlCppOptions & options)
  129. {
  130. workunit = _wu;
  131. expr.set(_expr);
  132. IHqlExpression * search = expr->queryChild(1);
  133. switch (search->queryType()->getTypeCode())
  134. {
  135. case type_unicode:
  136. case type_varunicode:
  137. info.type = type_unicode;
  138. break;
  139. case type_utf8:
  140. info.type = type_utf8;
  141. break;
  142. default:
  143. info.type = type_string;
  144. break;
  145. }
  146. info.charSize = (info.type == type_unicode) ? sizeof(UChar) : sizeof(char);
  147. IHqlExpression * sepAttr = expr->queryAttribute(separatorAtom);
  148. if (sepAttr)
  149. info.separator.set(sepAttr->queryChild(0));
  150. info.caseSensitive = !expr->hasAttribute(noCaseAtom); // default true.
  151. info.dfaRepeatMax = options.dfaRepeatMax;
  152. info.dfaRepeatMaxScore = options.dfaRepeatMaxScore;
  153. info.uidBase = getUniqueId();
  154. allMatched = false;
  155. }
  156. void NlpParseContext::addAllMatched()
  157. {
  158. allMatched = true;
  159. }
  160. unsigned NlpParseContext::addMatchReference(IHqlExpression * matchPathExpr)
  161. {
  162. if (!matchPathExpr) matchPathExpr = expr->queryChild(2);
  163. Owned<MatchReference> ref = new MatchReference(matchPathExpr);
  164. ForEachItemIn(idx, matches)
  165. {
  166. if (ref->equals(matches.item(idx)))
  167. return idx;
  168. }
  169. matches.append(*ref.getClear());
  170. extractMatchedSymbols(matchPathExpr);
  171. return matches.ordinality()-1;
  172. }
  173. IHqlExpression * NlpParseContext::queryValidateExpr(IHqlExpression * expr) const
  174. {
  175. IHqlExpression * unicodeTest = expr->queryChild(2);
  176. switch (info.type)
  177. {
  178. case type_unicode:
  179. case type_utf8: // I think this is best
  180. if (unicodeTest && !unicodeTest->isAttribute())
  181. return unicodeTest;
  182. break;
  183. }
  184. return expr->queryChild(1);
  185. }
  186. void NlpParseContext::buildValidators(HqlCppTranslator & translator, BuildCtx & classctx)
  187. {
  188. if (validators.ordinality())
  189. {
  190. BuildCtx helperctx(classctx);
  191. IHqlStmt * helperClassStmt = translator.beginNestedClass(helperctx, "helper", "INlpHelper");
  192. BuildCtx funcctx(helperctx);
  193. funcctx.addQuotedFunction("virtual IValidator * queryValidator(unsigned i) override");
  194. BuildCtx casectx(funcctx);
  195. casectx.addQuotedCompoundLiteral("switch (i)");
  196. ForEachItemIn(idx, validators)
  197. {
  198. StringBuffer member;
  199. translator.getUniqueId(member.append("val"));
  200. LinkedHqlExpr validateExpr = queryValidateExpr(&validators.item(idx));
  201. ValidateKind kind = getValidateKind(validateExpr);
  202. BuildCtx validatorctx(helperctx);
  203. IHqlStmt * classStmt = translator.beginNestedClass(validatorctx, member, (kind != ValidateIsUnicode) ? "IStringValidator" : "IUnicodeValidator");
  204. {
  205. MemberFunction func(translator, validatorctx);
  206. CHqlBoundExpr boundMatched;
  207. if (kind != ValidateIsUnicode)
  208. {
  209. func.start("virtual bool isValid(unsigned len, const char * data) override");
  210. boundMatched.length.setown(createVariable("len", LINK(sizetType)));
  211. boundMatched.expr.setown(createVariable("data", makeReferenceModifier(LINK(unknownStringType))));
  212. func.ctx.associateExpr(activeMatchTextExpr, boundMatched);
  213. }
  214. else
  215. {
  216. func.start("virtual bool isValid(unsigned len, const UChar * data) override");
  217. boundMatched.length.setown(createVariable("len", LINK(sizetType)));
  218. boundMatched.expr.setown(createVariable("data", makeReferenceModifier(LINK(unknownUnicodeType))));
  219. func.ctx.associateExpr(activeMatchUnicodeExpr, boundMatched);
  220. }
  221. func.ctx.associateExpr(activeNlpMarkerExpr, activeNlpMarkerExpr);
  222. func.ctx.associateExpr(activeValidateMarkerExpr, activeValidateMarkerExpr);
  223. translator.bindTableCursor(func.ctx, queryNlpParsePseudoTable(), queryNlpParsePseudoTable());
  224. if (translator.queryOptions().spotCSE)
  225. validateExpr.setown(spotScalarCSE(validateExpr, NULL, translator.queryOptions().spotCseInIfDatasetConditions));
  226. translator.buildReturn(func.ctx, validateExpr);
  227. }
  228. translator.endNestedClass(classStmt);
  229. StringBuffer s;
  230. s.append("case ").append(idx).append(": return &").append(member).append(";");
  231. casectx.addQuoted(s);
  232. }
  233. funcctx.addReturn(queryQuotedNullExpr());
  234. translator.endNestedClass(helperClassStmt);
  235. classctx.addQuotedLiteral("virtual INlpHelper * queryHelper() override { return &helper; }");
  236. }
  237. }
  238. void NlpParseContext::buildProductions(HqlCppTranslator & translator, BuildCtx & classctx, BuildCtx & startctx)
  239. {
  240. if (!productions.ordinality())
  241. return;
  242. {
  243. BuildCtx metactx(classctx);
  244. metactx.addQuotedFunction("virtual IOutputMetaData * queryProductionMeta(unsigned id) override");
  245. BuildCtx metacasectx(metactx);
  246. metacasectx.addQuotedCompoundLiteral("switch (id)");
  247. StringBuffer s;
  248. ForEachItemIn(i, productions)
  249. {
  250. IHqlExpression & cur = productions.item(i);
  251. MetaInstance meta(translator, cur.queryChild(1)->queryRecord(), false);
  252. translator.buildMetaInfo(meta);
  253. s.clear().append("case ").append(getIntValue(cur.queryChild(0)));
  254. s.append(": return &").append(meta.queryInstanceObject()).append(";");
  255. metacasectx.addQuoted(s);
  256. }
  257. metactx.addQuotedLiteral("return 0;");
  258. }
  259. {
  260. OwnedHqlExpr callback = createVariable("input", makeBoolType());
  261. MemberFunction func(translator, startctx, "virtual size32_t executeProduction(ARowBuilder & crSelf, unsigned id, IProductionCallback * input) override");
  262. func.ctx.associateExpr(activeProductionMarkerExpr, callback);
  263. BuildCtx prodcasectx(func.ctx);
  264. prodcasectx.addQuotedCompoundLiteral("switch (id)");
  265. StringBuffer s, subname, proto;
  266. ForEachItemIn(i, productions)
  267. {
  268. IHqlExpression & cur = productions.item(i);
  269. IHqlExpression * transform = cur.queryChild(1);
  270. if (transform->getOperator() == no_record)
  271. continue;
  272. subname.clear().append("executeProduction").append(i+1);
  273. s.clear().append("case ").append(getIntValue(cur.queryChild(0)));
  274. s.append(": return ").append(subname).append("(crSelf, input);");
  275. prodcasectx.addQuoted(s);
  276. {
  277. proto.clear().append("size32_t ").append(subname).append("(ARowBuilder & crSelf, IProductionCallback * input)");
  278. MemberFunction validateFunc(translator, startctx, proto, MFdynamicproto);
  279. translator.ensureRowAllocated(validateFunc.ctx, "crSelf");
  280. validateFunc.ctx.associateExpr(activeProductionMarkerExpr, callback);
  281. OwnedHqlExpr newTransform = LINK(transform);
  282. OwnedHqlExpr dataset = createDataset(no_anon, LINK(transform->queryRecord()));
  283. translator.buildTransformBody(validateFunc.ctx, newTransform, NULL, NULL, dataset, NULL);
  284. }
  285. }
  286. func.ctx.addQuotedLiteral("return (size32_t)-1;");
  287. }
  288. }
  289. void NlpParseContext::extractMatchedSymbols(IHqlExpression * expr)
  290. {
  291. switch (expr->getOperator())
  292. {
  293. case no_pat_select:
  294. extractMatchedSymbols(expr->queryChild(0));
  295. extractMatchedSymbols(expr->queryChild(1));
  296. break;
  297. case no_pat_index:
  298. expr = expr->queryChild(0);
  299. //fall through
  300. default:
  301. {
  302. assertex(expr->getOperator() == no_pat_instance);
  303. if (!matchedSymbols.contains(*expr))
  304. matchedSymbols.append(*LINK(expr));
  305. break;
  306. }
  307. }
  308. }
  309. bool NlpParseContext::isMatched(IHqlExpression * expr, IAtom * name)
  310. {
  311. if (allMatched)
  312. return true;
  313. ForEachItemIn(i, matchedSymbols)
  314. {
  315. IHqlExpression & cur = matchedSymbols.item(i);
  316. if ((cur.queryChild(0)->queryBody() == expr->queryBody()) &&
  317. (cur.queryChild(1)->queryName() == name))
  318. return true;
  319. }
  320. return false;
  321. }
  322. void NlpParseContext::doExtractValidates(IHqlExpression * expr)
  323. {
  324. if (expr->queryTransformExtra())
  325. return;
  326. expr->setTransformExtraUnlinked(expr);
  327. switch (expr->getOperator())
  328. {
  329. case no_pat_validate:
  330. validators.append(*LINK(expr));
  331. doExtractValidates(expr->queryChild(0));
  332. break;
  333. default:
  334. ForEachChild(idx, expr)
  335. doExtractValidates(expr->queryChild(idx));
  336. break;
  337. }
  338. }
  339. void NlpParseContext::extractValidates(IHqlExpression * expr)
  340. {
  341. lockTransformMutex();
  342. doExtractValidates(expr);
  343. unlockTransformMutex();
  344. }
  345. bool NlpParseContext::isValidMatch(MatchReference & match, unsigned depth, IHqlExpression * pattern)
  346. {
  347. //MORE: This should check whether already visited, especially once we allow recursive patterns!
  348. if (&match.names.item(depth) == pattern)
  349. {
  350. if (depth+1 == match.names.ordinality())
  351. return true;
  352. if (isValidMatch(match, depth+1, pattern))
  353. return true;
  354. }
  355. ForEachChild(idx, pattern)
  356. if (isValidMatch(match, depth, pattern->queryChild(idx)))
  357. return true;
  358. return false;
  359. }
  360. void NlpParseContext::checkValidMatches()
  361. {
  362. ForEachItemIn(idx, matches)
  363. {
  364. if (!isValidMatch(matches.item(idx), 0, expr->queryChild(2)))
  365. {
  366. StringBuffer path;
  367. matches.item(idx).getPath(path);
  368. throwError1(HQLERR_BadMatchedPath, path.str());
  369. }
  370. }
  371. }
  372. void NlpParseContext::compileMatched(NlpAlgorithm & parser)
  373. {
  374. parser.matchInfo->setFormat(info.inputFormat());
  375. ForEachItemIn(idx, matches)
  376. {
  377. UnsignedArray ids;
  378. UnsignedArray indexValues;
  379. matches.item(idx).compileMatched(idAllocator, ids, indexValues);
  380. parser.matchInfo->addResult(ids, indexValues);
  381. }
  382. }
  383. static void getOptions(IHqlExpression * expr, INlpParseAlgorithm::MatchAction & matchAction, INlpParseAlgorithm::ScanAction & scanAction)
  384. {
  385. matchAction = INlpParseAlgorithm::NlpMatchAll;
  386. scanAction = INlpParseAlgorithm::NlpScanNext;
  387. if (expr->hasAttribute(firstAtom)) matchAction = INlpParseAlgorithm::NlpMatchFirst;
  388. if (expr->hasAttribute(allAtom)) matchAction = INlpParseAlgorithm::NlpMatchAll;
  389. if (expr->hasAttribute(noScanAtom)) scanAction = INlpParseAlgorithm::NlpScanNone;
  390. if (expr->hasAttribute(scanAtom)) scanAction = INlpParseAlgorithm::NlpScanNext;
  391. if (expr->hasAttribute(scanAllAtom)) scanAction = INlpParseAlgorithm::NlpScanAll;
  392. if (expr->hasAttribute(wholeAtom)) scanAction = INlpParseAlgorithm::NlpScanWhole;
  393. }
  394. void NlpParseContext::getDebugText(StringBuffer & s, unsigned detail)
  395. {
  396. INlpParseAlgorithm::MatchAction matchAction;
  397. INlpParseAlgorithm::ScanAction scanAction;
  398. getOptions(expr, matchAction, scanAction);
  399. s.newline().append("Options: ").append("Match(");
  400. switch (matchAction)
  401. {
  402. case INlpParseAlgorithm::NlpMatchFirst: s.append("First"); break;
  403. case INlpParseAlgorithm::NlpMatchAll: s.append("All"); break;
  404. }
  405. s.append(") Scan(");
  406. switch (scanAction)
  407. {
  408. case INlpParseAlgorithm::NlpScanWhole: s.append("Whole"); break;
  409. case INlpParseAlgorithm::NlpScanNone: s.append("None"); break;
  410. case INlpParseAlgorithm::NlpScanNext: s.append("Next"); break;
  411. case INlpParseAlgorithm::NlpScanAll: s.append("All"); break;
  412. }
  413. s.append(")").newline();
  414. s.append("Matches:").newline();
  415. ForEachItemIn(idx, matches)
  416. matches.item(idx).getDebugText(s.append("\t"), idAllocator).newline();
  417. }
  418. void NlpParseContext::setParserOptions(INlpParseAlgorithm & parser)
  419. {
  420. INlpParseAlgorithm::MatchAction matchAction;
  421. INlpParseAlgorithm::ScanAction scanAction;
  422. getOptions(expr, matchAction, scanAction);
  423. IHqlExpression * keep = expr->queryAttribute(keepAtom);
  424. unsigned keepLimit = keep ? (unsigned)keep->queryChild(0)->queryValue()->getIntValue() : 0;
  425. IHqlExpression * atmost = expr->queryAttribute(atmostAtom);
  426. unsigned atmostLimit = atmost ? (unsigned)atmost->queryChild(0)->queryValue()->getIntValue() : 0;
  427. IHqlExpression * maxLength = expr->queryAttribute(maxLengthAtom);
  428. size32_t maxLengthValue = maxLength ? (unsigned)maxLength->queryChild(0)->queryValue()->getIntValue() : DEFAULT_PATTERN_MAX_LENGTH;
  429. parser.setOptions(matchAction, scanAction, info.inputFormat(), keepLimit, atmostLimit);
  430. parser.setChoose(expr->hasAttribute(minAtom), expr->hasAttribute(maxAtom), expr->hasAttribute(bestAtom), !expr->hasAttribute(manyAtom));
  431. parser.setJoin(expr->hasAttribute(notMatchedAtom), expr->hasAttribute(notMatchedOnlyAtom));
  432. parser.setLimit(maxLengthValue);
  433. }
  434. //---------------------------------------------------------------------------
  435. void HqlCppTranslator::doBuildParseTransform(BuildCtx & classctx, IHqlExpression * expr)
  436. {
  437. MemberFunction func(*this, classctx, "virtual size32_t transform(ARowBuilder & crSelf, const void * _left, IMatchedResults * matched, IMatchWalker * walker) override");
  438. ensureRowAllocated(func.ctx, "crSelf");
  439. func.ctx.addQuotedLiteral("const unsigned char * left = (const unsigned char *) _left;");
  440. func.ctx.associateExpr(activeNlpMarkerExpr, activeNlpMarkerExpr);
  441. bindTableCursor(func.ctx, queryNlpParsePseudoTable(), queryNlpParsePseudoTable());
  442. // Bind left to "left" and right to RIGHT
  443. IHqlExpression * dataset = expr->queryChild(0);
  444. IHqlExpression * transform = expr->queryChild(4);
  445. BoundRow * selfCursor = bindSelf(func.ctx, expr, "crSelf");
  446. if (transform->getOperator() == no_newtransform)
  447. bindTableCursor(func.ctx, dataset, "left");
  448. else
  449. bindTableCursor(func.ctx, dataset, "left", no_left, querySelSeq(expr));
  450. associateSkipReturnMarker(func.ctx, queryZero(), selfCursor);
  451. doTransform(func.ctx, transform, selfCursor);
  452. buildReturnRecordSize(func.ctx, selfCursor);
  453. }
  454. void HqlCppTranslator::doBuildParseSearchText(BuildCtx & classctx, IHqlExpression * dataset, IHqlExpression * search, type_t searchTypeCode, ITypeInfo * transferType)
  455. {
  456. bool needToFree = true;
  457. {
  458. MemberFunction func(*this, classctx);
  459. if (searchTypeCode == type_unicode)
  460. {
  461. func.start("virtual void getSearchText(size32_t & retLen, char * & _retText, const void * _self) override");
  462. func.ctx.addQuotedLiteral("UChar * & retText = *(UChar * *)&_retText;"); // don't ask.
  463. }
  464. else
  465. func.start("virtual void getSearchText(size32_t & retLen, char * & retText, const void * _self) override");
  466. func.ctx.addQuotedLiteral("const unsigned char * self = (const unsigned char *) _self;");
  467. bindTableCursor(func.ctx, dataset, "self");
  468. Owned<ITypeInfo> retType;
  469. switch (searchTypeCode)
  470. {
  471. case type_unicode:
  472. retType.setown(makeUnicodeType(UNKNOWN_LENGTH, 0));
  473. break;
  474. case type_utf8:
  475. retType.setown(makeUtf8Type(UNKNOWN_LENGTH, 0));
  476. break;
  477. default:
  478. retType.setown(makeStringType(UNKNOWN_LENGTH, NULL, NULL));
  479. break;
  480. }
  481. OwnedHqlExpr castSearch = ensureExprType(search, retType);
  482. castSearch.setown(foldHqlExpression(castSearch));
  483. retType.setown(makeReferenceModifier(retType.getClear()));
  484. switch (castSearch->getOperator())
  485. {
  486. case no_select:
  487. case no_constant:
  488. //Not strictly true - could be conditional
  489. //also misses lots of cases - but I doubt anyone will ever complain...
  490. needToFree = false;
  491. break;
  492. }
  493. OwnedHqlExpr retLen = createVariable("retLen", LINK(sizetType));
  494. OwnedHqlExpr tempLen;
  495. if (transferType)
  496. tempLen.setown(func.ctx.getTempDeclare(sizetType, NULL));
  497. CHqlBoundTarget target;
  498. target.length.set(tempLen ? tempLen : retLen);
  499. target.expr.setown(createVariable("retText", LINK(retType)));
  500. if (needToFree)
  501. {
  502. buildExprAssign(func.ctx, target, castSearch);
  503. }
  504. else
  505. {
  506. CHqlBoundExpr bound;
  507. buildExpr(func.ctx, castSearch, bound);
  508. OwnedHqlExpr len = getBoundLength(bound);
  509. func.ctx.addAssign(target.length, len);
  510. OwnedHqlExpr transferred = createValue(no_cast, LINK(retType), LINK(bound.expr));
  511. func.ctx.addAssign(target.expr, transferred);
  512. }
  513. if (tempLen)
  514. {
  515. OwnedHqlExpr source = target.getTranslatedExpr();
  516. OwnedHqlExpr transferred = createValue(no_typetransfer, LINK(transferType), LINK(source));
  517. OwnedHqlExpr length = createValue(no_charlen, LINK(sizetType), LINK(transferred));
  518. buildAssignToTemp(func.ctx, retLen, length);
  519. }
  520. }
  521. doBuildBoolFunction(classctx, "searchTextNeedsFree", needToFree);
  522. }
  523. void HqlCppTranslator::doBuildParseSearchText(BuildCtx & classctx, IHqlExpression * expr)
  524. {
  525. doBuildParseSearchText(classctx, expr->queryChild(0), expr->queryChild(1), nlpParse->searchType(), NULL);
  526. }
  527. void HqlCppTranslator::doBuildParseExtra(BuildCtx & classctx, IHqlExpression * expr)
  528. {
  529. StringBuffer flags;
  530. if (expr->hasAttribute(groupAtom)) flags.append("|PFgroup");
  531. if (expr->hasAttribute(parallelAtom)) flags.append("|PFparallel");
  532. if (flags.length())
  533. doBuildUnsignedFunction(classctx, "getFlags", flags.str()+1);
  534. }
  535. void HqlCppTranslator::doBuildParseValidators(BuildCtx & classctx, IHqlExpression * expr)
  536. {
  537. nlpParse->extractValidates(expr->queryChild(2));
  538. nlpParse->buildValidators(*this, classctx);
  539. }
  540. void HqlCppTranslator::doBuildParseCompiled(BuildCtx & classctx, MemoryBuffer & buffer)
  541. {
  542. if (buffer.length() > 1000000)
  543. WARNING1(CategoryEfficiency, HQLWRN_ParseVeryLargeDefinition, buffer.length());
  544. BuildCtx funcctx(classctx);
  545. MemoryBuffer compressed;
  546. compressToBuffer(compressed, buffer.length(), buffer.toByteArray());
  547. unsigned buffLen = compressed.length();
  548. CHqlBoundExpr bound;
  549. StringBuffer s;
  550. OwnedHqlExpr srcData = addDataLiteral((const char *)compressed.toByteArray(), buffLen);
  551. OwnedHqlExpr retData = createVariable("retData", makePointerType(makeVoidType()));
  552. funcctx.addQuotedFunction("virtual void queryCompiled(IResourceContext *ctx, size32_t & retLen, const void * & retData) override");
  553. funcctx.addQuotedF("//uncompressed size = %d", buffer.length());
  554. buildExpr(funcctx, srcData, bound);
  555. funcctx.addQuoted(s.append("retLen = ").append(buffLen).append(";"));
  556. funcctx.addAssign(retData, srcData);
  557. }
  558. void HqlCppTranslator::gatherExplicitMatched(IHqlExpression * expr)
  559. {
  560. ForEachChild(idx, expr)
  561. {
  562. IHqlExpression * cur = expr->queryChild(idx);
  563. if (cur->getOperator() == no_matched)
  564. {
  565. IHqlExpression * arg = cur->queryChild(0);
  566. if (arg->getOperator() == no_all)
  567. nlpParse->addAllMatched();
  568. else
  569. nlpParse->addMatchReference(arg);
  570. }
  571. }
  572. }
  573. ABoundActivity * HqlCppTranslator::doBuildActivityParse(BuildCtx & ctx, IHqlExpression * _expr)
  574. {
  575. Owned<ABoundActivity> boundDataset = buildCachedActivity(ctx, _expr->queryChild(0));
  576. cycle_t startCycles = get_cycles_now();
  577. OwnedHqlExpr expr = optimizeParse(_expr);
  578. Owned<ActivityInstance> instance = new ActivityInstance(*this, ctx, TAKparse, expr, "Parse");
  579. buildActivityFramework(instance);
  580. buildInstancePrefix(instance);
  581. //This will become conditional on the flags....
  582. cycle_t startPrepareCycles = get_cycles_now();
  583. if (expr->hasAttribute(tomitaAtom))
  584. nlpParse = createTomitaContext(expr, code->workunit, options);
  585. else
  586. {
  587. //In 64bit the engines have enough stack space to use the stack-based regex implementation
  588. byte algorithm = __DEFINED_64BIT__ ? NLPAregexStack : NLPAregexHeap;
  589. switch (options.regexVersion)
  590. {
  591. case 1:
  592. algorithm = NLPAregexStack;
  593. break;
  594. case 2:
  595. algorithm = NLPAregexHeap;
  596. break;
  597. }
  598. IHqlExpression * algorithmHint = queryHintChild(expr, algorithmAtom, 0);
  599. if (matchesConstantString(algorithmHint, "stack", true))
  600. algorithm = NLPAregexStack;
  601. else if (matchesConstantString(algorithmHint, "heap", true))
  602. algorithm = NLPAregexHeap;
  603. nlpParse = createRegexContext(expr, code->workunit, options, algorithm);
  604. }
  605. gatherExplicitMatched(expr);
  606. doBuildParseTransform(instance->startctx, expr); // also gathers all the MATCHED() definitions.
  607. doBuildParseSearchText(instance->startctx, expr);
  608. doBuildParseValidators(instance->nestedctx, expr);
  609. doBuildParseExtra(instance->startctx, expr);
  610. if (options.timeTransforms)
  611. noteFinishedTiming("compile:PARSE:prepare", startPrepareCycles);
  612. MemoryBuffer buffer;
  613. cycle_t startCompileCycles = get_cycles_now();
  614. nlpParse->compileSearchPattern();
  615. nlpParse->queryParser()->serialize(buffer);
  616. if (nlpParse->isGrammarAmbiguous())
  617. WARNING1(CategoryEfficiency, HQLWRN_GrammarIsAmbiguous, instance->activityId);
  618. doBuildParseCompiled(instance->classctx, buffer);
  619. if (options.timeTransforms)
  620. noteFinishedTiming("compile:PARSE:compile", startCompileCycles);
  621. nlpParse->buildProductions(*this, instance->classctx, instance->startctx);
  622. #if 0
  623. StringBuffer text;
  624. getSystemTraceInfo(text, PerfMonProcMem);
  625. wu()->setDebugValue("maxMemory", text.str(), true);
  626. #endif
  627. if ((options.debugNlp != 0) && !options.obfuscateOutput)
  628. {
  629. BuildCtx subctx(instance->classctx);
  630. subctx.addQuotedLiteral("#if 0\nHuman readable form of the grammar");
  631. StringBuffer s;
  632. nlpParse->getDebugText(s, options.debugNlp);
  633. subctx.addQuoted(s);
  634. subctx.addQuotedLiteral("#endif");
  635. if (options.debugNlpAsHint)
  636. {
  637. StringBuffer hintText;
  638. hintText.append("<Hint type=\"activity\" id=\"").append(instance->activityId).append("\">").newline();
  639. encodeXML(s.str(), hintText, 0, s.length(), false);
  640. hintText.append("</Hint>");
  641. code->addHint(hintText.str(), ctxCallback);
  642. }
  643. }
  644. ::Release(nlpParse);
  645. nlpParse = NULL;
  646. buildInstanceSuffix(instance);
  647. buildConnectInputOutput(ctx, instance, boundDataset, 0, 0);
  648. if (options.timeTransforms)
  649. noteFinishedTiming("compile:PARSE", startCycles);
  650. return instance->getBoundActivity();
  651. }
  652. //---------------------------------------------------------------------------
  653. void getCheckRange(IHqlExpression * range, unsigned & minLength, unsigned & maxLength, unsigned charSize)
  654. {
  655. minLength = 0;
  656. maxLength = PATTERN_UNLIMITED_LENGTH;
  657. switch (range->getOperator())
  658. {
  659. case no_constant:
  660. minLength = maxLength = (unsigned)range->queryValue()->getIntValue();
  661. break;
  662. case no_rangefrom:
  663. minLength = (unsigned)range->queryChild(0)->queryValue()->getIntValue();
  664. break;
  665. case no_rangeto:
  666. maxLength = (unsigned)range->queryChild(0)->queryValue()->getIntValue();
  667. break;
  668. case no_range:
  669. minLength = (unsigned)range->queryChild(0)->queryValue()->getIntValue();
  670. maxLength = (unsigned)range->queryChild(1)->queryValue()->getIntValue();
  671. break;
  672. }
  673. minLength *= charSize;
  674. if (maxLength < PATTERN_UNLIMITED_LENGTH / charSize)
  675. maxLength *= charSize;
  676. else
  677. maxLength = PATTERN_UNLIMITED_LENGTH;
  678. }
  679. void HqlCppTranslator::doBuildMatched(BuildCtx & ctx, const CHqlBoundTarget * target, IHqlExpression * expr, CHqlBoundExpr * bound)
  680. {
  681. if (!nlpParse)
  682. throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
  683. if (!ctx.queryMatchExpr(activeNlpMarkerExpr))
  684. {
  685. CHqlBoundExpr match;
  686. if (!buildExprInCorrectContext(ctx, expr, match, false))
  687. throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
  688. if (target)
  689. assign(ctx, *target, match);
  690. else
  691. bound->set(match);
  692. return;
  693. }
  694. IHqlExpression * patternExpr = queryRealChild(expr, 0);
  695. if (ctx.queryMatchExpr(activeValidateMarkerExpr))
  696. {
  697. CHqlBoundExpr match;
  698. switch (expr->getOperator())
  699. {
  700. case no_matchtext:
  701. if (!ctx.getMatchExpr(activeMatchTextExpr, match))
  702. throwError(HQLERR_MatchTextNotUnicode);
  703. if (patternExpr)
  704. throwError1(HQLERR_NoArgumentsInValidator, "MATCHTEXT");
  705. break;
  706. case no_matchunicode:
  707. if (!ctx.getMatchExpr(activeMatchUnicodeExpr, match))
  708. throwError(HQLERR_MatchUnicodeNotText);
  709. if (patternExpr)
  710. throwError1(HQLERR_NoArgumentsInValidator, "MATCHUNICODE");
  711. break;
  712. case no_matchutf8:
  713. if (!ctx.getMatchExpr(activeMatchUtf8Expr, match))
  714. throwError(HQLERR_MatchUtf8NotText);
  715. if (patternExpr)
  716. throwError1(HQLERR_NoArgumentsInValidator, "MATCHUTF8");
  717. break;
  718. default:
  719. throwError(HQLERR_MatchTextOrUnicode);
  720. }
  721. if (target)
  722. assign(ctx, *target, match);
  723. else
  724. bound->set(match);
  725. return;
  726. }
  727. unsigned matchedIndex = nlpParse->addMatchReference(patternExpr);
  728. IIdAtom * func;
  729. switch (expr->getOperator())
  730. {
  731. case no_matched: func = getMatchedId; break;
  732. case no_matchtext: func = getMatchTextId; break;
  733. case no_matchunicode: func = getMatchUnicodeId; break;
  734. case no_matchlength: func = getMatchLengthId; break;
  735. case no_matchposition: func = getMatchPositionId; break;
  736. case no_matchutf8: func = getMatchUtf8Id; break;
  737. default: UNIMPLEMENTED;
  738. }
  739. HqlExprArray args;
  740. args.append(*createQuoted("matched", makeVoidType()));
  741. args.append(*createConstant((__int64)matchedIndex));
  742. OwnedHqlExpr call = bindFunctionCall(func, args);
  743. buildExprOrAssign(ctx, target, call, bound);
  744. }
  745. IReferenceSelector * HqlCppTranslator::doBuildRowMatchRow(BuildCtx & ctx, IHqlExpression * expr, bool isNew)
  746. {
  747. if (!nlpParse)
  748. throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
  749. if (!ctx.queryMatchExpr(activeNlpMarkerExpr))
  750. throwError(HQLERR_AccessMatchAttrInChildQuery);
  751. unsigned matchedIndex = nlpParse->addMatchReference(expr->queryChild(1));
  752. HqlExprArray args;
  753. args.append(*createQuoted("matched", makeVoidType()));
  754. args.append(*createConstant((__int64)matchedIndex));
  755. OwnedHqlExpr call = bindTranslatedFunctionCall(getMatchRowId, args);
  756. IHqlExpression * record = expr->queryRecord();
  757. StringBuffer rowName;
  758. getUniqueId(rowName.append("row"));
  759. OwnedHqlExpr row = createVariable(rowName, makeConstantModifier(makeRowReferenceType(record)));
  760. ctx.addDeclare(row);
  761. ctx.addAssign(row, call);
  762. BoundRow * cursor = bindRow(ctx, expr, row);
  763. if (expr->queryChild(1))
  764. cursor->setConditional(true);
  765. return createReferenceSelector(cursor);
  766. }
  767. void HqlCppTranslator::doBuildMatchAttr(BuildCtx & ctx, const CHqlBoundTarget * target, IHqlExpression * expr, CHqlBoundExpr * bound)
  768. {
  769. if (!nlpParse)
  770. throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
  771. if (!ctx.queryMatchExpr(activeNlpMarkerExpr) && !ctx.queryMatchExpr(activeProductionMarkerExpr))
  772. {
  773. CHqlBoundExpr match;
  774. if (!buildExprInCorrectContext(ctx, expr, match, false))
  775. throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
  776. if (target)
  777. assign(ctx, *target, match);
  778. else
  779. bound->set(match);
  780. return;
  781. }
  782. HqlExprAssociation * marker = ctx.queryMatchExpr(activeProductionMarkerExpr);
  783. ITypeInfo * exprType = expr->queryType();
  784. if (marker)
  785. {
  786. HqlExprArray args;
  787. args.append(*LINK(marker->queryExpr()));
  788. args.append(*LINK(expr->queryChild(0)));
  789. IIdAtom * name;
  790. switch (exprType->getTypeCode())
  791. {
  792. case type_string:
  793. name = getProductionTextId;
  794. break;
  795. case type_unicode:
  796. name = getProductionUnicodeId;
  797. break;
  798. case type_utf8:
  799. name = getProductionUtf8Id;
  800. break;
  801. default:
  802. throwUnexpectedType(exprType);
  803. }
  804. OwnedHqlExpr call = bindFunctionCall(name, args);
  805. doBuildCall(ctx, target, call, bound);
  806. }
  807. else
  808. {
  809. node_operator op;
  810. switch (exprType->getTypeCode())
  811. {
  812. case type_string:
  813. op = no_matchtext;
  814. break;
  815. case type_unicode:
  816. op = no_matchunicode;
  817. break;
  818. case type_utf8:
  819. op = no_matchutf8;
  820. break;
  821. default:
  822. throwUnexpectedType(exprType);
  823. }
  824. OwnedHqlExpr newExpr = createValue(op, LINK(exprType));
  825. doBuildMatched(ctx, target, expr, bound);
  826. }
  827. }
  828. IReferenceSelector * HqlCppTranslator::doBuildRowMatchAttr(BuildCtx & ctx, IHqlExpression * expr)
  829. {
  830. if (!ctx.queryMatchExpr(activeNlpMarkerExpr) && !ctx.queryMatchExpr(activeProductionMarkerExpr))
  831. throwError(HQLERR_AccessMatchAttrInChildQuery);
  832. HqlExprArray args;
  833. IIdAtom * name;
  834. HqlExprAssociation * marker = ctx.queryMatchExpr(activeProductionMarkerExpr);
  835. if (marker)
  836. {
  837. name = getProductionResultId;
  838. args.append(*LINK(marker->queryExpr()));
  839. args.append(*LINK(expr->queryChild(1)));
  840. }
  841. else
  842. {
  843. name = getRootResultId;
  844. args.append(*createQuoted("matched", makeVoidType()));
  845. }
  846. OwnedHqlExpr call = bindTranslatedFunctionCall(name, args);
  847. IHqlExpression * record = expr->queryRecord();
  848. StringBuffer rowName;
  849. getUniqueId(rowName.append("row"));
  850. OwnedITypeInfo rowType = makeConstantModifier(makeRowReferenceType(record));
  851. rowType.setown(makeAttributeModifier(LINK(rowType), getLinkCountedAttr()));
  852. OwnedHqlExpr row = createVariable(rowName, rowType.getClear());
  853. ctx.addDeclare(row);
  854. ctx.addAssign(row, call);
  855. BoundRow * cursor = bindRow(ctx, expr, row);
  856. return createReferenceSelector(cursor);
  857. }
  858. /*
  859. Some special nodes are processed as follows:
  860. x (a before b) y : x -> a -> checkNext(b) -> y
  861. x (a after b) y : x -> checkPrev(b) -> a -> y
  862. x (a in b) y : x -> check(a, b) -> y
  863. x (a{2,3}) y : x -> repeat(a) -> y
  864. x (a+) y : x -> a +-> y
  865. ^-/
  866. Optimization issues:
  867. o Need information about all named elements that are referenced by MATCHED
  868. o The match ids are based on the logical named expressions, so if the IHqlExpression tree is rebuilt, the MATCHED references need patching.
  869. o Don't want to re-apply same thing twice - so delay expanding named symbols.
  870. */