hqlnlp.cpp 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #ifndef U_OVERRIDE_CXX_ALLOCATION
  14. #define U_OVERRIDE_CXX_ALLOCATION 0 // Enabling this forces all allocation of ICU objects to ICU's heap, but is incompatible with jmemleak
  15. #endif
  16. #include "jliball.hpp"
  17. #include "hqlexpr.hpp"
  18. #include "hqlcerrors.hpp"
  19. #include "hqlnlp.ipp"
  20. #include "hqlhtcpp.ipp"
  21. #include "hqlcatom.hpp"
  22. #include "thorralgo.ipp"
  23. #include "hqlfold.hpp"
  24. #include "hqlcse.ipp"
  25. #include "hqlccommon.hpp"
  26. #include "hqlutil.hpp"
  27. #include "hqltcppc.hpp"
  28. #include "hqlcpputil.hpp"
  29. #include "unicode/uchar.h"
  30. #define DEFAULT_NLP_DETAIL 1
  31. #define DEFAULT_PATTERN_MAX_LENGTH 4096
  32. #ifdef __64BIT__
  33. #define __DEFINED_64BIT__ true
  34. #else
  35. #define __DEFINED_64BIT__ false
  36. #endif
  37. //---------------------------------------------------------------------------
  38. ValidateKind getValidateKind(IHqlExpression * expr)
  39. {
  40. switch (expr->getOperator())
  41. {
  42. case no_matchtext: return ValidateIsString;
  43. case no_matchunicode: return ValidateIsUnicode;
  44. }
  45. ForEachChild(idx, expr)
  46. {
  47. ValidateKind kind = getValidateKind(expr->queryChild(idx));
  48. if (kind != ValidateIsEither)
  49. return kind;
  50. }
  51. return ValidateIsEither;
  52. }
  53. //---------------------------------------------------------------------------
  54. IHqlExpression * optimizeParse(IHqlExpression * parseExpr)
  55. {
  56. return LINK(parseExpr);
  57. }
  58. //===========================================================================
  59. MatchReference::MatchReference(IHqlExpression * expr)
  60. {
  61. expand(expr, true);
  62. }
  63. bool MatchReference::equals(const MatchReference & other) const
  64. {
  65. if (names.ordinality() != other.names.ordinality())
  66. return false;
  67. ForEachItemIn(idx, names)
  68. {
  69. if (&names.item(idx) != &other.names.item(idx))
  70. return false;
  71. if (&indices.item(idx) != &other.indices.item(idx))
  72. return false;
  73. }
  74. return true;
  75. }
  76. void MatchReference::expand(IHqlExpression * expr, bool isLast)
  77. {
  78. switch (expr->getOperator())
  79. {
  80. case no_pat_select:
  81. expand(expr->queryChild(0), false);
  82. expand(expr->queryChild(1), true);
  83. break;
  84. case no_pat_index:
  85. names.append(*LINK(expr->queryChild(0)));
  86. indices.append(*LINK(expr->queryChild(1)));
  87. break;
  88. default:
  89. names.append(*LINK(expr));
  90. indices.append(*createConstant((int)UNKNOWN_INSTANCE));
  91. break;
  92. }
  93. }
  94. void MatchReference::getPath(StringBuffer & path)
  95. {
  96. ForEachItemIn(idx, names)
  97. {
  98. if (idx)
  99. path.append(".");
  100. path.append(queryPatternName(&names.item(idx))->lower());
  101. }
  102. }
  103. StringBuffer & MatchReference::getDebugText(StringBuffer & out, RegexIdAllocator & idAllocator)
  104. {
  105. ForEachItemIn(i1, names)
  106. {
  107. IHqlExpression & curName = names.item(i1);
  108. IAtom * name = queryPatternName(&curName)->lower();
  109. if (i1)
  110. out.append("/");
  111. out.append(name->str());
  112. out.append("{").append(idAllocator.queryID(curName.queryChild(0), name)).append("}");
  113. unsigned inst = (unsigned)indices.item(i1).queryValue()->getIntValue();
  114. if (inst != UNKNOWN_INSTANCE)
  115. out.append("[").append(inst).append("]");
  116. }
  117. return out;
  118. }
  119. void MatchReference::compileMatched(RegexIdAllocator & idAllocator, UnsignedArray & ids, UnsignedArray & indexValues)
  120. {
  121. ForEachItemIn(idx, names)
  122. {
  123. IHqlExpression & curName = names.item(idx);
  124. ids.append(idAllocator.queryID(curName.queryChild(0), queryPatternName(&curName)->lower()));
  125. indexValues.append((unsigned)indices.item(idx).queryValue()->getIntValue());
  126. }
  127. }
  128. //---------------------------------------------------------------------------
  129. NlpParseContext::NlpParseContext(IHqlExpression * _expr, IWorkUnit * _wu, const HqlCppOptions & options, ITimeReporter * _timeReporter) : timeReporter(_timeReporter)
  130. {
  131. workunit = _wu;
  132. expr.set(_expr);
  133. IHqlExpression * search = expr->queryChild(1);
  134. switch (search->queryType()->getTypeCode())
  135. {
  136. case type_unicode:
  137. case type_varunicode:
  138. info.type = type_unicode;
  139. break;
  140. case type_utf8:
  141. info.type = type_utf8;
  142. break;
  143. default:
  144. info.type = type_string;
  145. break;
  146. }
  147. info.charSize = (info.type == type_unicode) ? sizeof(UChar) : sizeof(char);
  148. IHqlExpression * sepAttr = expr->queryAttribute(separatorAtom);
  149. if (sepAttr)
  150. info.separator.set(sepAttr->queryChild(0));
  151. info.caseSensitive = !expr->hasAttribute(noCaseAtom); // default true.
  152. info.dfaRepeatMax = options.dfaRepeatMax;
  153. info.dfaRepeatMaxScore = options.dfaRepeatMaxScore;
  154. info.uidBase = getUniqueId();
  155. allMatched = false;
  156. }
  157. void NlpParseContext::addAllMatched()
  158. {
  159. allMatched = true;
  160. }
  161. unsigned NlpParseContext::addMatchReference(IHqlExpression * matchPathExpr)
  162. {
  163. if (!matchPathExpr) matchPathExpr = expr->queryChild(2);
  164. Owned<MatchReference> ref = new MatchReference(matchPathExpr);
  165. ForEachItemIn(idx, matches)
  166. {
  167. if (ref->equals(matches.item(idx)))
  168. return idx;
  169. }
  170. matches.append(*ref.getClear());
  171. extractMatchedSymbols(matchPathExpr);
  172. return matches.ordinality()-1;
  173. }
  174. IHqlExpression * NlpParseContext::queryValidateExpr(IHqlExpression * expr) const
  175. {
  176. IHqlExpression * unicodeTest = expr->queryChild(2);
  177. switch (info.type)
  178. {
  179. case type_unicode:
  180. case type_utf8: // I think this is best
  181. if (unicodeTest && !unicodeTest->isAttribute())
  182. return unicodeTest;
  183. break;
  184. }
  185. return expr->queryChild(1);
  186. }
  187. void NlpParseContext::buildValidators(HqlCppTranslator & translator, BuildCtx & classctx)
  188. {
  189. if (validators.ordinality())
  190. {
  191. BuildCtx helperctx(classctx);
  192. translator.beginNestedClass(helperctx, "helper", "INlpHelper");
  193. BuildCtx funcctx(helperctx);
  194. funcctx.addQuotedCompound("virtual IValidator * queryValidator(unsigned i)");
  195. BuildCtx casectx(funcctx);
  196. casectx.addQuotedCompound("switch (i)");
  197. ForEachItemIn(idx, validators)
  198. {
  199. StringBuffer member;
  200. translator.getUniqueId(member.append("val"));
  201. LinkedHqlExpr validateExpr = queryValidateExpr(&validators.item(idx));
  202. ValidateKind kind = getValidateKind(validateExpr);
  203. BuildCtx validatorctx(helperctx);
  204. translator.beginNestedClass(validatorctx, member, (kind != ValidateIsUnicode) ? "IStringValidator" : "IUnicodeValidator");
  205. BuildCtx validctx(validatorctx);
  206. CHqlBoundExpr boundMatched;
  207. if (kind != ValidateIsUnicode)
  208. {
  209. validctx.addQuotedCompound("virtual bool isValid(unsigned len, const char * data)");
  210. boundMatched.length.setown(createVariable("len", LINK(sizetType)));
  211. boundMatched.expr.setown(createVariable("data", makeReferenceModifier(LINK(unknownStringType))));
  212. validctx.associateExpr(activeMatchTextExpr, boundMatched);
  213. }
  214. else
  215. {
  216. validctx.addQuotedCompound("virtual bool isValid(unsigned len, const UChar * data)");
  217. boundMatched.length.setown(createVariable("len", LINK(sizetType)));
  218. boundMatched.expr.setown(createVariable("data", makeReferenceModifier(LINK(unknownUnicodeType))));
  219. validctx.associateExpr(activeMatchUnicodeExpr, boundMatched);
  220. }
  221. validctx.associateExpr(activeNlpMarkerExpr, activeNlpMarkerExpr);
  222. validctx.associateExpr(activeValidateMarkerExpr, activeValidateMarkerExpr);
  223. translator.bindTableCursor(validctx, queryNlpParsePseudoTable(), queryNlpParsePseudoTable());
  224. if (translator.queryOptions().spotCSE)
  225. validateExpr.setown(spotScalarCSE(validateExpr));
  226. translator.buildReturn(validctx, validateExpr);
  227. translator.endNestedClass();
  228. StringBuffer s;
  229. s.append("case ").append(idx).append(": return &").append(member).append(";");
  230. casectx.addQuoted(s);
  231. }
  232. funcctx.addReturn(queryQuotedNullExpr());
  233. translator.endNestedClass();
  234. classctx.addQuoted("virtual INlpHelper * queryHelper() { return &helper; }");
  235. }
  236. }
  237. void NlpParseContext::buildProductions(HqlCppTranslator & translator, BuildCtx & classctx, BuildCtx & startctx)
  238. {
  239. if (!productions.ordinality())
  240. return;
  241. {
  242. BuildCtx metactx(classctx);
  243. metactx.addQuotedCompound("virtual IOutputMetaData * queryProductionMeta(unsigned id)");
  244. BuildCtx metacasectx(metactx);
  245. metacasectx.addQuotedCompound("switch (id)");
  246. StringBuffer s;
  247. ForEachItemIn(i, productions)
  248. {
  249. IHqlExpression & cur = productions.item(i);
  250. MetaInstance meta(translator, cur.queryChild(1)->queryRecord(), false);
  251. translator.buildMetaInfo(meta);
  252. s.clear().append("case ").append(getIntValue(cur.queryChild(0)));
  253. s.append(": return &").append(meta.queryInstanceObject()).append(";");
  254. metacasectx.addQuoted(s);
  255. }
  256. metactx.addQuoted("return 0;");
  257. }
  258. {
  259. OwnedHqlExpr callback = createVariable("input", makeBoolType());
  260. BuildCtx prodctx(startctx);
  261. prodctx.addQuotedCompound("virtual size32_t executeProduction(ARowBuilder & crSelf, unsigned id, IProductionCallback * input)");
  262. prodctx.associateExpr(activeProductionMarkerExpr, callback);
  263. BuildCtx prodcasectx(prodctx);
  264. prodcasectx.addQuotedCompound("switch (id)");
  265. StringBuffer s, subname;
  266. ForEachItemIn(i, productions)
  267. {
  268. IHqlExpression & cur = productions.item(i);
  269. IHqlExpression * transform = cur.queryChild(1);
  270. if (transform->getOperator() == no_record)
  271. continue;
  272. subname.clear().append("executeProduction").append(i+1);
  273. s.clear().append("case ").append(getIntValue(cur.queryChild(0)));
  274. s.append(": return ").append(subname).append("(crSelf, input);");
  275. prodcasectx.addQuoted(s);
  276. {
  277. BuildCtx childctx(startctx);
  278. childctx.addQuotedCompound(s.clear().append("size32_t ").append(subname).append("(ARowBuilder & crSelf, IProductionCallback * input)"));
  279. translator.ensureRowAllocated(childctx, "crSelf");
  280. childctx.associateExpr(activeProductionMarkerExpr, callback);
  281. OwnedHqlExpr newTransform = LINK(transform);
  282. OwnedHqlExpr dataset = createDataset(no_anon, LINK(transform->queryRecord()));
  283. translator.buildTransformBody(childctx, newTransform, NULL, NULL, dataset, NULL);
  284. }
  285. }
  286. prodctx.addQuoted("return (size32_t)-1;");
  287. }
  288. }
  289. void NlpParseContext::extractMatchedSymbols(IHqlExpression * expr)
  290. {
  291. switch (expr->getOperator())
  292. {
  293. case no_pat_select:
  294. extractMatchedSymbols(expr->queryChild(0));
  295. extractMatchedSymbols(expr->queryChild(1));
  296. break;
  297. case no_pat_index:
  298. expr = expr->queryChild(0);
  299. //fall through
  300. default:
  301. {
  302. assertex(expr->getOperator() == no_pat_instance);
  303. if (!matchedSymbols.contains(*expr))
  304. matchedSymbols.append(*LINK(expr));
  305. break;
  306. }
  307. }
  308. }
  309. bool NlpParseContext::isMatched(IHqlExpression * expr, IAtom * name)
  310. {
  311. if (allMatched)
  312. return true;
  313. ForEachItemIn(i, matchedSymbols)
  314. {
  315. IHqlExpression & cur = matchedSymbols.item(i);
  316. if ((cur.queryChild(0)->queryBody() == expr->queryBody()) &&
  317. (cur.queryChild(1)->queryName() == name))
  318. return true;
  319. }
  320. return false;
  321. }
  322. void NlpParseContext::doExtractValidates(IHqlExpression * expr)
  323. {
  324. if (expr->queryTransformExtra())
  325. return;
  326. expr->setTransformExtraUnlinked(expr);
  327. switch (expr->getOperator())
  328. {
  329. case no_pat_validate:
  330. validators.append(*LINK(expr));
  331. doExtractValidates(expr->queryChild(0));
  332. break;
  333. default:
  334. ForEachChild(idx, expr)
  335. doExtractValidates(expr->queryChild(idx));
  336. break;
  337. }
  338. }
  339. void NlpParseContext::extractValidates(IHqlExpression * expr)
  340. {
  341. lockTransformMutex();
  342. doExtractValidates(expr);
  343. unlockTransformMutex();
  344. }
  345. bool NlpParseContext::isValidMatch(MatchReference & match, unsigned depth, IHqlExpression * pattern)
  346. {
  347. //MORE: This should check whether already visited, especially once we allow recursive patterns!
  348. if (&match.names.item(depth) == pattern)
  349. {
  350. if (depth+1 == match.names.ordinality())
  351. return true;
  352. if (isValidMatch(match, depth+1, pattern))
  353. return true;
  354. }
  355. ForEachChild(idx, pattern)
  356. if (isValidMatch(match, depth, pattern->queryChild(idx)))
  357. return true;
  358. return false;
  359. }
  360. void NlpParseContext::checkValidMatches()
  361. {
  362. ForEachItemIn(idx, matches)
  363. {
  364. if (!isValidMatch(matches.item(idx), 0, expr->queryChild(2)))
  365. {
  366. StringBuffer path;
  367. matches.item(idx).getPath(path);
  368. throwError1(HQLERR_BadMatchedPath, path.str());
  369. }
  370. }
  371. }
  372. void NlpParseContext::compileMatched(NlpAlgorithm & parser)
  373. {
  374. parser.matchInfo->setFormat(info.inputFormat());
  375. ForEachItemIn(idx, matches)
  376. {
  377. UnsignedArray ids;
  378. UnsignedArray indexValues;
  379. matches.item(idx).compileMatched(idAllocator, ids, indexValues);
  380. parser.matchInfo->addResult(ids, indexValues);
  381. }
  382. }
  383. static void getOptions(IHqlExpression * expr, INlpParseAlgorithm::MatchAction & matchAction, INlpParseAlgorithm::ScanAction & scanAction)
  384. {
  385. matchAction = INlpParseAlgorithm::NlpMatchAll;
  386. scanAction = INlpParseAlgorithm::NlpScanNext;
  387. if (expr->hasAttribute(firstAtom)) matchAction = INlpParseAlgorithm::NlpMatchFirst;
  388. if (expr->hasAttribute(allAtom)) matchAction = INlpParseAlgorithm::NlpMatchAll;
  389. if (expr->hasAttribute(noScanAtom)) scanAction = INlpParseAlgorithm::NlpScanNone;
  390. if (expr->hasAttribute(scanAtom)) scanAction = INlpParseAlgorithm::NlpScanNext;
  391. if (expr->hasAttribute(scanAllAtom)) scanAction = INlpParseAlgorithm::NlpScanAll;
  392. if (expr->hasAttribute(wholeAtom)) scanAction = INlpParseAlgorithm::NlpScanWhole;
  393. }
  394. void NlpParseContext::getDebugText(StringBuffer & s, unsigned detail)
  395. {
  396. INlpParseAlgorithm::MatchAction matchAction;
  397. INlpParseAlgorithm::ScanAction scanAction;
  398. getOptions(expr, matchAction, scanAction);
  399. s.newline().append("Options: ").append("Match(");
  400. switch (matchAction)
  401. {
  402. case INlpParseAlgorithm::NlpMatchFirst: s.append("First"); break;
  403. case INlpParseAlgorithm::NlpMatchAll: s.append("All"); break;
  404. }
  405. s.append(") Scan(");
  406. switch (scanAction)
  407. {
  408. case INlpParseAlgorithm::NlpScanWhole: s.append("Whole"); break;
  409. case INlpParseAlgorithm::NlpScanNone: s.append("None"); break;
  410. case INlpParseAlgorithm::NlpScanNext: s.append("Next"); break;
  411. case INlpParseAlgorithm::NlpScanAll: s.append("All"); break;
  412. }
  413. s.append(")").newline();
  414. s.append("Matches:").newline();
  415. ForEachItemIn(idx, matches)
  416. matches.item(idx).getDebugText(s.append("\t"), idAllocator).newline();
  417. }
  418. void NlpParseContext::setParserOptions(INlpParseAlgorithm & parser)
  419. {
  420. INlpParseAlgorithm::MatchAction matchAction;
  421. INlpParseAlgorithm::ScanAction scanAction;
  422. getOptions(expr, matchAction, scanAction);
  423. IHqlExpression * keep = expr->queryAttribute(keepAtom);
  424. unsigned keepLimit = keep ? (unsigned)keep->queryChild(0)->queryValue()->getIntValue() : 0;
  425. IHqlExpression * atmost = expr->queryAttribute(atmostAtom);
  426. unsigned atmostLimit = atmost ? (unsigned)atmost->queryChild(0)->queryValue()->getIntValue() : 0;
  427. IHqlExpression * maxLength = expr->queryAttribute(maxLengthAtom);
  428. size32_t maxLengthValue = maxLength ? (unsigned)maxLength->queryChild(0)->queryValue()->getIntValue() : DEFAULT_PATTERN_MAX_LENGTH;
  429. parser.setOptions(matchAction, scanAction, info.inputFormat(), keepLimit, atmostLimit);
  430. parser.setChoose(expr->hasAttribute(minAtom), expr->hasAttribute(maxAtom), expr->hasAttribute(bestAtom), !expr->hasAttribute(manyAtom));
  431. parser.setJoin(expr->hasAttribute(notMatchedAtom), expr->hasAttribute(notMatchedOnlyAtom));
  432. parser.setLimit(maxLengthValue);
  433. }
  434. //---------------------------------------------------------------------------
  435. void HqlCppTranslator::doBuildParseTransform(BuildCtx & classctx, IHqlExpression * expr)
  436. {
  437. BuildCtx funcctx(classctx);
  438. funcctx.addQuotedCompound("virtual size32_t transform(ARowBuilder & crSelf, const void * _left, IMatchedResults * matched, IMatchWalker * walker)");
  439. ensureRowAllocated(funcctx, "crSelf");
  440. funcctx.addQuoted("const unsigned char * left = (const unsigned char *) _left;");
  441. funcctx.associateExpr(activeNlpMarkerExpr, activeNlpMarkerExpr);
  442. bindTableCursor(funcctx, queryNlpParsePseudoTable(), queryNlpParsePseudoTable());
  443. // Bind left to "left" and right to RIGHT
  444. IHqlExpression * dataset = expr->queryChild(0);
  445. IHqlExpression * transform = expr->queryChild(4);
  446. BoundRow * selfCursor = bindSelf(funcctx, expr, "crSelf");
  447. if (transform->getOperator() == no_newtransform)
  448. bindTableCursor(funcctx, dataset, "left");
  449. else
  450. bindTableCursor(funcctx, dataset, "left", no_left, querySelSeq(expr));
  451. associateSkipReturnMarker(funcctx, queryZero(), selfCursor);
  452. doTransform(funcctx, transform, selfCursor);
  453. buildReturnRecordSize(funcctx, selfCursor);
  454. }
  455. void HqlCppTranslator::doBuildParseSearchText(BuildCtx & classctx, IHqlExpression * dataset, IHqlExpression * search, type_t searchTypeCode, ITypeInfo * transferType)
  456. {
  457. BuildCtx funcctx(classctx);
  458. if (searchTypeCode == type_unicode)
  459. {
  460. funcctx.addQuotedCompound("virtual void getSearchText(size32_t & retLen, char * & _retText, const void * _self)");
  461. funcctx.addQuoted("UChar * & retText = *(UChar * *)&_retText;"); // don't ask.
  462. }
  463. else
  464. funcctx.addQuotedCompound("virtual void getSearchText(size32_t & retLen, char * & retText, const void * _self)");
  465. funcctx.addQuoted("const unsigned char * self = (const unsigned char *) _self;");
  466. bindTableCursor(funcctx, dataset, "self");
  467. bool needToFree = true;
  468. Owned<ITypeInfo> retType;
  469. switch (searchTypeCode)
  470. {
  471. case type_unicode:
  472. retType.setown(makeUnicodeType(UNKNOWN_LENGTH, 0));
  473. break;
  474. case type_utf8:
  475. retType.setown(makeUtf8Type(UNKNOWN_LENGTH, 0));
  476. break;
  477. default:
  478. retType.setown(makeStringType(UNKNOWN_LENGTH, NULL, NULL));
  479. break;
  480. }
  481. OwnedHqlExpr castSearch = ensureExprType(search, retType);
  482. castSearch.setown(foldHqlExpression(castSearch));
  483. retType.setown(makeReferenceModifier(retType.getClear()));
  484. switch (castSearch->getOperator())
  485. {
  486. case no_select:
  487. case no_constant:
  488. //Not strictly true - could be conditional
  489. //also misses lots of cases - but I doubt anyone will ever complain...
  490. needToFree = false;
  491. break;
  492. }
  493. OwnedHqlExpr retLen = createVariable("retLen", LINK(sizetType));
  494. OwnedHqlExpr tempLen;
  495. if (transferType)
  496. tempLen.setown(funcctx.getTempDeclare(sizetType, NULL));
  497. CHqlBoundTarget target;
  498. target.length.set(tempLen ? tempLen : retLen);
  499. target.expr.setown(createVariable("retText", LINK(retType)));
  500. if (needToFree)
  501. {
  502. buildExprAssign(funcctx, target, castSearch);
  503. }
  504. else
  505. {
  506. CHqlBoundExpr bound;
  507. buildExpr(funcctx, castSearch, bound);
  508. OwnedHqlExpr len = getBoundLength(bound);
  509. funcctx.addAssign(target.length, len);
  510. funcctx.addAssign(target.expr, bound.expr);
  511. }
  512. if (tempLen)
  513. {
  514. OwnedHqlExpr source = target.getTranslatedExpr();
  515. OwnedHqlExpr transferred = createValue(no_typetransfer, LINK(transferType), LINK(source));
  516. OwnedHqlExpr length = createValue(no_charlen, LINK(sizetType), LINK(transferred));
  517. buildAssignToTemp(funcctx, retLen, length);
  518. }
  519. doBuildBoolFunction(classctx, "searchTextNeedsFree", needToFree);
  520. }
  521. void HqlCppTranslator::doBuildParseSearchText(BuildCtx & classctx, IHqlExpression * expr)
  522. {
  523. doBuildParseSearchText(classctx, expr->queryChild(0), expr->queryChild(1), nlpParse->searchType(), NULL);
  524. }
  525. void HqlCppTranslator::doBuildParseExtra(BuildCtx & classctx, IHqlExpression * expr)
  526. {
  527. StringBuffer flags;
  528. if (expr->hasAttribute(groupAtom)) flags.append("|PFgroup");
  529. if (expr->hasAttribute(parallelAtom)) flags.append("|PFparallel");
  530. if (flags.length())
  531. doBuildUnsignedFunction(classctx, "getFlags", flags.str()+1);
  532. }
  533. void HqlCppTranslator::doBuildParseValidators(BuildCtx & classctx, IHqlExpression * expr)
  534. {
  535. nlpParse->extractValidates(expr->queryChild(2));
  536. nlpParse->buildValidators(*this, classctx);
  537. }
  538. void HqlCppTranslator::doBuildParseCompiled(BuildCtx & classctx, MemoryBuffer & buffer)
  539. {
  540. if (buffer.length() > 1000000)
  541. WARNING1(HQLWRN_ParseVeryLargeDefinition, buffer.length());
  542. BuildCtx funcctx(classctx);
  543. MemoryBuffer compressed;
  544. compressToBuffer(compressed, buffer.length(), buffer.toByteArray());
  545. unsigned buffLen = compressed.length();
  546. CHqlBoundExpr bound;
  547. StringBuffer s;
  548. OwnedHqlExpr srcData = addDataLiteral((const char *)compressed.toByteArray(), buffLen);
  549. OwnedHqlExpr retData = createVariable("retData", makePointerType(makeVoidType()));
  550. funcctx.addQuotedCompound("virtual void queryCompiled(IResourceContext *ctx, size32_t & retLen, const void * & retData)");
  551. funcctx.addQuotedF("//uncompressed size = %d", buffer.length());
  552. buildExpr(funcctx, srcData, bound);
  553. funcctx.addQuoted(s.append("retLen = ").append(buffLen).append(";"));
  554. funcctx.addAssign(retData, srcData);
  555. }
  556. void HqlCppTranslator::gatherExplicitMatched(IHqlExpression * expr)
  557. {
  558. ForEachChild(idx, expr)
  559. {
  560. IHqlExpression * cur = expr->queryChild(idx);
  561. if (cur->getOperator() == no_matched)
  562. {
  563. IHqlExpression * arg = cur->queryChild(0);
  564. if (arg->getOperator() == no_all)
  565. nlpParse->addAllMatched();
  566. else
  567. nlpParse->addMatchReference(arg);
  568. }
  569. }
  570. }
  571. ABoundActivity * HqlCppTranslator::doBuildActivityParse(BuildCtx & ctx, IHqlExpression * _expr)
  572. {
  573. Owned<ABoundActivity> boundDataset = buildCachedActivity(ctx, _expr->queryChild(0));
  574. unsigned startTime = msTick();
  575. OwnedHqlExpr expr = optimizeParse(_expr);
  576. Owned<ActivityInstance> instance = new ActivityInstance(*this, ctx, TAKparse, expr, "Parse");
  577. buildActivityFramework(instance);
  578. buildInstancePrefix(instance);
  579. //This will become conditional on the flags....
  580. unsigned startPrepareTime = msTick();
  581. ITimeReporter * reporter = options.addTimingToWorkunit ? timeReporter : NULL;
  582. if (expr->hasAttribute(tomitaAtom))
  583. nlpParse = createTomitaContext(expr, code->workunit, options, reporter);
  584. else
  585. {
  586. //In 64bit the engines have enough stack space to use the stack-based regex implementation
  587. byte algorithm = __DEFINED_64BIT__ ? NLPAregexStack : NLPAregexHeap;
  588. switch (options.regexVersion)
  589. {
  590. case 1:
  591. algorithm = NLPAregexStack;
  592. break;
  593. case 2:
  594. algorithm = NLPAregexHeap;
  595. break;
  596. }
  597. IHqlExpression * algorithmHint = queryHintChild(expr, algorithmAtom, 0);
  598. if (matchesConstantString(algorithmHint, "stack", true))
  599. algorithm = NLPAregexStack;
  600. else if (matchesConstantString(algorithmHint, "heap", true))
  601. algorithm = NLPAregexHeap;
  602. nlpParse = createRegexContext(expr, code->workunit, options, reporter, algorithm);
  603. }
  604. gatherExplicitMatched(expr);
  605. doBuildParseTransform(instance->startctx, expr); // also gathers all the MATCHED() definitions.
  606. doBuildParseSearchText(instance->startctx, expr);
  607. doBuildParseValidators(instance->nestedctx, expr);
  608. doBuildParseExtra(instance->startctx, expr);
  609. DEBUG_TIMER("EclServer: Generate PARSE: Prepare", msTick()-startPrepareTime);
  610. MemoryBuffer buffer;
  611. unsigned startCompileTime = msTick();
  612. nlpParse->compileSearchPattern();
  613. nlpParse->queryParser()->serialize(buffer);
  614. if (nlpParse->isGrammarAmbiguous())
  615. WARNING1(HQLWRN_GrammarIsAmbiguous, instance->activityId);
  616. doBuildParseCompiled(instance->classctx, buffer);
  617. DEBUG_TIMER("EclServer: Generate PARSE: Compile", msTick()-startCompileTime);
  618. nlpParse->buildProductions(*this, instance->classctx, instance->startctx);
  619. #if 0
  620. StringBuffer text;
  621. getSystemTraceInfo(text, PerfMonProcMem);
  622. wu()->setDebugValue("maxMemory", text.str(), true);
  623. #endif
  624. if (options.debugNlp != 0)
  625. {
  626. BuildCtx subctx(instance->classctx);
  627. subctx.addQuoted("#if 0\nHuman readable form of the grammar");
  628. StringBuffer s;
  629. nlpParse->getDebugText(s, options.debugNlp);
  630. subctx.addQuoted(s);
  631. subctx.addQuoted("#endif");
  632. if (options.debugNlpAsHint)
  633. {
  634. StringBuffer hintText;
  635. hintText.append("<Hint type=\"activity\" id=\"").append(instance->activityId).append("\">").newline();
  636. encodeXML(s.str(), hintText, 0, s.length(), false);
  637. hintText.append("</Hint>");
  638. code->addHint(hintText.str(), ctxCallback);
  639. }
  640. }
  641. ::Release(nlpParse);
  642. nlpParse = NULL;
  643. buildInstanceSuffix(instance);
  644. buildConnectInputOutput(ctx, instance, boundDataset, 0, 0);
  645. DEBUG_TIMER("EclServer: Generate PARSE", msTick()-startTime);
  646. return instance->getBoundActivity();
  647. }
  648. //---------------------------------------------------------------------------
  649. void getCheckRange(IHqlExpression * range, unsigned & minLength, unsigned & maxLength, unsigned charSize)
  650. {
  651. minLength = 0;
  652. maxLength = PATTERN_UNLIMITED_LENGTH;
  653. switch (range->getOperator())
  654. {
  655. case no_constant:
  656. minLength = maxLength = (unsigned)range->queryValue()->getIntValue();
  657. break;
  658. case no_rangefrom:
  659. minLength = (unsigned)range->queryChild(0)->queryValue()->getIntValue();
  660. break;
  661. case no_rangeto:
  662. maxLength = (unsigned)range->queryChild(0)->queryValue()->getIntValue();
  663. break;
  664. case no_range:
  665. minLength = (unsigned)range->queryChild(0)->queryValue()->getIntValue();
  666. maxLength = (unsigned)range->queryChild(1)->queryValue()->getIntValue();
  667. break;
  668. }
  669. minLength *= charSize;
  670. if (maxLength < PATTERN_UNLIMITED_LENGTH / charSize)
  671. maxLength *= charSize;
  672. else
  673. maxLength = PATTERN_UNLIMITED_LENGTH;
  674. }
  675. void HqlCppTranslator::doBuildMatched(BuildCtx & ctx, const CHqlBoundTarget * target, IHqlExpression * expr, CHqlBoundExpr * bound)
  676. {
  677. if (!nlpParse)
  678. throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
  679. if (!ctx.queryMatchExpr(activeNlpMarkerExpr))
  680. {
  681. CHqlBoundExpr match;
  682. if (!buildExprInCorrectContext(ctx, expr, match, false))
  683. throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
  684. if (target)
  685. assign(ctx, *target, match);
  686. else
  687. bound->set(match);
  688. return;
  689. }
  690. IHqlExpression * patternExpr = queryRealChild(expr, 0);
  691. if (ctx.queryMatchExpr(activeValidateMarkerExpr))
  692. {
  693. CHqlBoundExpr match;
  694. switch (expr->getOperator())
  695. {
  696. case no_matchtext:
  697. if (!ctx.getMatchExpr(activeMatchTextExpr, match))
  698. throwError(HQLERR_MatchTextNotUnicode);
  699. if (patternExpr)
  700. throwError1(HQLERR_NoArgumentsInValidator, "MATCHTEXT");
  701. break;
  702. case no_matchunicode:
  703. if (!ctx.getMatchExpr(activeMatchUnicodeExpr, match))
  704. throwError(HQLERR_MatchUnicodeNotText);
  705. if (patternExpr)
  706. throwError1(HQLERR_NoArgumentsInValidator, "MATCHUNICODE");
  707. break;
  708. case no_matchutf8:
  709. if (!ctx.getMatchExpr(activeMatchUtf8Expr, match))
  710. throwError(HQLERR_MatchUtf8NotText);
  711. if (patternExpr)
  712. throwError1(HQLERR_NoArgumentsInValidator, "MATCHUTF8");
  713. break;
  714. default:
  715. throwError(HQLERR_MatchTextOrUnicode);
  716. }
  717. if (target)
  718. assign(ctx, *target, match);
  719. else
  720. bound->set(match);
  721. return;
  722. }
  723. unsigned matchedIndex = nlpParse->addMatchReference(patternExpr);
  724. IIdAtom * func;
  725. switch (expr->getOperator())
  726. {
  727. case no_matched: func = getMatchedId; break;
  728. case no_matchtext: func = getMatchTextId; break;
  729. case no_matchunicode: func = getMatchUnicodeId; break;
  730. case no_matchlength: func = getMatchLengthId; break;
  731. case no_matchposition: func = getMatchPositionId; break;
  732. case no_matchutf8: func = getMatchUtf8Id; break;
  733. default: UNIMPLEMENTED;
  734. }
  735. HqlExprArray args;
  736. args.append(*createQuoted("matched", makeVoidType()));
  737. args.append(*createConstant((__int64)matchedIndex));
  738. OwnedHqlExpr call = bindFunctionCall(func, args);
  739. buildExprOrAssign(ctx, target, call, bound);
  740. }
  741. IReferenceSelector * HqlCppTranslator::doBuildRowMatchRow(BuildCtx & ctx, IHqlExpression * expr, bool isNew)
  742. {
  743. if (!nlpParse)
  744. throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
  745. if (!ctx.queryMatchExpr(activeNlpMarkerExpr))
  746. throwError(HQLERR_AccessMatchAttrInChildQuery);
  747. unsigned matchedIndex = nlpParse->addMatchReference(expr->queryChild(1));
  748. HqlExprArray args;
  749. args.append(*createQuoted("matched", makeVoidType()));
  750. args.append(*createConstant((__int64)matchedIndex));
  751. OwnedHqlExpr call = bindTranslatedFunctionCall(getMatchRowId, args);
  752. IHqlExpression * record = expr->queryRecord();
  753. StringBuffer rowName;
  754. getUniqueId(rowName.append("row"));
  755. OwnedHqlExpr row = createVariable(rowName, makeConstantModifier(makeRowReferenceType(record)));
  756. ctx.addDeclare(row);
  757. ctx.addAssign(row, call);
  758. BoundRow * cursor = bindRow(ctx, expr, row);
  759. if (expr->queryChild(1))
  760. cursor->setConditional(true);
  761. return createReferenceSelector(cursor);
  762. }
  763. void HqlCppTranslator::doBuildMatchAttr(BuildCtx & ctx, const CHqlBoundTarget * target, IHqlExpression * expr, CHqlBoundExpr * bound)
  764. {
  765. if (!nlpParse)
  766. throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
  767. if (!ctx.queryMatchExpr(activeNlpMarkerExpr) && !ctx.queryMatchExpr(activeProductionMarkerExpr))
  768. {
  769. CHqlBoundExpr match;
  770. if (!buildExprInCorrectContext(ctx, expr, match, false))
  771. throwError1(HQLERR_MatchedUsedOutsideParse, getOpString(expr->getOperator()));
  772. if (target)
  773. assign(ctx, *target, match);
  774. else
  775. bound->set(match);
  776. return;
  777. }
  778. HqlExprAssociation * marker = ctx.queryMatchExpr(activeProductionMarkerExpr);
  779. ITypeInfo * exprType = expr->queryType();
  780. if (marker)
  781. {
  782. HqlExprArray args;
  783. args.append(*LINK(marker->queryExpr()));
  784. args.append(*LINK(expr->queryChild(0)));
  785. IIdAtom * name;
  786. switch (exprType->getTypeCode())
  787. {
  788. case type_string:
  789. name = getProductionTextId;
  790. break;
  791. case type_unicode:
  792. name = getProductionUnicodeId;
  793. break;
  794. case type_utf8:
  795. name = getProductionUtf8Id;
  796. break;
  797. default:
  798. throwUnexpectedType(exprType);
  799. }
  800. OwnedHqlExpr call = bindFunctionCall(name, args);
  801. doBuildCall(ctx, target, call, bound);
  802. }
  803. else
  804. {
  805. node_operator op;
  806. switch (exprType->getTypeCode())
  807. {
  808. case type_string:
  809. op = no_matchtext;
  810. break;
  811. case type_unicode:
  812. op = no_matchunicode;
  813. break;
  814. case type_utf8:
  815. op = no_matchutf8;
  816. break;
  817. default:
  818. throwUnexpectedType(exprType);
  819. }
  820. OwnedHqlExpr newExpr = createValue(op, LINK(exprType));
  821. doBuildMatched(ctx, target, expr, bound);
  822. }
  823. }
  824. IReferenceSelector * HqlCppTranslator::doBuildRowMatchAttr(BuildCtx & ctx, IHqlExpression * expr)
  825. {
  826. if (!ctx.queryMatchExpr(activeNlpMarkerExpr) && !ctx.queryMatchExpr(activeProductionMarkerExpr))
  827. throwError(HQLERR_AccessMatchAttrInChildQuery);
  828. HqlExprArray args;
  829. IIdAtom * name;
  830. HqlExprAssociation * marker = ctx.queryMatchExpr(activeProductionMarkerExpr);
  831. if (marker)
  832. {
  833. name = getProductionResultId;
  834. args.append(*LINK(marker->queryExpr()));
  835. args.append(*LINK(expr->queryChild(1)));
  836. }
  837. else
  838. {
  839. name = getRootResultId;
  840. args.append(*createQuoted("matched", makeVoidType()));
  841. }
  842. OwnedHqlExpr call = bindTranslatedFunctionCall(name, args);
  843. IHqlExpression * record = expr->queryRecord();
  844. StringBuffer rowName;
  845. getUniqueId(rowName.append("row"));
  846. OwnedITypeInfo rowType = makeConstantModifier(makeRowReferenceType(record));
  847. rowType.setown(makeAttributeModifier(LINK(rowType), getLinkCountedAttr()));
  848. OwnedHqlExpr row = createVariable(rowName, rowType.getClear());
  849. ctx.addDeclare(row);
  850. ctx.addAssign(row, call);
  851. BoundRow * cursor = bindRow(ctx, expr, row);
  852. return createReferenceSelector(cursor);
  853. }
  /*
  Some special nodes are processed as follows:

  x (a before b) y        :   x -> a -> checkNext(b) -> y
  x (a after b) y         :   x -> checkPrev(b) -> a -> y
  x (a in b) y            :   x -> check(a, b) -> y
  x (a{2,3}) y            :   x -> repeat(a) -> y
  x (a+) y                :   x -> a +-> y
                                   ^-/

  Optimization issues:
  o Need information about all named elements that are referenced by MATCHED
  o The match ids are based on the logical named expressions, so if the IHqlExpression tree is rebuilt, the matched ids need patching.
  o Don't want to re-apply the same thing twice - so delay expanding named symbols.
  */