thorparse.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653
  1. /*##############################################################################
  2. HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. ############################################################################## */
  13. #include "jliball.hpp"
  14. #include "thorparse.ipp"
  15. #include "thorregex.hpp"
  16. #include "eclrtl.hpp"
  17. #include "eclhelper.hpp"
  18. IAtom * separatorTagAtom;
  19. MODULE_INIT(INIT_PRIORITY_STANDARD)
  20. {
  21. separatorTagAtom = createAtom("<separator>");
  22. return true;
  23. }
  24. //---------------------------------------------------------------------------
  25. void deserializeBoolArray(unsigned len, bool * values, MemoryBuffer & in)
  26. {
  27. for (unsigned i = 0; i < len; i+= 8)
  28. {
  29. unsigned char next;
  30. in.read(next);
  31. unsigned max = i+8 <= len ? 8 : len - i;
  32. for (unsigned j=0; j<max; j++)
  33. values[i+j] = (next & (1 << j)) != 0;
  34. }
  35. }
  36. void serializeBoolArray(MemoryBuffer & out, unsigned len, const bool * values)
  37. {
  38. for (unsigned i = 0; i < len; i+= 8)
  39. {
  40. unsigned char next = 0;
  41. unsigned max = i+8 <= len ? 8 : len - i;
  42. for (unsigned j=0; j<max; j++)
  43. if (values[i+j]) next |= (1 << j);
  44. out.append(next);
  45. }
  46. }
  47. //---------------------------------------------------------------------------
  48. NlpState::NlpState(INlpMatchedAction * _action, NlpInputFormat _inputFormat, size32_t len, const void * text)
  49. {
  50. matchAction = _action;
  51. inputFormat = _inputFormat;
  52. charSize = (inputFormat == NlpUnicode) ? sizeof(UChar) : sizeof(char);
  53. start = (const byte *)text;
  54. cur = start;
  55. end = start + len;
  56. curMatch = &top;
  57. next = &top.firstChild;
  58. top.start = start;
  59. top.parent = NULL;
  60. }
  61. void NlpState::pushMatch(MatchState & match, MatchSaveState & save)
  62. {
  63. save.savedNext = next;
  64. save.savedMatch = curMatch;
  65. match.parent = curMatch;
  66. match.start = cur;
  67. *next = &match;
  68. next = &match.firstChild;
  69. curMatch = &match;
  70. };
  71. void NlpState::popMatch(const MatchSaveState & save)
  72. {
  73. next = save.savedNext;
  74. *next = NULL;
  75. curMatch = save.savedMatch;
  76. }
  77. void NlpState::markFinish(MatchSaveState & save)
  78. {
  79. save.savedMatch = curMatch;
  80. save.savedNext = next;
  81. curMatch->end = cur;
  82. next = &curMatch->next;
  83. curMatch = curMatch->parent;
  84. }
  85. void NlpState::unmarkFinish(const MatchSaveState & save)
  86. {
  87. next = save.savedNext;
  88. curMatch = save.savedMatch;
  89. }
  90. //---------------------------------------------------------------------------
  91. NlpMatchPath::NlpMatchPath(const UnsignedArray & _ids, const UnsignedArray & _indices)
  92. {
  93. assert(_ids.ordinality() == _indices.ordinality());
  94. ForEachItemIn(idx, _ids)
  95. {
  96. ids.append(_ids.item(idx));
  97. indices.append(_indices.item(idx));
  98. }
  99. }
  100. NlpMatchPath::NlpMatchPath(MemoryBuffer & in)
  101. {
  102. unsigned num;
  103. in.read(num);
  104. for (unsigned idx = 0; idx < num; idx++)
  105. {
  106. unsigned index, id;
  107. in.read(id);
  108. in.read(index);
  109. ids.append(id);
  110. indices.append(index);
  111. }
  112. }
  113. NlpMatchPath::~NlpMatchPath()
  114. {
  115. }
  116. void NlpMatchPath::serialize(MemoryBuffer & out) const
  117. {
  118. unsigned num = ids.ordinality();
  119. out.append(num);
  120. for (unsigned idx = 0; idx < num; idx++)
  121. {
  122. out.append(ids.item(idx));
  123. out.append(indices.item(idx));
  124. }
  125. }
  126. //---------------------------------------------------------------------------
  127. CMatchedResultInfo::CMatchedResultInfo()
  128. {
  129. inputFormat = NlpAscii;
  130. }
  131. void CMatchedResultInfo::deserialize(MemoryBuffer & in)
  132. {
  133. unsigned num;
  134. in.read(inputFormat);
  135. in.read(num);
  136. for (unsigned idx = 0; idx < num; idx++)
  137. {
  138. NlpMatchPath & cur = *createMatchPath(in);
  139. matchResults.append(cur);
  140. }
  141. }
  142. void CMatchedResultInfo::serialize(MemoryBuffer & out) const
  143. {
  144. unsigned num = matchResults.ordinality();
  145. out.append(inputFormat);
  146. out.append(num);
  147. for (unsigned idx = 0; idx < num; idx++)
  148. matchResults.item(idx).serialize(out);
  149. }
  150. //---------------------------------------------------------------------------
  151. CMatchedResults::CMatchedResults(CMatchedResultInfo * _def)
  152. {
  153. in = NULL;
  154. def = _def;
  155. unsigned num = def->matchResults.ordinality();
  156. matched = new IMatchedElement *[num];
  157. for (unsigned i=0; i<num; i++)
  158. matched[i] = NULL;
  159. }
  160. CMatchedResults::~CMatchedResults()
  161. {
  162. kill();
  163. }
  164. bool CMatchedResults::getMatched(unsigned idx)
  165. {
  166. return matched[idx] != &notMatched;
  167. }
  168. size32_t CMatchedResults::getMatchLength(unsigned idx)
  169. {
  170. const IMatchedElement * cur = matched[idx];
  171. const byte * start = cur->queryStartPtr();
  172. size32_t size = (size32_t)(cur->queryEndPtr() - start);
  173. size32_t len;
  174. switch (def->inputFormat)
  175. {
  176. case NlpAscii:
  177. len = size;
  178. break;
  179. case NlpUtf8:
  180. len = rtlUtf8Length(size, start);
  181. break;
  182. case NlpUnicode:
  183. len = size / sizeof(UChar);
  184. break;
  185. default:
  186. throwUnexpected();
  187. }
  188. return len;
  189. }
  190. size32_t CMatchedResults::getMatchPosition(unsigned idx)
  191. {
  192. IMatchedElement * cur = matched[idx];
  193. if (cur == &notMatched)
  194. return 0;
  195. size32_t pos = (size32_t)(cur->queryStartPtr() - in);
  196. switch (def->inputFormat)
  197. {
  198. case NlpUtf8:
  199. pos = rtlUtf8Length(pos, in);
  200. break;
  201. case NlpUnicode:
  202. pos = pos / sizeof(UChar);
  203. break;
  204. }
  205. return pos+1;
  206. }
  207. void CMatchedResults::getMatchText(size32_t & outlen, char * & out, unsigned idx)
  208. {
  209. const IMatchedElement * cur = matched[idx];
  210. const byte * start = cur->queryStartPtr();
  211. size32_t size = (size32_t)(cur->queryEndPtr() - start);
  212. switch (def->inputFormat)
  213. {
  214. case NlpAscii:
  215. rtlStrToStrX(outlen, out, size, start);
  216. break;
  217. case NlpUtf8:
  218. {
  219. //could use codepage2codepage if worried about efficiency...
  220. unsigned len = rtlUtf8Length(size, start);
  221. rtlUtf8ToStrX(outlen, out, len, (const char *)start);
  222. break;
  223. }
  224. case NlpUnicode:
  225. rtlUnicodeToStrX(outlen, out, size/sizeof(UChar), (const UChar *)start);
  226. break;
  227. }
  228. }
  229. void CMatchedResults::getMatchUnicode(size32_t & outlen, UChar * & out, unsigned idx)
  230. {
  231. const IMatchedElement * cur = matched[idx];
  232. const byte * start = cur->queryStartPtr();
  233. size32_t size = (size32_t)(cur->queryEndPtr() - start);
  234. switch (def->inputFormat)
  235. {
  236. case NlpAscii:
  237. rtlStrToUnicodeX(outlen, out, size, (const char *)start);
  238. break;
  239. case NlpUtf8:
  240. {
  241. //could use codepage2codepage if worried about efficiency...
  242. unsigned len = rtlUtf8Length(size, start);
  243. rtlUtf8ToUnicodeX(outlen, out, len, (const char *)start);
  244. break;
  245. }
  246. break;
  247. case NlpUnicode:
  248. rtlUnicodeToUnicodeX(outlen, out, size/sizeof(UChar), (const UChar*)start);
  249. break;
  250. }
  251. }
  252. void CMatchedResults::getMatchUtf8(size32_t & outlen, char * & out, unsigned idx)
  253. {
  254. const IMatchedElement * cur = matched[idx];
  255. const byte * start = cur->queryStartPtr();
  256. size32_t size = (size32_t)(cur->queryEndPtr() - start);
  257. switch (def->inputFormat)
  258. {
  259. case NlpAscii:
  260. rtlStrToUtf8X(outlen, out, size, (const char *)start);
  261. break;
  262. case NlpUtf8:
  263. {
  264. //could use codepage2codepage if worried about efficiency...
  265. unsigned len = rtlUtf8Length(size, start);
  266. rtlUtf8ToUtf8X(outlen, out, len, (const char *)start);
  267. break;
  268. }
  269. case NlpUnicode:
  270. rtlUnicodeToUtf8X(outlen, out, size/sizeof(UChar), (const UChar*)start);
  271. break;
  272. }
  273. }
  274. byte * CMatchedResults::queryMatchRow(unsigned idx)
  275. {
  276. const IMatchedElement * cur = matched[idx];
  277. return (byte *)cur->queryRow();
  278. }
  279. //MORE: Allow access to attributes at any location on the tree.
  280. byte * CMatchedResults::queryRootResult()
  281. {
  282. return (byte *)rootResult;
  283. }
  284. void CMatchedResults::kill()
  285. {
  286. if (matched)
  287. {
  288. unsigned num = def->matchResults.ordinality();
  289. for (unsigned i=0; i < num; i++)
  290. ::Release(matched[i]);
  291. delete [] matched;
  292. matched = NULL;
  293. }
  294. }
  295. //---------------------------------------------------------------------------
  296. IAtom * NlpMatchWalker::queryName()
  297. {
  298. return curMatch->queryName();
  299. }
  300. size32_t NlpMatchWalker::queryMatchSize()
  301. {
  302. return (size32_t)(curMatch->end - curMatch->start);
  303. }
  304. const void * NlpMatchWalker::queryMatchStart()
  305. {
  306. return curMatch->start;
  307. }
  308. unsigned NlpMatchWalker::numChildren()
  309. {
  310. unsigned count = 0;
  311. MatchState * cur = curMatch->firstChild;
  312. while (cur)
  313. {
  314. count++;
  315. cur = cur->next;
  316. }
  317. return count;
  318. }
  319. IMatchWalker * NlpMatchWalker::getChild(unsigned numToSkip)
  320. {
  321. MatchState * cur = curMatch->firstChild;
  322. while (cur && numToSkip)
  323. {
  324. numToSkip--;
  325. cur = cur->next;
  326. }
  327. if (cur)
  328. return new NlpMatchWalker(cur);
  329. return NULL;
  330. }
  331. //------------------------------------------------------
  332. static bool hasChildren(IMatchWalker * walker)
  333. {
  334. for (unsigned i=0;;i++)
  335. {
  336. Owned<IMatchWalker> child = walker->getChild(i);
  337. if (!child)
  338. return false;
  339. if (child->queryName() != separatorTagAtom)
  340. return true;
  341. }
  342. }
  343. static StringBuffer & getElementText(StringBuffer & s, IMatchWalker * walker)
  344. {
  345. unsigned len = walker->queryMatchSize();
  346. const char * text = (const char *)walker->queryMatchStart();
  347. return s.append(len, text);
  348. }
  349. static void expandElementText(StringBuffer & s, IMatchWalker * walker)
  350. {
  351. getElementText(s.append('"'), walker).append('"');
  352. }
  353. static void getDefaultParseTree(StringBuffer & s, IMatchWalker * cur)
  354. {
  355. IAtom * name = cur->queryName();
  356. if (name != separatorTagAtom)
  357. {
  358. if (name)
  359. {
  360. StringBuffer lowerName;
  361. lowerName.append(name).toLowerCase();
  362. s.append(lowerName);
  363. }
  364. if (hasChildren(cur))
  365. {
  366. s.append("[");
  367. for (unsigned i=0;;i++)
  368. {
  369. Owned<IMatchWalker> child = cur->getChild(i);
  370. if (!child)
  371. break;
  372. getDefaultParseTree(s, child);
  373. s.append(" ");
  374. }
  375. s.setLength(s.length()-1);
  376. s.append("]");
  377. }
  378. else
  379. expandElementText(s, cur);
  380. }
  381. }
  382. void getDefaultParseTree(IMatchWalker * walker, unsigned & len, char * & text)
  383. {
  384. StringBuffer s;
  385. getDefaultParseTree(s, walker);
  386. len = s.length();
  387. text = s.detach();
  388. }
  389. static void getXmlParseTree(StringBuffer & s, IMatchWalker * walker, unsigned indent)
  390. {
  391. IAtom * name = walker->queryName();
  392. if (name != separatorTagAtom)
  393. {
  394. unsigned max = walker->numChildren();
  395. if (!name)
  396. {
  397. if (hasChildren(walker))
  398. {
  399. for (unsigned i=0; i<max; i++)
  400. {
  401. Owned<IMatchWalker> child = walker->getChild(i);
  402. getXmlParseTree(s, child, indent);
  403. }
  404. }
  405. else
  406. getElementText(s, walker);
  407. }
  408. else
  409. {
  410. StringBuffer lowerName;
  411. lowerName.append(name).toLowerCase();
  412. s.pad(indent).append('<').append(lowerName).append('>');
  413. if (hasChildren(walker))
  414. {
  415. s.newline();
  416. for (unsigned i=0; i<max; i++)
  417. {
  418. Owned<IMatchWalker> child = walker->getChild(i);
  419. getXmlParseTree(s, child, indent+1);
  420. }
  421. s.pad(indent);
  422. }
  423. else
  424. getElementText(s, walker);
  425. s.append("</").append(lowerName).append('>').newline();
  426. }
  427. }
  428. }
  429. void getXmlParseTree(IMatchWalker * walker, unsigned & len, char * & text)
  430. {
  431. StringBuffer s;
  432. getXmlParseTree(s, walker, 0);
  433. len = s.length();
  434. text = s.detach();
  435. }
  436. //------------------------------------------------------
  437. unsigned getMaximumMatchLength(AsciiDfa & dfa, unsigned len, const byte * start)
  438. {
  439. const byte * cur = start;
  440. const byte * end = start+len;
  441. unsigned activeState = 0;
  442. const AsciiDfaState * states = dfa.queryStates();
  443. unsigned * transitions = dfa.queryTransitions();
  444. const byte * best = NULL;
  445. for (;;)
  446. {
  447. if (states[activeState].accepts())
  448. best = cur;
  449. if (cur == end)
  450. break;
  451. byte next = *cur++;
  452. if (next < states[activeState].min)
  453. break;
  454. if (next > states[activeState].max)
  455. break;
  456. activeState = transitions[states[activeState].delta + next];
  457. if (activeState == NotFound)
  458. break;
  459. }
  460. if (best)
  461. return (size32_t)(best-start);
  462. return NotFound;
  463. }
  464. //------------------------------------------------------
  465. NlpAlgorithm::NlpAlgorithm(CMatchedResultInfo * _matched)
  466. {
  467. matchInfo = _matched;
  468. addedSeparators = false;
  469. notMatched = false;
  470. notMatchedOnly = false;
  471. chooseMin = false;
  472. chooseMax = false;
  473. chooseBest = false;
  474. singleChoicePerLine = false;
  475. inputFormat = NlpAscii;
  476. keepLimit = UINT_MAX;
  477. atMostLimit = UINT_MAX;
  478. charWidth = sizeof(char);
  479. }
  480. NlpAlgorithm::~NlpAlgorithm()
  481. {
  482. ::Release(matchInfo);
  483. }
  484. void NlpAlgorithm::setOptions(MatchAction _matchAction, ScanAction _scanAction, NlpInputFormat _inputFormat, unsigned _keepLimit, unsigned _atMostLimit)
  485. {
  486. matchAction = _matchAction;
  487. scanAction = _scanAction;
  488. inputFormat = _inputFormat;
  489. keepLimit = _keepLimit ? _keepLimit : UINT_MAX;
  490. atMostLimit = _atMostLimit ? _atMostLimit : UINT_MAX;
  491. charWidth = (inputFormat == NlpUnicode) ? sizeof(UChar) : sizeof(char);
  492. }
  493. void NlpAlgorithm::setChoose(bool _chooseMin, bool _chooseMax, bool _chooseBest, bool _singleChoicePerLine)
  494. {
  495. chooseMin = _chooseMin;
  496. chooseMax = _chooseMax;
  497. chooseBest = _chooseBest;
  498. singleChoicePerLine = _singleChoicePerLine;
  499. }
  500. void NlpAlgorithm::setJoin(bool _notMatched, bool _notMatchedOnly)
  501. {
  502. notMatched = _notMatched;
  503. notMatchedOnly = _notMatchedOnly;
  504. }
  505. void NlpAlgorithm::setLimit(unsigned _maxLength)
  506. {
  507. maxLength = _maxLength;
  508. }
  509. void NlpAlgorithm::serialize(MemoryBuffer & out)
  510. {
  511. out.append((unsigned)matchAction);
  512. out.append((unsigned)scanAction);
  513. out.append((byte)inputFormat);
  514. out.append(keepLimit);
  515. out.append(atMostLimit);
  516. out.append(charWidth);
  517. out.append(addedSeparators);
  518. out.append(notMatched);
  519. out.append(notMatchedOnly);
  520. out.append(chooseMin);
  521. out.append(chooseMax);
  522. out.append(chooseBest);
  523. out.append(singleChoicePerLine);
  524. out.append(maxLength);
  525. matchInfo->serialize(out);
  526. }
  527. void NlpAlgorithm::deserialize(MemoryBuffer & in)
  528. {
  529. unsigned temp;
  530. byte tempByte;
  531. in.read(temp); matchAction = (MatchAction)temp;
  532. in.read(temp); scanAction = (ScanAction)temp;
  533. in.read(tempByte); inputFormat = (NlpInputFormat)tempByte;
  534. in.read(keepLimit);
  535. in.read(atMostLimit);
  536. in.read(charWidth);
  537. in.read(addedSeparators);
  538. in.read(notMatched);
  539. in.read(notMatchedOnly);
  540. in.read(chooseMin);
  541. in.read(chooseMax);
  542. in.read(chooseBest);
  543. in.read(singleChoicePerLine);
  544. in.read(maxLength);
  545. matchInfo->deserialize(in);
  546. }
  547. INlpParseAlgorithm * createThorParser(MemoryBuffer & buffer, IOutputMetaData * outRecordSize)
  548. {
  549. byte kind;
  550. buffer.read(kind);
  551. switch (kind)
  552. {
  553. case NLPAregexStack:
  554. case NLPAregexHeap:
  555. return createRegexParser(buffer, outRecordSize, kind);
  556. case NLPAtomita:
  557. return createTomitaParser(buffer, outRecordSize);
  558. default:
  559. UNIMPLEMENTED;
  560. }
  561. }
  562. INlpParseAlgorithm * createThorParser(IResourceContext *ctx, IHThorParseArg & helper)
  563. {
  564. unsigned len;
  565. const void * data;
  566. helper.queryCompiled(ctx, len, data);
  567. MemoryBuffer compressed, buffer;
  568. compressed.setBuffer(len, (void *)data, false);
  569. decompressToBuffer(buffer, compressed);
  570. INlpParseAlgorithm * algorithm = createThorParser(buffer, helper.queryOutputMeta());
  571. algorithm->init(helper);
  572. return algorithm;
  573. }