thorparse.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652
  1. /*##############################################################################
  2. Copyright (C) 2011 HPCC Systems.
  3. All rights reserved. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU Affero General Public License as
  5. published by the Free Software Foundation, either version 3 of the
  6. License, or (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU Affero General Public License for more details.
  11. You should have received a copy of the GNU Affero General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>.
  13. ############################################################################## */
  14. #include "jliball.hpp"
  15. #include "thorparse.ipp"
  16. #include "thorregex.hpp"
  17. #include "eclrtl.hpp"
  18. #include "eclhelper.hpp"
  19. _ATOM separatorTagAtom;
  20. MODULE_INIT(INIT_PRIORITY_STANDARD)
  21. {
  22. separatorTagAtom = createAtom("<separator>");
  23. return true;
  24. }
  25. //---------------------------------------------------------------------------
  26. void deserializeBoolArray(unsigned len, bool * values, MemoryBuffer & in)
  27. {
  28. for (unsigned i = 0; i < len; i+= 8)
  29. {
  30. unsigned char next;
  31. in.read(next);
  32. unsigned max = i+8 <= len ? 8 : len - i;
  33. for (unsigned j=0; j<max; j++)
  34. values[i+j] = (next & (1 << j)) != 0;
  35. }
  36. }
  37. void serializeBoolArray(MemoryBuffer & out, unsigned len, const bool * values)
  38. {
  39. for (unsigned i = 0; i < len; i+= 8)
  40. {
  41. unsigned char next = 0;
  42. unsigned max = i+8 <= len ? 8 : len - i;
  43. for (unsigned j=0; j<max; j++)
  44. if (values[i+j]) next |= (1 << j);
  45. out.append(next);
  46. }
  47. }
  48. //---------------------------------------------------------------------------
  49. NlpState::NlpState(INlpMatchedAction * _action, NlpInputFormat _inputFormat, size32_t len, const void * text)
  50. {
  51. matchAction = _action;
  52. inputFormat = _inputFormat;
  53. charSize = (inputFormat == NlpUnicode) ? sizeof(UChar) : sizeof(char);
  54. start = (const byte *)text;
  55. cur = start;
  56. end = start + len;
  57. curMatch = &top;
  58. next = &top.firstChild;
  59. top.start = start;
  60. top.parent = NULL;
  61. }
  62. void NlpState::pushMatch(MatchState & match, MatchSaveState & save)
  63. {
  64. save.savedNext = next;
  65. save.savedMatch = curMatch;
  66. match.parent = curMatch;
  67. match.start = cur;
  68. *next = &match;
  69. next = &match.firstChild;
  70. curMatch = &match;
  71. };
  72. void NlpState::popMatch(const MatchSaveState & save)
  73. {
  74. next = save.savedNext;
  75. *next = NULL;
  76. curMatch = save.savedMatch;
  77. }
  78. void NlpState::markFinish(MatchSaveState & save)
  79. {
  80. save.savedMatch = curMatch;
  81. save.savedNext = next;
  82. curMatch->end = cur;
  83. next = &curMatch->next;
  84. curMatch = curMatch->parent;
  85. }
  86. void NlpState::unmarkFinish(const MatchSaveState & save)
  87. {
  88. next = save.savedNext;
  89. curMatch = save.savedMatch;
  90. }
  91. //---------------------------------------------------------------------------
  92. NlpMatchPath::NlpMatchPath(const UnsignedArray & _ids, const UnsignedArray & _indices)
  93. {
  94. assert(_ids.ordinality() == _indices.ordinality());
  95. ForEachItemIn(idx, _ids)
  96. {
  97. ids.append(_ids.item(idx));
  98. indices.append(_indices.item(idx));
  99. }
  100. }
  101. NlpMatchPath::NlpMatchPath(MemoryBuffer & in)
  102. {
  103. unsigned num;
  104. in.read(num);
  105. for (unsigned idx = 0; idx < num; idx++)
  106. {
  107. unsigned index, id;
  108. in.read(id);
  109. in.read(index);
  110. ids.append(id);
  111. indices.append(index);
  112. }
  113. }
  114. NlpMatchPath::~NlpMatchPath()
  115. {
  116. }
  117. void NlpMatchPath::serialize(MemoryBuffer & out) const
  118. {
  119. unsigned num = ids.ordinality();
  120. out.append(num);
  121. for (unsigned idx = 0; idx < num; idx++)
  122. {
  123. out.append(ids.item(idx));
  124. out.append(indices.item(idx));
  125. }
  126. }
  127. //---------------------------------------------------------------------------
  128. CMatchedResultInfo::CMatchedResultInfo()
  129. {
  130. inputFormat = NlpAscii;
  131. }
  132. void CMatchedResultInfo::deserialize(MemoryBuffer & in)
  133. {
  134. unsigned num;
  135. in.read(inputFormat);
  136. in.read(num);
  137. for (unsigned idx = 0; idx < num; idx++)
  138. {
  139. NlpMatchPath & cur = *createMatchPath(in);
  140. matchResults.append(cur);
  141. }
  142. }
  143. void CMatchedResultInfo::serialize(MemoryBuffer & out) const
  144. {
  145. unsigned num = matchResults.ordinality();
  146. out.append(inputFormat);
  147. out.append(num);
  148. for (unsigned idx = 0; idx < num; idx++)
  149. matchResults.item(idx).serialize(out);
  150. }
  151. //---------------------------------------------------------------------------
  152. CMatchedResults::CMatchedResults(CMatchedResultInfo * _def)
  153. {
  154. in = NULL;
  155. def = _def;
  156. unsigned num = def->matchResults.ordinality();
  157. matched = new IMatchedElement *[num];
  158. for (unsigned i=0; i<num; i++)
  159. matched[i] = NULL;
  160. }
  161. CMatchedResults::~CMatchedResults()
  162. {
  163. kill();
  164. }
  165. bool CMatchedResults::getMatched(unsigned idx)
  166. {
  167. return matched[idx] != &notMatched;
  168. }
  169. size32_t CMatchedResults::getMatchLength(unsigned idx)
  170. {
  171. const IMatchedElement * cur = matched[idx];
  172. const byte * start = cur->queryStartPtr();
  173. size32_t size = (size32_t)(cur->queryEndPtr() - start);
  174. size32_t len;
  175. switch (def->inputFormat)
  176. {
  177. case NlpAscii:
  178. len = size;
  179. break;
  180. case NlpUtf8:
  181. len = rtlUtf8Length(size, start);
  182. break;
  183. case NlpUnicode:
  184. len = size / sizeof(UChar);
  185. break;
  186. }
  187. return len;
  188. }
  189. size32_t CMatchedResults::getMatchPosition(unsigned idx)
  190. {
  191. IMatchedElement * cur = matched[idx];
  192. if (cur == &notMatched)
  193. return 0;
  194. size32_t pos = (size32_t)(cur->queryStartPtr() - in);
  195. switch (def->inputFormat)
  196. {
  197. case NlpUtf8:
  198. pos = rtlUtf8Length(pos, in);
  199. break;
  200. case NlpUnicode:
  201. pos = pos / sizeof(UChar);
  202. break;
  203. }
  204. return pos+1;
  205. }
  206. void CMatchedResults::getMatchText(size32_t & outlen, char * & out, unsigned idx)
  207. {
  208. const IMatchedElement * cur = matched[idx];
  209. const byte * start = cur->queryStartPtr();
  210. size32_t size = (size32_t)(cur->queryEndPtr() - start);
  211. switch (def->inputFormat)
  212. {
  213. case NlpAscii:
  214. rtlStrToStrX(outlen, out, size, start);
  215. break;
  216. case NlpUtf8:
  217. {
  218. //could use codepage2codepage if worried about efficiency...
  219. unsigned len = rtlUtf8Length(size, start);
  220. rtlUtf8ToStrX(outlen, out, len, (const char *)start);
  221. break;
  222. }
  223. case NlpUnicode:
  224. rtlUnicodeToStrX(outlen, out, size/sizeof(UChar), (const UChar *)start);
  225. break;
  226. }
  227. }
  228. void CMatchedResults::getMatchUnicode(size32_t & outlen, UChar * & out, unsigned idx)
  229. {
  230. const IMatchedElement * cur = matched[idx];
  231. const byte * start = cur->queryStartPtr();
  232. size32_t size = (size32_t)(cur->queryEndPtr() - start);
  233. switch (def->inputFormat)
  234. {
  235. case NlpAscii:
  236. rtlStrToUnicodeX(outlen, out, size, (const char *)start);
  237. break;
  238. case NlpUtf8:
  239. {
  240. //could use codepage2codepage if worried about efficiency...
  241. unsigned len = rtlUtf8Length(size, start);
  242. rtlUtf8ToUnicodeX(outlen, out, len, (const char *)start);
  243. break;
  244. }
  245. break;
  246. case NlpUnicode:
  247. rtlUnicodeToUnicodeX(outlen, out, size/sizeof(UChar), (const UChar*)start);
  248. break;
  249. }
  250. }
  251. void CMatchedResults::getMatchUtf8(size32_t & outlen, char * & out, unsigned idx)
  252. {
  253. const IMatchedElement * cur = matched[idx];
  254. const byte * start = cur->queryStartPtr();
  255. size32_t size = (size32_t)(cur->queryEndPtr() - start);
  256. switch (def->inputFormat)
  257. {
  258. case NlpAscii:
  259. rtlStrToUtf8X(outlen, out, size, (const char *)start);
  260. break;
  261. case NlpUtf8:
  262. {
  263. //could use codepage2codepage if worried about efficiency...
  264. unsigned len = rtlUtf8Length(size, start);
  265. rtlUtf8ToUtf8X(outlen, out, len, (const char *)start);
  266. break;
  267. }
  268. case NlpUnicode:
  269. rtlUnicodeToUtf8X(outlen, out, size/sizeof(UChar), (const UChar*)start);
  270. break;
  271. }
  272. }
  273. byte * CMatchedResults::queryMatchRow(unsigned idx)
  274. {
  275. const IMatchedElement * cur = matched[idx];
  276. return (byte *)cur->queryRow();
  277. }
  278. //MORE: Allow access to attributes at any location on the tree.
  279. byte * CMatchedResults::queryRootResult()
  280. {
  281. return (byte *)rootResult;
  282. }
  283. void CMatchedResults::kill()
  284. {
  285. if (matched)
  286. {
  287. unsigned num = def->matchResults.ordinality();
  288. for (unsigned i=0; i < num; i++)
  289. ::Release(matched[i]);
  290. delete [] matched;
  291. matched = NULL;
  292. }
  293. }
  294. //---------------------------------------------------------------------------
  295. _ATOM NlpMatchWalker::queryName()
  296. {
  297. return curMatch->queryName();
  298. }
  299. size32_t NlpMatchWalker::queryMatchSize()
  300. {
  301. return (size32_t)(curMatch->end - curMatch->start);
  302. }
  303. const void * NlpMatchWalker::queryMatchStart()
  304. {
  305. return curMatch->start;
  306. }
  307. unsigned NlpMatchWalker::numChildren()
  308. {
  309. unsigned count = 0;
  310. MatchState * cur = curMatch->firstChild;
  311. while (cur)
  312. {
  313. count++;
  314. cur = cur->next;
  315. }
  316. return count;
  317. }
  318. IMatchWalker * NlpMatchWalker::getChild(unsigned numToSkip)
  319. {
  320. MatchState * cur = curMatch->firstChild;
  321. while (cur && numToSkip)
  322. {
  323. numToSkip--;
  324. cur = cur->next;
  325. }
  326. if (cur)
  327. return new NlpMatchWalker(cur);
  328. return NULL;
  329. }
  330. //------------------------------------------------------
  331. static bool hasChildren(IMatchWalker * walker)
  332. {
  333. for (unsigned i=0;;i++)
  334. {
  335. Owned<IMatchWalker> child = walker->getChild(i);
  336. if (!child)
  337. return false;
  338. if (child->queryName() != separatorTagAtom)
  339. return true;
  340. }
  341. }
  342. static StringBuffer & getElementText(StringBuffer & s, IMatchWalker * walker)
  343. {
  344. unsigned len = walker->queryMatchSize();
  345. const char * text = (const char *)walker->queryMatchStart();
  346. return s.append(len, text);
  347. }
  348. static void expandElementText(StringBuffer & s, IMatchWalker * walker)
  349. {
  350. getElementText(s.append('"'), walker).append('"');
  351. }
  352. static void getDefaultParseTree(StringBuffer & s, IMatchWalker * cur)
  353. {
  354. _ATOM name = cur->queryName();
  355. if (name != separatorTagAtom)
  356. {
  357. if (name)
  358. {
  359. StringBuffer lowerName;
  360. lowerName.append(name).toLowerCase();
  361. s.append(lowerName);
  362. }
  363. if (hasChildren(cur))
  364. {
  365. s.append("[");
  366. for (unsigned i=0;;i++)
  367. {
  368. Owned<IMatchWalker> child = cur->getChild(i);
  369. if (!child)
  370. break;
  371. getDefaultParseTree(s, child);
  372. s.append(" ");
  373. }
  374. s.setLength(s.length()-1);
  375. s.append("]");
  376. }
  377. else
  378. expandElementText(s, cur);
  379. }
  380. }
  381. void getDefaultParseTree(IMatchWalker * walker, unsigned & len, char * & text)
  382. {
  383. StringBuffer s;
  384. getDefaultParseTree(s, walker);
  385. len = s.length();
  386. text = s.detach();
  387. }
  388. static void getXmlParseTree(StringBuffer & s, IMatchWalker * walker, unsigned indent)
  389. {
  390. _ATOM name = walker->queryName();
  391. if (name != separatorTagAtom)
  392. {
  393. unsigned max = walker->numChildren();
  394. if (!name)
  395. {
  396. if (hasChildren(walker))
  397. {
  398. for (unsigned i=0; i<max; i++)
  399. {
  400. Owned<IMatchWalker> child = walker->getChild(i);
  401. getXmlParseTree(s, child, indent);
  402. }
  403. }
  404. else
  405. getElementText(s, walker);
  406. }
  407. else
  408. {
  409. StringBuffer lowerName;
  410. lowerName.append(name).toLowerCase();
  411. s.pad(indent).append('<').append(lowerName).append('>');
  412. if (hasChildren(walker))
  413. {
  414. s.newline();
  415. for (unsigned i=0; i<max; i++)
  416. {
  417. Owned<IMatchWalker> child = walker->getChild(i);
  418. getXmlParseTree(s, child, indent+1);
  419. }
  420. s.pad(indent);
  421. }
  422. else
  423. getElementText(s, walker);
  424. s.append("</").append(lowerName).append('>').newline();
  425. }
  426. }
  427. }
  428. void getXmlParseTree(IMatchWalker * walker, unsigned & len, char * & text)
  429. {
  430. StringBuffer s;
  431. getXmlParseTree(s, walker, 0);
  432. len = s.length();
  433. text = s.detach();
  434. }
  435. //------------------------------------------------------
  436. unsigned getMaximumMatchLength(AsciiDfa & dfa, unsigned len, const byte * start)
  437. {
  438. const byte * cur = start;
  439. const byte * end = start+len;
  440. unsigned activeState = 0;
  441. const AsciiDfaState * states = dfa.queryStates();
  442. unsigned * transitions = dfa.queryTransitions();
  443. const byte * best = NULL;
  444. loop
  445. {
  446. if (states[activeState].accepts())
  447. best = cur;
  448. if (cur == end)
  449. break;
  450. byte next = *cur++;
  451. if (next < states[activeState].min)
  452. break;
  453. if (next > states[activeState].max)
  454. break;
  455. activeState = transitions[states[activeState].delta + next];
  456. if (activeState == NotFound)
  457. break;
  458. }
  459. if (best)
  460. return (size32_t)(best-start);
  461. return NotFound;
  462. }
  463. //------------------------------------------------------
  464. NlpAlgorithm::NlpAlgorithm(CMatchedResultInfo * _matched)
  465. {
  466. matchInfo = _matched;
  467. addedSeparators = false;
  468. notMatched = false;
  469. notMatchedOnly = false;
  470. chooseMin = false;
  471. chooseMax = false;
  472. chooseBest = false;
  473. singleChoicePerLine = false;
  474. inputFormat = NlpAscii;
  475. keepLimit = UINT_MAX;
  476. atMostLimit = UINT_MAX;
  477. charWidth = sizeof(char);
  478. }
  479. NlpAlgorithm::~NlpAlgorithm()
  480. {
  481. ::Release(matchInfo);
  482. }
  483. void NlpAlgorithm::setOptions(MatchAction _matchAction, ScanAction _scanAction, NlpInputFormat _inputFormat, unsigned _keepLimit, unsigned _atMostLimit)
  484. {
  485. matchAction = _matchAction;
  486. scanAction = _scanAction;
  487. inputFormat = _inputFormat;
  488. keepLimit = _keepLimit ? _keepLimit : UINT_MAX;
  489. atMostLimit = _atMostLimit ? _atMostLimit : UINT_MAX;
  490. charWidth = (inputFormat == NlpUnicode) ? sizeof(UChar) : sizeof(char);
  491. }
  492. void NlpAlgorithm::setChoose(bool _chooseMin, bool _chooseMax, bool _chooseBest, bool _singleChoicePerLine)
  493. {
  494. chooseMin = _chooseMin;
  495. chooseMax = _chooseMax;
  496. chooseBest = _chooseBest;
  497. singleChoicePerLine = _singleChoicePerLine;
  498. }
  499. void NlpAlgorithm::setJoin(bool _notMatched, bool _notMatchedOnly)
  500. {
  501. notMatched = _notMatched;
  502. notMatchedOnly = _notMatchedOnly;
  503. }
  504. void NlpAlgorithm::setLimit(unsigned _maxLength)
  505. {
  506. maxLength = _maxLength;
  507. }
  508. void NlpAlgorithm::serialize(MemoryBuffer & out)
  509. {
  510. out.append((unsigned)matchAction);
  511. out.append((unsigned)scanAction);
  512. out.append((byte)inputFormat);
  513. out.append(keepLimit);
  514. out.append(atMostLimit);
  515. out.append(charWidth);
  516. out.append(addedSeparators);
  517. out.append(notMatched);
  518. out.append(notMatchedOnly);
  519. out.append(chooseMin);
  520. out.append(chooseMax);
  521. out.append(chooseBest);
  522. out.append(singleChoicePerLine);
  523. out.append(maxLength);
  524. matchInfo->serialize(out);
  525. }
  526. void NlpAlgorithm::deserialize(MemoryBuffer & in)
  527. {
  528. unsigned temp;
  529. byte tempByte;
  530. in.read(temp); matchAction = (MatchAction)temp;
  531. in.read(temp); scanAction = (ScanAction)temp;
  532. in.read(tempByte); inputFormat = (NlpInputFormat)tempByte;
  533. in.read(keepLimit);
  534. in.read(atMostLimit);
  535. in.read(charWidth);
  536. in.read(addedSeparators);
  537. in.read(notMatched);
  538. in.read(notMatchedOnly);
  539. in.read(chooseMin);
  540. in.read(chooseMax);
  541. in.read(chooseBest);
  542. in.read(singleChoicePerLine);
  543. in.read(maxLength);
  544. matchInfo->deserialize(in);
  545. }
  546. INlpParseAlgorithm * createThorParser(MemoryBuffer & buffer, IOutputMetaData * outRecordSize)
  547. {
  548. byte kind;
  549. buffer.read(kind);
  550. switch (kind)
  551. {
  552. case NLPAregexStack:
  553. case NLPAregexHeap:
  554. return createRegexParser(buffer, outRecordSize, kind);
  555. case NLPAtomita:
  556. return createTomitaParser(buffer, outRecordSize);
  557. default:
  558. UNIMPLEMENTED;
  559. }
  560. }
  561. INlpParseAlgorithm * createThorParser(IResourceContext *ctx, IHThorParseArg & helper)
  562. {
  563. unsigned len;
  564. const void * data;
  565. helper.queryCompiled(ctx, len, data);
  566. MemoryBuffer compressed, buffer;
  567. compressed.setBuffer(len, (void *)data, false);
  568. decompressToBuffer(buffer, compressed);
  569. INlpParseAlgorithm * algorithm = createThorParser(buffer, helper.queryOutputMeta());
  570. algorithm->init(helper);
  571. return algorithm;
  572. }