text_formats.cc 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #include <memory>
  13. #include <string>
  14. #include <vector>
  15. #include "syntaxnet/document_format.h"
  16. #include "syntaxnet/segmenter_utils.h"
  17. #include "syntaxnet/sentence.pb.h"
  18. #include "syntaxnet/utils.h"
  19. #include "tensorflow/core/lib/io/buffered_inputstream.h"
  20. #include "tensorflow/core/lib/strings/strcat.h"
  21. #include "tensorflow/core/lib/strings/stringprintf.h"
  22. #include "tensorflow/core/platform/regexp.h"
  23. namespace syntaxnet {
  24. // CoNLL document format reader for dependency annotated corpora.
  25. // The expected format is described e.g. at http://ilk.uvt.nl/conll/#dataformat
  26. //
  27. // Data should adhere to the following rules:
  28. // - Data files contain sentences separated by a blank line.
  29. // - A sentence consists of one or tokens, each one starting on a new line.
  30. // - A token consists of ten fields described in the table below.
  31. // - Fields are separated by a single tab character.
  32. // - All data files will contains these ten fields, although only the ID
  33. // column is required to contain non-dummy (i.e. non-underscore) values.
  34. // Data files should be UTF-8 encoded (Unicode).
  35. //
  36. // Fields:
  37. // 1 ID: Token counter, starting at 1 for each new sentence and increasing
  38. // by 1 for every new token.
  39. // 2 FORM: Word form or punctuation symbol.
  40. // 3 LEMMA: Lemma or stem.
  41. // 4 CPOSTAG: Coarse-grained part-of-speech tag or category.
  42. // 5 POSTAG: Fine-grained part-of-speech tag. Note that the same POS tag
  43. // cannot appear with multiple coarse-grained POS tags.
  44. // 6 FEATS: Unordered set of syntactic and/or morphological features.
  45. // 7 HEAD: Head of the current token, which is either a value of ID or '0'.
  46. // 8 DEPREL: Dependency relation to the HEAD.
  47. // 9 PHEAD: Projective head of current token.
  48. // 10 PDEPREL: Dependency relation to the PHEAD.
  49. //
  50. // This CoNLL reader is compatible with the CoNLL-U format described at
  51. // http://universaldependencies.org/format.html
  52. // Note that this reader skips CoNLL-U multiword tokens and ignores the last two
  53. // fields of every line, which are PHEAD and PDEPREL in CoNLL format, but are
  54. // replaced by DEPS and MISC in CoNLL-U.
  55. //
  56. class CoNLLSyntaxFormat : public DocumentFormat {
  57. public:
  58. CoNLLSyntaxFormat() {}
  59. void Setup(TaskContext *context) override {
  60. join_category_to_pos_ = context->GetBoolParameter("join_category_to_pos");
  61. add_pos_as_attribute_ = context->GetBoolParameter("add_pos_as_attribute");
  62. }
  63. // Reads up to the first empty line and returns false end of file is reached.
  64. bool ReadRecord(tensorflow::io::BufferedInputStream *buffer,
  65. string *record) override {
  66. string line;
  67. record->clear();
  68. tensorflow::Status status = buffer->ReadLine(&line);
  69. while (!line.empty() && status.ok()) {
  70. tensorflow::strings::StrAppend(record, line, "\n");
  71. status = buffer->ReadLine(&line);
  72. }
  73. return status.ok() || !record->empty();
  74. }
  75. void ConvertFromString(const string &key, const string &value,
  76. std::vector<Sentence *> *sentences) override {
  77. // Create new sentence.
  78. Sentence *sentence = new Sentence();
  79. // Each line corresponds to one token.
  80. string text;
  81. std::vector<string> lines = utils::Split(value, '\n');
  82. // Add each token to the sentence.
  83. std::vector<string> fields;
  84. int expected_id = 1;
  85. for (size_t i = 0; i < lines.size(); ++i) {
  86. // Split line into tab-separated fields.
  87. fields.clear();
  88. fields = utils::Split(lines[i], '\t');
  89. if (fields.empty()) continue;
  90. // Skip comment lines.
  91. if (fields[0][0] == '#') continue;
  92. // Skip CoNLLU lines for multiword tokens which are indicated by
  93. // hyphenated line numbers, e.g., "2-4".
  94. // http://universaldependencies.github.io/docs/format.html
  95. if (RE2::FullMatch(fields[0], "[0-9]+-[0-9]+")) continue;
  96. // Clear all optional fields equal to '_'.
  97. for (size_t j = 2; j < fields.size(); ++j) {
  98. if (fields[j].length() == 1 && fields[j][0] == '_') fields[j].clear();
  99. }
  100. // Check that the line is valid.
  101. CHECK_GE(fields.size(), 8)
  102. << "Every line has to have at least 8 tab separated fields.";
  103. // Check that the ids follow the expected format.
  104. const int id = utils::ParseUsing<int>(fields[0], 0, utils::ParseInt32);
  105. CHECK_EQ(expected_id++, id)
  106. << "Token ids start at 1 for each new sentence and increase by 1 "
  107. << "on each new token. Sentences are separated by an empty line.";
  108. // Get relevant fields.
  109. const string &word = fields[1];
  110. const string &cpostag = fields[3];
  111. const string &tag = fields[4];
  112. const string &attributes = fields[5];
  113. const int head = utils::ParseUsing<int>(fields[6], 0, utils::ParseInt32);
  114. const string &label = fields[7];
  115. // Add token to sentence text.
  116. if (!text.empty()) text.append(" ");
  117. const int start = text.size();
  118. const int end = start + word.size() - 1;
  119. text.append(word);
  120. // Add token to sentence.
  121. Token *token = sentence->add_token();
  122. token->set_word(word);
  123. token->set_start(start);
  124. token->set_end(end);
  125. if (head > 0) token->set_head(head - 1);
  126. if (!tag.empty()) token->set_tag(tag);
  127. if (!cpostag.empty()) token->set_category(cpostag);
  128. if (!label.empty()) token->set_label(label);
  129. if (!attributes.empty()) AddMorphAttributes(attributes, token);
  130. if (join_category_to_pos_) JoinCategoryToPos(token);
  131. if (add_pos_as_attribute_) AddPosAsAttribute(token);
  132. }
  133. if (sentence->token_size() > 0) {
  134. sentence->set_docid(key);
  135. sentence->set_text(text);
  136. sentences->push_back(sentence);
  137. } else {
  138. // If the sentence was empty (e.g., blank lines at the beginning of a
  139. // file), then don't save it.
  140. delete sentence;
  141. }
  142. }
  143. // Converts a sentence to a key/value pair.
  144. void ConvertToString(const Sentence &sentence, string *key,
  145. string *value) override {
  146. *key = sentence.docid();
  147. std::vector<string> lines;
  148. for (int i = 0; i < sentence.token_size(); ++i) {
  149. Token token = sentence.token(i);
  150. if (join_category_to_pos_) SplitCategoryFromPos(&token);
  151. if (add_pos_as_attribute_) RemovePosFromAttributes(&token);
  152. std::vector<string> fields(10);
  153. fields[0] = tensorflow::strings::Printf("%d", i + 1);
  154. fields[1] = UnderscoreIfEmpty(token.word());
  155. fields[2] = "_";
  156. fields[3] = UnderscoreIfEmpty(token.category());
  157. fields[4] = UnderscoreIfEmpty(token.tag());
  158. fields[5] = GetMorphAttributes(token);
  159. fields[6] = tensorflow::strings::Printf("%d", token.head() + 1);
  160. fields[7] = UnderscoreIfEmpty(token.label());
  161. fields[8] = "_";
  162. fields[9] = "_";
  163. lines.push_back(utils::Join(fields, "\t"));
  164. }
  165. *value = tensorflow::strings::StrCat(utils::Join(lines, "\n"), "\n\n");
  166. }
  167. private:
  168. // Replaces empty fields with an undescore.
  169. string UnderscoreIfEmpty(const string &field) {
  170. return field.empty() ? "_" : field;
  171. }
  172. // Creates a TokenMorphology object out of a list of attribute values of the
  173. // form: a1=v1|a2=v2|... or v1|v2|...
  174. void AddMorphAttributes(const string &attributes, Token *token) {
  175. TokenMorphology *morph =
  176. token->MutableExtension(TokenMorphology::morphology);
  177. std::vector<string> att_vals = utils::Split(attributes, '|');
  178. for (int i = 0; i < att_vals.size(); ++i) {
  179. std::vector<string> att_val = utils::SplitOne(att_vals[i], '=');
  180. // Format is either:
  181. // 1) a1=v1|a2=v2..., e.g., Czech CoNLL data, or,
  182. // 2) v1|v2|..., e.g., German CoNLL data.
  183. const std::pair<string, string> name_value =
  184. att_val.size() == 2 ? std::make_pair(att_val[0], att_val[1])
  185. : std::make_pair(att_val[0], "on");
  186. // We currently don't expect an empty attribute value, but might have an
  187. // empty attribute name due to data input errors.
  188. if (name_value.second.empty()) {
  189. LOG(WARNING) << "Invalid attributes string: " << attributes
  190. << " for token: " << token->ShortDebugString();
  191. continue;
  192. }
  193. if (!name_value.first.empty()) {
  194. TokenMorphology::Attribute *attribute = morph->add_attribute();
  195. attribute->set_name(name_value.first);
  196. attribute->set_value(name_value.second);
  197. }
  198. }
  199. }
  200. // Creates a list of attribute values of the form a1=v1|a2=v2|... or v1|v2|...
  201. // from a TokenMorphology object.
  202. string GetMorphAttributes(const Token &token) {
  203. const TokenMorphology &morph =
  204. token.GetExtension(TokenMorphology::morphology);
  205. if (morph.attribute_size() == 0) return "_";
  206. string attributes;
  207. for (const TokenMorphology::Attribute &attribute : morph.attribute()) {
  208. if (!attributes.empty()) tensorflow::strings::StrAppend(&attributes, "|");
  209. tensorflow::strings::StrAppend(&attributes, attribute.name());
  210. if (attribute.value() != "on") {
  211. tensorflow::strings::StrAppend(&attributes, "=", attribute.value());
  212. }
  213. }
  214. return attributes;
  215. }
  216. void JoinCategoryToPos(Token *token) {
  217. token->set_tag(
  218. tensorflow::strings::StrCat(token->category(), "++", token->tag()));
  219. token->clear_category();
  220. }
  221. void SplitCategoryFromPos(Token *token) {
  222. const string &tag = token->tag();
  223. const size_t pos = tag.find("++");
  224. if (pos != string::npos) {
  225. token->set_category(tag.substr(0, pos));
  226. token->set_tag(tag.substr(pos + 2));
  227. }
  228. }
  229. void AddPosAsAttribute(Token *token) {
  230. if (!token->tag().empty()) {
  231. TokenMorphology *morph =
  232. token->MutableExtension(TokenMorphology::morphology);
  233. TokenMorphology::Attribute *attribute = morph->add_attribute();
  234. attribute->set_name("fPOS");
  235. attribute->set_value(token->tag());
  236. }
  237. }
  238. void RemovePosFromAttributes(Token *token) {
  239. // Assumes the "fPOS" attribute, if present, is the last one.
  240. TokenMorphology *morph =
  241. token->MutableExtension(TokenMorphology::morphology);
  242. if (morph->attribute_size() > 0 &&
  243. morph->attribute().rbegin()->name() == "fPOS") {
  244. morph->mutable_attribute()->RemoveLast();
  245. }
  246. }
  247. bool join_category_to_pos_ = false;
  248. bool add_pos_as_attribute_ = false;
  249. TF_DISALLOW_COPY_AND_ASSIGN(CoNLLSyntaxFormat);
  250. };
  251. REGISTER_SYNTAXNET_DOCUMENT_FORMAT("conll-sentence", CoNLLSyntaxFormat);
  252. // Reader for segmentation training data format. This reader assumes the input
  253. // format is similar to CoNLL format but with only two fileds:
  254. //
  255. // Fields:
  256. // 1 FORM: Word form or punctuation symbol.
  257. // 2 SPACE FLAG: Can be either 'SPACE' or 'NO_SPACE' indicates that whether
  258. // there should be a space between this word and the next one in
  259. // the raw text.
  260. //
  261. // Examples:
  262. // To create a training example for sentence with raw text:
  263. // That's a good point.
  264. // and the corresponding gold segmentation:
  265. // That 's a good point .
  266. // Then the correct input is:
  267. // That NO_SPACE
  268. // 's SPACE
  269. // a SPACE
  270. // good SPACE
  271. // point NO_SPACE
  272. // . NO_SPACE
  273. //
  274. // Yet another example:
  275. // To create a training example for sentence with raw text:
  276. // 这是一个测试
  277. // and the corresponding gold segmentation:
  278. // 这 是 一 个 测试
  279. // Then the correct input is:
  280. // 这 NO_SPACE
  281. // 是 NO_SPACE
  282. // 一 NO_SPACE
  283. // 个 NO_SPACE
  284. // 测试 NO_SPACE
  285. class SegmentationTrainingDataFormat : public CoNLLSyntaxFormat {
  286. public:
  287. // Converts to segmentation training data by breaking those word in the input
  288. // tokens to utf8 character based tokens. Moreover, if a character is the
  289. // first char of the word in the original token, then its break level is set
  290. // to SPACE_BREAK to indicate that the corresponding gold transition for that
  291. // character token is START. Otherwise NO_BREAK to indicate MERGE.
  292. void ConvertFromString(const string &key, const string &value,
  293. std::vector<Sentence *> *sentences) override {
  294. // Create new sentence.
  295. Sentence *sentence = new Sentence();
  296. // Each line corresponds to one token.
  297. string text;
  298. std::vector<string> lines = utils::Split(value, '\n');
  299. // Add each token to the sentence.
  300. std::vector<string> fields;
  301. for (size_t i = 0; i < lines.size(); ++i) {
  302. // Split line into tab-separated fields.
  303. fields.clear();
  304. fields = utils::Split(lines[i], '\t');
  305. if (fields.empty()) continue;
  306. // Skip comment lines.
  307. if (fields[0][0] == '#') continue;
  308. // Check that the line is valid.
  309. CHECK_GE(fields.size(), 2)
  310. << "Every line has to have at least 8 tab separated fields.";
  311. // Get relevant fields.
  312. const string &word = fields[0];
  313. CHECK(fields[1] == "SPACE" || fields[1] == "NO_SPACE")
  314. << "The space field can only be either 'SPACE' or 'NO_SPACE'";
  315. const bool space_after = fields[1] == "SPACE";
  316. // Add token to sentence text.
  317. int start = text.size();
  318. text.append(word);
  319. if (space_after && i != lines.size() - 1) {
  320. text.append(" ");
  321. }
  322. // Add character-based token to sentence.
  323. std::vector<tensorflow::StringPiece> chars;
  324. SegmenterUtils::GetUTF8Chars(word, &chars);
  325. bool is_first_char = true;
  326. for (auto utf8char : chars) {
  327. Token *char_token = sentence->add_token();
  328. char_token->set_word(utf8char.ToString());
  329. char_token->set_start(start);
  330. start += char_token->word().size();
  331. char_token->set_end(start - 1);
  332. char_token->set_break_level(
  333. is_first_char ? Token::SPACE_BREAK : Token::NO_BREAK);
  334. is_first_char = false;
  335. }
  336. // Add another space token.
  337. if (space_after) {
  338. Token *char_token = sentence->add_token();
  339. char_token->set_word(" ");
  340. char_token->set_start(start);
  341. char_token->set_end(start);
  342. char_token->set_break_level(Token::SPACE_BREAK);
  343. }
  344. }
  345. if (sentence->token_size() > 0) {
  346. sentence->set_docid(key);
  347. sentence->set_text(text);
  348. sentences->push_back(sentence);
  349. } else {
  350. // If the sentence was empty (e.g., blank lines at the beginning of a
  351. // file), then don't save it.
  352. delete sentence;
  353. }
  354. }
  355. };
  356. REGISTER_SYNTAXNET_DOCUMENT_FORMAT("segment-train-data",
  357. SegmentationTrainingDataFormat);
  358. // Reader for tokenized text. This reader expects every sentence to be on a
  359. // single line and tokens on that line to be separated by single spaces.
  360. //
  361. class TokenizedTextFormat : public DocumentFormat {
  362. public:
  363. TokenizedTextFormat() {}
  364. // Reads a line and returns false if end of file is reached.
  365. bool ReadRecord(tensorflow::io::BufferedInputStream *buffer,
  366. string *record) override {
  367. return buffer->ReadLine(record).ok();
  368. }
  369. void ConvertFromString(const string &key, const string &value,
  370. std::vector<Sentence *> *sentences) override {
  371. Sentence *sentence = new Sentence();
  372. string text;
  373. for (const string &word : utils::Split(value, ' ')) {
  374. if (word.empty()) continue;
  375. const int start = text.size();
  376. const int end = start + word.size() - 1;
  377. if (!text.empty()) text.append(" ");
  378. text.append(word);
  379. Token *token = sentence->add_token();
  380. token->set_word(word);
  381. token->set_start(start);
  382. token->set_end(end);
  383. }
  384. if (sentence->token_size() > 0) {
  385. sentence->set_docid(key);
  386. sentence->set_text(text);
  387. sentences->push_back(sentence);
  388. } else {
  389. // If the sentence was empty (e.g., blank lines at the beginning of a
  390. // file), then don't save it.
  391. delete sentence;
  392. }
  393. }
  394. void ConvertToString(const Sentence &sentence, string *key,
  395. string *value) override {
  396. *key = sentence.docid();
  397. value->clear();
  398. for (const Token &token : sentence.token()) {
  399. if (!value->empty()) value->append(" ");
  400. value->append(token.word());
  401. if (token.has_tag()) {
  402. value->append("_");
  403. value->append(token.tag());
  404. }
  405. if (token.has_head()) {
  406. value->append("_");
  407. value->append(tensorflow::strings::StrCat(token.head()));
  408. }
  409. }
  410. value->append("\n");
  411. }
  412. private:
  413. TF_DISALLOW_COPY_AND_ASSIGN(TokenizedTextFormat);
  414. };
  415. REGISTER_SYNTAXNET_DOCUMENT_FORMAT("tokenized-text", TokenizedTextFormat);
  416. // Reader for un-tokenized text. This reader expects every sentence to be on a
  417. // single line. For each line in the input, a sentence proto will be created,
  418. // where tokens are utf8 characters of that line.
  419. //
  420. class UntokenizedTextFormat : public TokenizedTextFormat {
  421. public:
  422. UntokenizedTextFormat() {}
  423. void ConvertFromString(const string &key, const string &value,
  424. std::vector<Sentence *> *sentences) override {
  425. Sentence *sentence = new Sentence();
  426. std::vector<tensorflow::StringPiece> chars;
  427. SegmenterUtils::GetUTF8Chars(value, &chars);
  428. int start = 0;
  429. for (auto utf8char : chars) {
  430. Token *token = sentence->add_token();
  431. token->set_word(utf8char.ToString());
  432. token->set_start(start);
  433. start += utf8char.size();
  434. token->set_end(start - 1);
  435. }
  436. if (sentence->token_size() > 0) {
  437. sentence->set_docid(key);
  438. sentence->set_text(value);
  439. sentences->push_back(sentence);
  440. } else {
  441. // If the sentence was empty (e.g., blank lines at the beginning of a
  442. // file), then don't save it.
  443. delete sentence;
  444. }
  445. }
  446. private:
  447. TF_DISALLOW_COPY_AND_ASSIGN(UntokenizedTextFormat);
  448. };
  449. REGISTER_SYNTAXNET_DOCUMENT_FORMAT("untokenized-text", UntokenizedTextFormat);
  450. // Text reader that attmpts to perform Penn Treebank tokenization on arbitrary
  451. // raw text. Adapted from https://www.cis.upenn.edu/~treebank/tokenizer.sed
  452. // by Robert MacIntyre, University of Pennsylvania, late 1995.
  453. // Expected input: raw text with one sentence per line.
  454. //
  455. class EnglishTextFormat : public TokenizedTextFormat {
  456. public:
  457. EnglishTextFormat() {}
  458. void ConvertFromString(const string &key, const string &value,
  459. std::vector<Sentence *> *sentences) override {
  460. std::vector<std::pair<string, string>> preproc_rules = {
  461. // Punctuation.
  462. {"’", "'"},
  463. {"…", "..."},
  464. {"---", "--"},
  465. {"—", "--"},
  466. {"–", "--"},
  467. {",", ","},
  468. {"。", "."},
  469. {"!", "!"},
  470. {"?", "?"},
  471. {":", ":"},
  472. {";", ";"},
  473. {"&", "&"},
  474. // Brackets.
  475. {"\\[", "("},
  476. {"]", ")"},
  477. {"{", "("},
  478. {"}", ")"},
  479. {"【", "("},
  480. {"】", ")"},
  481. {"(", "("},
  482. {")", ")"},
  483. // Quotation marks.
  484. {"\"", "\""},
  485. {"″", "\""},
  486. {"“", "\""},
  487. {"„", "\""},
  488. {"‵‵", "\""},
  489. {"”", "\""},
  490. {"’", "\""},
  491. {"‘", "\""},
  492. {"′′", "\""},
  493. {"‹", "\""},
  494. {"›", "\""},
  495. {"«", "\""},
  496. {"»", "\""},
  497. // Discarded punctuation that breaks sentences.
  498. {"|", ""},
  499. {"·", ""},
  500. {"•", ""},
  501. {"●", ""},
  502. {"▪", ""},
  503. {"■", ""},
  504. {"□", ""},
  505. {"❑", ""},
  506. {"◆", ""},
  507. {"★", ""},
  508. {"*", ""},
  509. {"♦", ""},
  510. };
  511. std::vector<std::pair<string, string>> rules = {
  512. // attempt to get correct directional quotes
  513. {R"re(^")re", "`` "},
  514. {R"re(([ \([{<])")re", "\\1 `` "},
  515. // close quotes handled at end
  516. {R"re(\.\.\.)re", " ... "},
  517. {"[,;:@#$%&]", " \\0 "},
  518. // Assume sentence tokenization has been done first, so split FINAL
  519. // periods only.
  520. {R"re(([^.])(\.)([\]\)}>"']*)[ ]*$)re", "\\1 \\2\\3 "},
  521. // however, we may as well split ALL question marks and exclamation
  522. // points, since they shouldn't have the abbrev.-marker ambiguity
  523. // problem
  524. {"[?!]", " \\0 "},
  525. // parentheses, brackets, etc.
  526. {R"re([\]\[\(\){}<>])re", " \\0 "},
  527. // Like Adwait Ratnaparkhi's MXPOST, we use the parsed-file version of
  528. // these symbols.
  529. {"\\(", "-LRB-"},
  530. {"\\)", "-RRB-"},
  531. {"\\]", "-LSB-"},
  532. {"\\]", "-RSB-"},
  533. {"{", "-LCB-"},
  534. {"}", "-RCB-"},
  535. {"--", " -- "},
  536. // First off, add a space to the beginning and end of each line, to
  537. // reduce necessary number of regexps.
  538. {"$", " "},
  539. {"^", " "},
  540. {"\"", " '' "},
  541. // possessive or close-single-quote
  542. {"([^'])' ", "\\1 ' "},
  543. // as in it's, I'm, we'd
  544. {"'([sSmMdD]) ", " '\\1 "},
  545. {"'ll ", " 'll "},
  546. {"'re ", " 're "},
  547. {"'ve ", " 've "},
  548. {"n't ", " n't "},
  549. {"'LL ", " 'LL "},
  550. {"'RE ", " 'RE "},
  551. {"'VE ", " 'VE "},
  552. {"N'T ", " N'T "},
  553. {" ([Cc])annot ", " \\1an not "},
  554. {" ([Dd])'ye ", " \\1' ye "},
  555. {" ([Gg])imme ", " \\1im me "},
  556. {" ([Gg])onna ", " \\1on na "},
  557. {" ([Gg])otta ", " \\1ot ta "},
  558. {" ([Ll])emme ", " \\1em me "},
  559. {" ([Mm])ore'n ", " \\1ore 'n "},
  560. {" '([Tt])is ", " '\\1 is "},
  561. {" '([Tt])was ", " '\\1 was "},
  562. {" ([Ww])anna ", " \\1an na "},
  563. {" ([Ww])haddya ", " \\1ha dd ya "},
  564. {" ([Ww])hatcha ", " \\1ha t cha "},
  565. // clean out extra spaces
  566. {" *", " "},
  567. {"^ *", ""},
  568. };
  569. string rewritten = value;
  570. for (const std::pair<string, string> &rule : preproc_rules) {
  571. RE2::GlobalReplace(&rewritten, rule.first, rule.second);
  572. }
  573. for (const std::pair<string, string> &rule : rules) {
  574. RE2::GlobalReplace(&rewritten, rule.first, rule.second);
  575. }
  576. TokenizedTextFormat::ConvertFromString(key, rewritten, sentences);
  577. }
  578. private:
  579. TF_DISALLOW_COPY_AND_ASSIGN(EnglishTextFormat);
  580. };
  581. REGISTER_SYNTAXNET_DOCUMENT_FORMAT("english-text", EnglishTextFormat);
  582. } // namespace syntaxnet