text_formats.cc 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #include <memory>
  13. #include "syntaxnet/base.h"
  14. #include "syntaxnet/document_format.h"
  15. #include "syntaxnet/segmenter_utils.h"
  16. #include "syntaxnet/sentence.pb.h"
  17. #include "syntaxnet/utils.h"
  18. #include "tensorflow/core/lib/io/buffered_inputstream.h"
  19. #include "tensorflow/core/lib/strings/strcat.h"
  20. #include "tensorflow/core/lib/strings/stringprintf.h"
  21. #include "tensorflow/core/platform/regexp.h"
  22. namespace syntaxnet {
  23. namespace {
  24. // Reads up to the first empty line, and returns false end of file is reached.
  25. //
  26. // This reader is shared by CONLL and prototext formats, where records are
  27. // separated by double newlines.
  28. bool DoubleNewlineReadRecord(tensorflow::io::BufferedInputStream *buffer,
  29. string *record) {
  30. string line;
  31. record->clear();
  32. tensorflow::Status status = buffer->ReadLine(&line);
  33. while (!line.empty() && status.ok()) {
  34. tensorflow::strings::StrAppend(record, line, "\n");
  35. status = buffer->ReadLine(&line);
  36. }
  37. return status.ok() || !record->empty();
  38. }
  39. } // namespace
  40. // CoNLL document format reader for dependency annotated corpora.
  41. // The expected format is described e.g. at http://ilk.uvt.nl/conll/#dataformat
  42. //
  43. // Data should adhere to the following rules:
  44. // - Data files contain sentences separated by a blank line.
// - A sentence consists of one or more tokens, each one starting on a new line.
  46. // - A token consists of ten fields described in the table below.
  47. // - Fields are separated by a single tab character.
// - All data files will contain these ten fields, although only the ID
  49. // column is required to contain non-dummy (i.e. non-underscore) values.
  50. // Data files should be UTF-8 encoded (Unicode).
  51. //
  52. // Fields:
  53. // 1 ID: Token counter, starting at 1 for each new sentence and increasing
  54. // by 1 for every new token.
  55. // 2 FORM: Word form or punctuation symbol.
  56. // 3 LEMMA: Lemma or stem.
  57. // 4 CPOSTAG: Coarse-grained part-of-speech tag or category.
  58. // 5 POSTAG: Fine-grained part-of-speech tag. Note that the same POS tag
  59. // cannot appear with multiple coarse-grained POS tags.
  60. // 6 FEATS: Unordered set of syntactic and/or morphological features.
  61. // 7 HEAD: Head of the current token, which is either a value of ID or '0'.
  62. // 8 DEPREL: Dependency relation to the HEAD.
  63. // 9 PHEAD: Projective head of current token.
  64. // 10 PDEPREL: Dependency relation to the PHEAD.
  65. //
  66. // This CoNLL reader is compatible with the CoNLL-U format described at
  67. // http://universaldependencies.org/format.html
  68. // Note that this reader skips CoNLL-U multiword tokens and empty nodes.
  69. //
// Note on reconstructing the raw text of a sentence: the raw text is
// constructed by concatenating all words (field 2) with an intervening space
  72. // consecutive words. If the last field of a token is "SpaceAfter=No", there
  73. // would be no space between current word and the next one.
  74. class CoNLLSyntaxFormat : public DocumentFormat {
  75. public:
  76. CoNLLSyntaxFormat() {}
  77. void Setup(TaskContext *context) override {
  78. join_category_to_pos_ = context->GetBoolParameter("join_category_to_pos");
  79. add_pos_as_attribute_ = context->GetBoolParameter("add_pos_as_attribute");
  80. serialize_morph_to_pos_ =
  81. context->GetBoolParameter("serialize_morph_to_pos");
  82. }
  83. // Reads up to the first empty line and returns false end of file is reached.
  84. bool ReadRecord(tensorflow::io::BufferedInputStream *buffer,
  85. string *record) override {
  86. return DoubleNewlineReadRecord(buffer, record);
  87. }
  88. void ConvertFromString(const string &key, const string &value,
  89. std::vector<Sentence *> *sentences) override {
  90. // Create new sentence.
  91. Sentence *sentence = new Sentence();
  92. // Each line corresponds to one token.
  93. string text;
  94. bool add_space_to_text = true;
  95. std::vector<string> lines = utils::Split(value, '\n');
  96. // Add each token to the sentence.
  97. std::vector<string> fields;
  98. int expected_id = 1;
  99. for (size_t i = 0; i < lines.size(); ++i) {
  100. // Split line into tab-separated fields.
  101. fields.clear();
  102. fields = utils::Split(lines[i], '\t');
  103. if (fields.empty()) continue;
  104. // Skip comment lines.
  105. if (fields[0][0] == '#') continue;
  106. // Skip CoNLLU lines for multiword tokens which are indicated by
  107. // hyphenated line numbers, e.g., "2-4".
  108. // http://universaldependencies.github.io/docs/format.html
  109. if (RE2::FullMatch(fields[0], "[0-9]+-[0-9]+")) continue;
  110. // Skip CoNLLU lines for empty tokens, indicated by decimals.
  111. // Introduced in v2. http://universaldependencies.org/format.html
  112. if (RE2::FullMatch(fields[0], "[0-9]+\\.[0-9]+")) continue;
  113. // Clear all optional fields equal to '_'.
  114. for (size_t j = 2; j < fields.size(); ++j) {
  115. if (fields[j].length() == 1 && fields[j][0] == '_') fields[j].clear();
  116. }
  117. // Check that the line is valid.
  118. CHECK_GE(fields.size(), 8)
  119. << "Every line has to have at least 8 tab separated fields.";
  120. // Check that the ids follow the expected format.
  121. const int id = utils::ParseUsing<int>(fields[0], 0, utils::ParseInt32);
  122. CHECK_EQ(expected_id++, id)
  123. << "Token ids start at 1 for each new sentence and increase by 1 "
  124. << "on each new token. Sentences are separated by an empty line.";
  125. // Get relevant fields.
  126. const string &word = fields[1];
  127. const string &cpostag = fields[3];
  128. const string &tag = fields[4];
  129. const string &attributes = fields[5];
  130. const int head = utils::ParseUsing<int>(fields[6], 0, utils::ParseInt32);
  131. const string &label = fields[7];
  132. // Add token to sentence text.
  133. if (!text.empty() && add_space_to_text) text.append(" ");
  134. const int start = text.size();
  135. const int end = start + word.size() - 1;
  136. text.append(word);
  137. add_space_to_text = fields[9] != "SpaceAfter=No";
  138. // Add token to sentence.
  139. Token *token = sentence->add_token();
  140. token->set_word(word);
  141. token->set_start(start);
  142. token->set_end(end);
  143. if (head > 0) token->set_head(head - 1);
  144. if (!tag.empty()) token->set_tag(tag);
  145. if (!cpostag.empty()) token->set_category(cpostag);
  146. if (!label.empty()) token->set_label(label);
  147. if (!attributes.empty()) AddMorphAttributes(attributes, token);
  148. if (join_category_to_pos_) JoinCategoryToPos(token);
  149. if (add_pos_as_attribute_) AddPosAsAttribute(token);
  150. if (serialize_morph_to_pos_) SerializeMorphToPos(token);
  151. }
  152. if (sentence->token_size() > 0) {
  153. sentence->set_docid(key);
  154. sentence->set_text(text);
  155. sentences->push_back(sentence);
  156. } else {
  157. // If the sentence was empty (e.g., blank lines at the beginning of a
  158. // file), then don't save it.
  159. delete sentence;
  160. }
  161. }
  162. // Converts a sentence to a key/value pair.
  163. void ConvertToString(const Sentence &sentence, string *key,
  164. string *value) override {
  165. *key = sentence.docid();
  166. std::vector<string> lines;
  167. for (int i = 0; i < sentence.token_size(); ++i) {
  168. Token token = sentence.token(i);
  169. if (join_category_to_pos_) SplitCategoryFromPos(&token);
  170. if (add_pos_as_attribute_) RemovePosFromAttributes(&token);
  171. std::vector<string> fields(10);
  172. fields[0] = tensorflow::strings::Printf("%d", i + 1);
  173. fields[1] = UnderscoreIfEmpty(token.word());
  174. fields[2] = "_";
  175. fields[3] = UnderscoreIfEmpty(token.category());
  176. fields[4] = UnderscoreIfEmpty(token.tag());
  177. fields[5] = GetMorphAttributes(token);
  178. fields[6] = tensorflow::strings::Printf("%d", token.head() + 1);
  179. fields[7] = UnderscoreIfEmpty(token.label());
  180. fields[8] = "_";
  181. fields[9] = "_";
  182. lines.push_back(utils::Join(fields, "\t"));
  183. }
  184. *value = tensorflow::strings::StrCat(utils::Join(lines, "\n"), "\n\n");
  185. }
  186. private:
  187. // Replaces empty fields with an undescore.
  188. string UnderscoreIfEmpty(const string &field) {
  189. return field.empty() ? "_" : field;
  190. }
  191. // Creates a TokenMorphology object out of a list of attribute values of the
  192. // form: a1=v1|a2=v2|... or v1|v2|...
  193. void AddMorphAttributes(const string &attributes, Token *token) {
  194. TokenMorphology *morph =
  195. token->MutableExtension(TokenMorphology::morphology);
  196. std::vector<string> att_vals = utils::Split(attributes, '|');
  197. for (int i = 0; i < att_vals.size(); ++i) {
  198. std::vector<string> att_val = utils::SplitOne(att_vals[i], '=');
  199. // Format is either:
  200. // 1) a1=v1|a2=v2..., e.g., Czech CoNLL data, or,
  201. // 2) v1|v2|..., e.g., German CoNLL data.
  202. const std::pair<string, string> name_value =
  203. att_val.size() == 2 ? std::make_pair(att_val[0], att_val[1])
  204. : std::make_pair(att_val[0], "on");
  205. // We currently don't expect an empty attribute value, but might have an
  206. // empty attribute name due to data input errors.
  207. if (name_value.second.empty()) {
  208. LOG(WARNING) << "Invalid attributes string: " << attributes
  209. << " for token: " << token->ShortDebugString();
  210. continue;
  211. }
  212. if (!name_value.first.empty()) {
  213. TokenMorphology::Attribute *attribute = morph->add_attribute();
  214. attribute->set_name(name_value.first);
  215. attribute->set_value(name_value.second);
  216. }
  217. }
  218. }
  219. // Creates a list of attribute values of the form a1=v1|a2=v2|... or v1|v2|...
  220. // from a TokenMorphology object.
  221. string GetMorphAttributes(const Token &token) {
  222. const TokenMorphology &morph =
  223. token.GetExtension(TokenMorphology::morphology);
  224. if (morph.attribute_size() == 0) return "_";
  225. string attributes;
  226. for (const TokenMorphology::Attribute &attribute : morph.attribute()) {
  227. if (!attributes.empty()) tensorflow::strings::StrAppend(&attributes, "|");
  228. tensorflow::strings::StrAppend(&attributes, attribute.name());
  229. if (attribute.value() != "on") {
  230. tensorflow::strings::StrAppend(&attributes, "=", attribute.value());
  231. }
  232. }
  233. return attributes;
  234. }
  235. void JoinCategoryToPos(Token *token) {
  236. token->set_tag(
  237. tensorflow::strings::StrCat(token->category(), "++", token->tag()));
  238. token->clear_category();
  239. }
  240. void SplitCategoryFromPos(Token *token) {
  241. const string &tag = token->tag();
  242. const size_t pos = tag.find("++");
  243. if (pos != string::npos) {
  244. token->set_category(tag.substr(0, pos));
  245. token->set_tag(tag.substr(pos + 2));
  246. }
  247. }
  248. void AddPosAsAttribute(Token *token) {
  249. if (!token->tag().empty()) {
  250. TokenMorphology *morph =
  251. token->MutableExtension(TokenMorphology::morphology);
  252. TokenMorphology::Attribute *attribute = morph->add_attribute();
  253. attribute->set_name("fPOS");
  254. attribute->set_value(token->tag());
  255. }
  256. }
  257. void RemovePosFromAttributes(Token *token) {
  258. // Assumes the "fPOS" attribute, if present, is the last one.
  259. TokenMorphology *morph =
  260. token->MutableExtension(TokenMorphology::morphology);
  261. if (morph->attribute_size() > 0 &&
  262. morph->attribute().rbegin()->name() == "fPOS") {
  263. morph->mutable_attribute()->RemoveLast();
  264. }
  265. }
  266. void SerializeMorphToPos(Token *token) {
  267. const TokenMorphology &morph =
  268. token->GetExtension(TokenMorphology::morphology);
  269. TextFormat::Printer printer;
  270. printer.SetSingleLineMode(true);
  271. string morph_str;
  272. printer.PrintToString(morph, &morph_str);
  273. token->set_tag(morph_str);
  274. }
  275. bool join_category_to_pos_ = false;
  276. bool add_pos_as_attribute_ = false;
  277. bool serialize_morph_to_pos_ = false;
  278. TF_DISALLOW_COPY_AND_ASSIGN(CoNLLSyntaxFormat);
  279. };
  280. REGISTER_SYNTAXNET_DOCUMENT_FORMAT("conll-sentence", CoNLLSyntaxFormat);
  281. // Reader for segmentation training data format. This reader assumes the input
// format is similar to CoNLL format but with only two fields:
  283. //
  284. // Fields:
  285. // 1 FORM: Word form or punctuation symbol.
  286. // 2 SPACE FLAG: Can be either 'SPACE' or 'NO_SPACE' indicates that whether
  287. // there should be a space between this word and the next one in
  288. // the raw text.
  289. //
  290. // Examples:
  291. // To create a training example for sentence with raw text:
  292. // That's a good point.
  293. // and the corresponding gold segmentation:
  294. // That 's a good point .
  295. // Then the correct input is:
  296. // That NO_SPACE
  297. // 's SPACE
  298. // a SPACE
  299. // good SPACE
  300. // point NO_SPACE
  301. // . NO_SPACE
  302. //
  303. // Yet another example:
  304. // To create a training example for sentence with raw text:
  305. // 这是一个测试
  306. // and the corresponding gold segmentation:
  307. // 这 是 一 个 测试
  308. // Then the correct input is:
  309. // 这 NO_SPACE
  310. // 是 NO_SPACE
  311. // 一 NO_SPACE
  312. // 个 NO_SPACE
  313. // 测试 NO_SPACE
  314. class SegmentationTrainingDataFormat : public CoNLLSyntaxFormat {
  315. public:
  316. // Converts to segmentation training data by breaking those word in the input
  317. // tokens to utf8 character based tokens. Moreover, if a character is the
  318. // first char of the word in the original token, then its break level is set
  319. // to SPACE_BREAK to indicate that the corresponding gold transition for that
  320. // character token is START. Otherwise NO_BREAK to indicate MERGE.
  321. void ConvertFromString(const string &key, const string &value,
  322. std::vector<Sentence *> *sentences) override {
  323. // Create new sentence.
  324. Sentence *sentence = new Sentence();
  325. // Each line corresponds to one token.
  326. string text;
  327. std::vector<string> lines = utils::Split(value, '\n');
  328. // Add each token to the sentence.
  329. std::vector<string> fields;
  330. for (size_t i = 0; i < lines.size(); ++i) {
  331. // Split line into tab-separated fields.
  332. fields.clear();
  333. fields = utils::Split(lines[i], '\t');
  334. if (fields.empty()) continue;
  335. // Skip comment lines.
  336. if (fields[0][0] == '#') continue;
  337. // Check that the line is valid.
  338. CHECK_GE(fields.size(), 2)
  339. << "Every line has to have at least 8 tab separated fields.";
  340. // Get relevant fields.
  341. const string &word = fields[0];
  342. CHECK(fields[1] == "SPACE" || fields[1] == "NO_SPACE")
  343. << "The space field can only be either 'SPACE' or 'NO_SPACE'";
  344. const bool space_after = fields[1] == "SPACE";
  345. // Add token to sentence text.
  346. int start = text.size();
  347. text.append(word);
  348. if (space_after && i != lines.size() - 1) {
  349. text.append(" ");
  350. }
  351. // Add character-based token to sentence.
  352. std::vector<tensorflow::StringPiece> chars;
  353. SegmenterUtils::GetUTF8Chars(word, &chars);
  354. bool is_first_char = true;
  355. for (auto utf8char : chars) {
  356. Token *char_token = sentence->add_token();
  357. char_token->set_word(utf8char.ToString());
  358. char_token->set_start(start);
  359. start += char_token->word().size();
  360. char_token->set_end(start - 1);
  361. char_token->set_break_level(
  362. is_first_char ? Token::SPACE_BREAK : Token::NO_BREAK);
  363. is_first_char = false;
  364. }
  365. // Add another space token.
  366. if (space_after) {
  367. Token *char_token = sentence->add_token();
  368. char_token->set_word(" ");
  369. char_token->set_start(start);
  370. char_token->set_end(start);
  371. char_token->set_break_level(Token::SPACE_BREAK);
  372. }
  373. }
  374. if (sentence->token_size() > 0) {
  375. sentence->set_docid(key);
  376. sentence->set_text(text);
  377. sentences->push_back(sentence);
  378. } else {
  379. // If the sentence was empty (e.g., blank lines at the beginning of a
  380. // file), then don't save it.
  381. delete sentence;
  382. }
  383. }
  384. };
  385. REGISTER_SYNTAXNET_DOCUMENT_FORMAT("segment-train-data",
  386. SegmentationTrainingDataFormat);
  387. // Reader for tokenized text. This reader expects every sentence to be on a
  388. // single line and tokens on that line to be separated by single spaces.
  389. //
  390. class TokenizedTextFormat : public DocumentFormat {
  391. public:
  392. TokenizedTextFormat() {}
  393. // Reads a line and returns false if end of file is reached.
  394. bool ReadRecord(tensorflow::io::BufferedInputStream *buffer,
  395. string *record) override {
  396. return buffer->ReadLine(record).ok();
  397. }
  398. void ConvertFromString(const string &key, const string &value,
  399. std::vector<Sentence *> *sentences) override {
  400. Sentence *sentence = new Sentence();
  401. string text;
  402. for (const string &word : utils::Split(value, ' ')) {
  403. if (word.empty()) continue;
  404. const int start = text.size();
  405. const int end = start + word.size() - 1;
  406. if (!text.empty()) text.append(" ");
  407. text.append(word);
  408. Token *token = sentence->add_token();
  409. token->set_word(word);
  410. token->set_start(start);
  411. token->set_end(end);
  412. }
  413. if (sentence->token_size() > 0) {
  414. sentence->set_docid(key);
  415. sentence->set_text(text);
  416. sentences->push_back(sentence);
  417. } else {
  418. // If the sentence was empty (e.g., blank lines at the beginning of a
  419. // file), then don't save it.
  420. delete sentence;
  421. }
  422. }
  423. void ConvertToString(const Sentence &sentence, string *key,
  424. string *value) override {
  425. *key = sentence.docid();
  426. value->clear();
  427. for (const Token &token : sentence.token()) {
  428. if (!value->empty()) value->append(" ");
  429. value->append(token.word());
  430. if (token.has_tag()) {
  431. value->append("_");
  432. value->append(token.tag());
  433. }
  434. if (token.has_head()) {
  435. value->append("_");
  436. value->append(tensorflow::strings::StrCat(token.head()));
  437. }
  438. }
  439. value->append("\n");
  440. }
  441. private:
  442. TF_DISALLOW_COPY_AND_ASSIGN(TokenizedTextFormat);
  443. };
  444. REGISTER_SYNTAXNET_DOCUMENT_FORMAT("tokenized-text", TokenizedTextFormat);
  445. // Reader for un-tokenized text. This reader expects every sentence to be on a
  446. // single line. For each line in the input, a sentence proto will be created,
  447. // where tokens are utf8 characters of that line.
  448. //
  449. class UntokenizedTextFormat : public TokenizedTextFormat {
  450. public:
  451. UntokenizedTextFormat() {}
  452. void ConvertFromString(const string &key, const string &value,
  453. std::vector<Sentence *> *sentences) override {
  454. Sentence *sentence = new Sentence();
  455. std::vector<tensorflow::StringPiece> chars;
  456. SegmenterUtils::GetUTF8Chars(value, &chars);
  457. int start = 0;
  458. for (auto utf8char : chars) {
  459. Token *token = sentence->add_token();
  460. token->set_word(utf8char.ToString());
  461. token->set_start(start);
  462. start += utf8char.size();
  463. token->set_end(start - 1);
  464. }
  465. if (sentence->token_size() > 0) {
  466. sentence->set_docid(key);
  467. sentence->set_text(value);
  468. sentences->push_back(sentence);
  469. } else {
  470. // If the sentence was empty (e.g., blank lines at the beginning of a
  471. // file), then don't save it.
  472. delete sentence;
  473. }
  474. }
  475. private:
  476. TF_DISALLOW_COPY_AND_ASSIGN(UntokenizedTextFormat);
  477. };
  478. REGISTER_SYNTAXNET_DOCUMENT_FORMAT("untokenized-text", UntokenizedTextFormat);
  479. // Text reader that attmpts to perform Penn Treebank tokenization on arbitrary
  480. // raw text. Adapted from https://www.cis.upenn.edu/~treebank/tokenizer.sed
  481. // by Robert MacIntyre, University of Pennsylvania, late 1995.
  482. // Expected input: raw text with one sentence per line.
  483. //
  484. class EnglishTextFormat : public TokenizedTextFormat {
  485. public:
  486. EnglishTextFormat() {}
  487. void ConvertFromString(const string &key, const string &value,
  488. std::vector<Sentence *> *sentences) override {
  489. std::vector<std::pair<string, string>> preproc_rules = {
  490. // Punctuation.
  491. {"’", "'"},
  492. {"…", "..."},
  493. {"---", "--"},
  494. {"—", "--"},
  495. {"–", "--"},
  496. {",", ","},
  497. {"。", "."},
  498. {"!", "!"},
  499. {"?", "?"},
  500. {":", ":"},
  501. {";", ";"},
  502. {"&", "&"},
  503. // Brackets.
  504. {"\\[", "("},
  505. {"]", ")"},
  506. {"{", "("},
  507. {"}", ")"},
  508. {"【", "("},
  509. {"】", ")"},
  510. {"(", "("},
  511. {")", ")"},
  512. // Quotation marks.
  513. {"\"", "\""},
  514. {"″", "\""},
  515. {"“", "\""},
  516. {"„", "\""},
  517. {"‵‵", "\""},
  518. {"”", "\""},
  519. {"’", "\""},
  520. {"‘", "\""},
  521. {"′′", "\""},
  522. {"‹", "\""},
  523. {"›", "\""},
  524. {"«", "\""},
  525. {"»", "\""},
  526. // Discarded punctuation that breaks sentences.
  527. {"|", ""},
  528. {"·", ""},
  529. {"•", ""},
  530. {"●", ""},
  531. {"▪", ""},
  532. {"■", ""},
  533. {"□", ""},
  534. {"❑", ""},
  535. {"◆", ""},
  536. {"★", ""},
  537. {"*", ""},
  538. {"♦", ""},
  539. };
  540. std::vector<std::pair<string, string>> rules = {
  541. // attempt to get correct directional quotes
  542. {R"re(^")re", "`` "},
  543. {R"re(([ \([{<])")re", "\\1 `` "},
  544. // close quotes handled at end
  545. {R"re(\.\.\.)re", " ... "},
  546. {"[,;:@#$%&]", " \\0 "},
  547. // Assume sentence tokenization has been done first, so split FINAL
  548. // periods only.
  549. {R"re(([^.])(\.)([\]\)}>"']*)[ ]*$)re", "\\1 \\2\\3 "},
  550. // however, we may as well split ALL question marks and exclamation
  551. // points, since they shouldn't have the abbrev.-marker ambiguity
  552. // problem
  553. {"[?!]", " \\0 "},
  554. // parentheses, brackets, etc.
  555. {R"re([\]\[\(\){}<>])re", " \\0 "},
  556. // Like Adwait Ratnaparkhi's MXPOST, we use the parsed-file version of
  557. // these symbols.
  558. {"\\(", "-LRB-"},
  559. {"\\)", "-RRB-"},
  560. {"\\]", "-LSB-"},
  561. {"\\]", "-RSB-"},
  562. {"{", "-LCB-"},
  563. {"}", "-RCB-"},
  564. {"--", " -- "},
  565. // First off, add a space to the beginning and end of each line, to
  566. // reduce necessary number of regexps.
  567. {"$", " "},
  568. {"^", " "},
  569. {"\"", " '' "},
  570. // possessive or close-single-quote
  571. {"([^'])' ", "\\1 ' "},
  572. // as in it's, I'm, we'd
  573. {"'([sSmMdD]) ", " '\\1 "},
  574. {"'ll ", " 'll "},
  575. {"'re ", " 're "},
  576. {"'ve ", " 've "},
  577. {"n't ", " n't "},
  578. {"'LL ", " 'LL "},
  579. {"'RE ", " 'RE "},
  580. {"'VE ", " 'VE "},
  581. {"N'T ", " N'T "},
  582. {" ([Cc])annot ", " \\1an not "},
  583. {" ([Dd])'ye ", " \\1' ye "},
  584. {" ([Gg])imme ", " \\1im me "},
  585. {" ([Gg])onna ", " \\1on na "},
  586. {" ([Gg])otta ", " \\1ot ta "},
  587. {" ([Ll])emme ", " \\1em me "},
  588. {" ([Mm])ore'n ", " \\1ore 'n "},
  589. {" '([Tt])is ", " '\\1 is "},
  590. {" '([Tt])was ", " '\\1 was "},
  591. {" ([Ww])anna ", " \\1an na "},
  592. {" ([Ww])haddya ", " \\1ha dd ya "},
  593. {" ([Ww])hatcha ", " \\1ha t cha "},
  594. // clean out extra spaces
  595. {" *", " "},
  596. {"^ *", ""},
  597. };
  598. string rewritten = value;
  599. for (const std::pair<string, string> &rule : preproc_rules) {
  600. RE2::GlobalReplace(&rewritten, rule.first, rule.second);
  601. }
  602. for (const std::pair<string, string> &rule : rules) {
  603. RE2::GlobalReplace(&rewritten, rule.first, rule.second);
  604. }
  605. TokenizedTextFormat::ConvertFromString(key, rewritten, sentences);
  606. }
  607. private:
  608. TF_DISALLOW_COPY_AND_ASSIGN(EnglishTextFormat);
  609. };
  610. REGISTER_SYNTAXNET_DOCUMENT_FORMAT("english-text", EnglishTextFormat);
  611. // Converts double-newline-separated prototext records into sentences.
  612. class SentencePrototextFormat : public DocumentFormat {
  613. public:
  614. SentencePrototextFormat() {}
  615. bool ReadRecord(tensorflow::io::BufferedInputStream *buffer,
  616. string *record) override {
  617. return DoubleNewlineReadRecord(buffer, record);
  618. }
  619. void ConvertFromString(const string &key, const string &value,
  620. std::vector<Sentence *> *sentences) override {
  621. Sentence *sentence = new Sentence();
  622. CHECK(TextFormat::ParseFromString(value, sentence))
  623. << "Failed to parse " << value;
  624. sentences->push_back(sentence);
  625. }
  626. void ConvertToString(const Sentence &sentence, string *key,
  627. string *value) override {
  628. *key = sentence.docid();
  629. string as_prototext;
  630. CHECK(TextFormat::PrintToString(sentence, &as_prototext))
  631. << "Failed to sentence with ID " << (*key);
  632. *value = tensorflow::strings::StrCat(as_prototext, "\n\n");
  633. }
  634. };
  635. REGISTER_SYNTAXNET_DOCUMENT_FORMAT("sentence-prototext",
  636. SentencePrototextFormat);
  637. } // namespace syntaxnet