/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <memory>
#include <string>
#include <vector>

#include "syntaxnet/document_format.h"
#include "syntaxnet/segmenter_utils.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/utils.h"
#include "tensorflow/core/lib/io/inputbuffer.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/regexp.h"

namespace syntaxnet {

// CoNLL document format reader for dependency annotated corpora.
// The expected format is described e.g. at http://ilk.uvt.nl/conll/#dataformat
//
// Data should adhere to the following rules:
//   - Data files contain sentences separated by a blank line.
//   - A sentence consists of one or more tokens, each one starting on a new
//     line.
//   - A token consists of ten fields described in the table below.
//   - Fields are separated by a single tab character.
//   - All data files will contain these ten fields, although only the ID
//     column is required to contain non-dummy (i.e. non-underscore) values.
// Data files should be UTF-8 encoded (Unicode).
//
// Fields:
//   1  ID:      Token counter, starting at 1 for each new sentence and
//               increasing by 1 for every new token.
//   2  FORM:    Word form or punctuation symbol.
//   3  LEMMA:   Lemma or stem.
//   4  CPOSTAG: Coarse-grained part-of-speech tag or category.
//   5  POSTAG:  Fine-grained part-of-speech tag. Note that the same POS tag
//               cannot appear with multiple coarse-grained POS tags.
//   6  FEATS:   Unordered set of syntactic and/or morphological features.
//   7  HEAD:    Head of the current token, which is either a value of ID or
//               '0'.
//   8  DEPREL:  Dependency relation to the HEAD.
//   9  PHEAD:   Projective head of current token.
//  10  PDEPREL: Dependency relation to the PHEAD.
//
// This CoNLL reader is compatible with the CoNLL-U format described at
//   http://universaldependencies.org/format.html
// Note that this reader skips CoNLL-U multiword tokens and ignores the last
// two fields of every line, which are PHEAD and PDEPREL in CoNLL format, but
// are replaced by DEPS and MISC in CoNLL-U.
//
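// For illustration (an invented example, not from any corpus), a two-token
// sentence could be encoded as follows, with single tabs between fields:
//
//   1  This   _  DET   DT   _  2  nsubj  _  _
//   2  works  _  VERB  VBZ  _  0  root   _  _
//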
class CoNLLSyntaxFormat : public DocumentFormat {
 public:
  CoNLLSyntaxFormat() {}

  void Setup(TaskContext *context) override {
    join_category_to_pos_ = context->GetBoolParameter("join_category_to_pos");
    add_pos_as_attribute_ = context->GetBoolParameter("add_pos_as_attribute");
  }

  // Reads up to the first empty line and returns false if the end of file is
  // reached.
  bool ReadRecord(tensorflow::io::InputBuffer *buffer,
                  string *record) override {
    string line;
    record->clear();
    tensorflow::Status status = buffer->ReadLine(&line);
    while (!line.empty() && status.ok()) {
      tensorflow::strings::StrAppend(record, line, "\n");
      status = buffer->ReadLine(&line);
    }
    return status.ok() || !record->empty();
  }

  void ConvertFromString(const string &key, const string &value,
                         vector<Sentence *> *sentences) override {
    // Create new sentence.
    Sentence *sentence = new Sentence();

    // Each line corresponds to one token.
    string text;
    vector<string> lines = utils::Split(value, '\n');

    // Add each token to the sentence.
    vector<string> fields;
    int expected_id = 1;
    for (size_t i = 0; i < lines.size(); ++i) {
      // Split line into tab-separated fields.
      fields.clear();
      fields = utils::Split(lines[i], '\t');
      if (fields.empty()) continue;

      // Skip comment lines.
      if (fields[0][0] == '#') continue;

      // Skip CoNLL-U lines for multiword tokens, which are indicated by
      // hyphenated line numbers, e.g., "2-4".
      // http://universaldependencies.github.io/docs/format.html
      if (RE2::FullMatch(fields[0], "[0-9]+-[0-9]+")) continue;

      // Clear all optional fields equal to '_'.
      for (size_t j = 2; j < fields.size(); ++j) {
        if (fields[j].length() == 1 && fields[j][0] == '_') fields[j].clear();
      }

      // Check that the line is valid.
      CHECK_GE(fields.size(), 8)
          << "Every line has to have at least 8 tab separated fields.";

      // Check that the ids follow the expected format.
      const int id = utils::ParseUsing<int>(fields[0], 0, utils::ParseInt32);
      CHECK_EQ(expected_id++, id)
          << "Token ids start at 1 for each new sentence and increase by 1 "
          << "on each new token. Sentences are separated by an empty line.";

      // Get relevant fields.
      const string &word = fields[1];
      const string &cpostag = fields[3];
      const string &tag = fields[4];
      const string &attributes = fields[5];
      const int head = utils::ParseUsing<int>(fields[6], 0, utils::ParseInt32);
      const string &label = fields[7];

      // Add token to sentence text.
      if (!text.empty()) text.append(" ");
      const int start = text.size();
      const int end = start + word.size() - 1;
      text.append(word);

      // Add token to sentence.
      Token *token = sentence->add_token();
      token->set_word(word);
      token->set_start(start);
      token->set_end(end);
      if (head > 0) token->set_head(head - 1);
      if (!tag.empty()) token->set_tag(tag);
      if (!cpostag.empty()) token->set_category(cpostag);
      if (!label.empty()) token->set_label(label);
      if (!attributes.empty()) AddMorphAttributes(attributes, token);
      if (join_category_to_pos_) JoinCategoryToPos(token);
      if (add_pos_as_attribute_) AddPosAsAttribute(token);
    }

    if (sentence->token_size() > 0) {
      sentence->set_docid(key);
      sentence->set_text(text);
      sentences->push_back(sentence);
    } else {
      // If the sentence was empty (e.g., blank lines at the beginning of a
      // file), then don't save it.
      delete sentence;
    }
  }

  // Converts a sentence to a key/value pair.
  void ConvertToString(const Sentence &sentence, string *key,
                       string *value) override {
    *key = sentence.docid();
    vector<string> lines;
    for (int i = 0; i < sentence.token_size(); ++i) {
      Token token = sentence.token(i);
      if (join_category_to_pos_) SplitCategoryFromPos(&token);
      if (add_pos_as_attribute_) RemovePosFromAttributes(&token);
      vector<string> fields(10);
      fields[0] = tensorflow::strings::Printf("%d", i + 1);
      fields[1] = UnderscoreIfEmpty(token.word());
      fields[2] = "_";
      fields[3] = UnderscoreIfEmpty(token.category());
      fields[4] = UnderscoreIfEmpty(token.tag());
      fields[5] = GetMorphAttributes(token);
      fields[6] = tensorflow::strings::Printf("%d", token.head() + 1);
      fields[7] = UnderscoreIfEmpty(token.label());
      fields[8] = "_";
      fields[9] = "_";
      lines.push_back(utils::Join(fields, "\t"));
    }
    *value = tensorflow::strings::StrCat(utils::Join(lines, "\n"), "\n\n");
  }

 private:
  // Replaces empty fields with an underscore.
  string UnderscoreIfEmpty(const string &field) {
    return field.empty() ? "_" : field;
  }

  // Creates a TokenMorphology object out of a list of attribute values of the
  // form: a1=v1|a2=v2|... or v1|v2|...
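  // For example (illustrative values): "Case=Nom|Number=Sing" yields two
  // name/value attributes, while a bare "Nom|Sing" yields attributes named
  // "Nom" and "Sing", each stored with the placeholder value "on".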
  void AddMorphAttributes(const string &attributes, Token *token) {
    TokenMorphology *morph =
        token->MutableExtension(TokenMorphology::morphology);
    vector<string> att_vals = utils::Split(attributes, '|');
    for (int i = 0; i < att_vals.size(); ++i) {
      vector<string> att_val = utils::SplitOne(att_vals[i], '=');

      // Format is either:
      //   1) a1=v1|a2=v2..., e.g., Czech CoNLL data, or,
      //   2) v1|v2|..., e.g., German CoNLL data.
      const pair<string, string> name_value =
          att_val.size() == 2 ? std::make_pair(att_val[0], att_val[1])
                              : std::make_pair(att_val[0], "on");

      // We currently don't expect an empty attribute value, but might have an
      // empty attribute name due to data input errors.
      if (name_value.second.empty()) {
        LOG(WARNING) << "Invalid attributes string: " << attributes
                     << " for token: " << token->ShortDebugString();
        continue;
      }
      if (!name_value.first.empty()) {
        TokenMorphology::Attribute *attribute = morph->add_attribute();
        attribute->set_name(name_value.first);
        attribute->set_value(name_value.second);
      }
    }
  }

  // Creates a list of attribute values of the form a1=v1|a2=v2|... or
  // v1|v2|... from a TokenMorphology object.
  string GetMorphAttributes(const Token &token) {
    const TokenMorphology &morph =
        token.GetExtension(TokenMorphology::morphology);
    if (morph.attribute_size() == 0) return "_";
    string attributes;
    for (const TokenMorphology::Attribute &attribute : morph.attribute()) {
      if (!attributes.empty()) {
        tensorflow::strings::StrAppend(&attributes, "|");
      }
      tensorflow::strings::StrAppend(&attributes, attribute.name());
      if (attribute.value() != "on") {
        tensorflow::strings::StrAppend(&attributes, "=", attribute.value());
      }
    }
    return attributes;
  }

  void JoinCategoryToPos(Token *token) {
    token->set_tag(
        tensorflow::strings::StrCat(token->category(), "++", token->tag()));
    token->clear_category();
  }
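
  // For example (illustrative values), JoinCategoryToPos() turns category
  // "VERB" and tag "VBZ" into the single tag "VERB++VBZ";
  // SplitCategoryFromPos() below inverts this.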
  void SplitCategoryFromPos(Token *token) {
    const string &tag = token->tag();
    const size_t pos = tag.find("++");
    if (pos != string::npos) {
      token->set_category(tag.substr(0, pos));
      token->set_tag(tag.substr(pos + 2));
    }
  }

  void AddPosAsAttribute(Token *token) {
    if (!token->tag().empty()) {
      TokenMorphology *morph =
          token->MutableExtension(TokenMorphology::morphology);
      TokenMorphology::Attribute *attribute = morph->add_attribute();
      attribute->set_name("fPOS");
      attribute->set_value(token->tag());
    }
  }

  void RemovePosFromAttributes(Token *token) {
    // Assumes the "fPOS" attribute, if present, is the last one.
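    // This holds for tokens built by ConvertFromString() above, where
    // AddPosAsAttribute() appends fPOS after the FEATS attributes.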
    TokenMorphology *morph =
        token->MutableExtension(TokenMorphology::morphology);
    if (morph->attribute_size() > 0 &&
        morph->attribute().rbegin()->name() == "fPOS") {
      morph->mutable_attribute()->RemoveLast();
    }
  }

  bool join_category_to_pos_ = false;
  bool add_pos_as_attribute_ = false;

  TF_DISALLOW_COPY_AND_ASSIGN(CoNLLSyntaxFormat);
};
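
// The registered name is the string by which task specs select this format
// (typically the record_format field of an input in the task context file,
// in standard SyntaxNet setups).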
REGISTER_DOCUMENT_FORMAT("conll-sentence", CoNLLSyntaxFormat);

// Reader for tokenized text. This reader expects every sentence to be on a
// single line and tokens on that line to be separated by single spaces.
//
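// For example (our illustration), the line "John hit the ball ." becomes a
// five-token sentence. Note that ConvertToString() below emits "word_tag_head"
// for tokens that carry a tag and/or head, so writing is not an exact inverse
// of reading.
//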
class TokenizedTextFormat : public DocumentFormat {
 public:
  TokenizedTextFormat() {}

  // Reads a line and returns false if end of file is reached.
  bool ReadRecord(tensorflow::io::InputBuffer *buffer,
                  string *record) override {
    return buffer->ReadLine(record).ok();
  }

  void ConvertFromString(const string &key, const string &value,
                         vector<Sentence *> *sentences) override {
    Sentence *sentence = new Sentence();
    string text;
    for (const string &word : utils::Split(value, ' ')) {
      if (word.empty()) continue;

      // Add the token to the sentence text and record its byte span. The
      // separating space is appended before the span is computed so that
      // start/end index into the final text.
      if (!text.empty()) text.append(" ");
      const int start = text.size();
      const int end = start + word.size() - 1;
      text.append(word);

      Token *token = sentence->add_token();
      token->set_word(word);
      token->set_start(start);
      token->set_end(end);
    }

    if (sentence->token_size() > 0) {
      sentence->set_docid(key);
      sentence->set_text(text);
      sentences->push_back(sentence);
    } else {
      // If the sentence was empty (e.g., blank lines at the beginning of a
      // file), then don't save it.
      delete sentence;
    }
  }

  void ConvertToString(const Sentence &sentence, string *key,
                       string *value) override {
    *key = sentence.docid();
    value->clear();
    for (const Token &token : sentence.token()) {
      if (!value->empty()) value->append(" ");
      value->append(token.word());
      if (token.has_tag()) {
        value->append("_");
        value->append(token.tag());
      }
      if (token.has_head()) {
        value->append("_");
        value->append(tensorflow::strings::StrCat(token.head()));
      }
    }
    value->append("\n");
  }

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(TokenizedTextFormat);
};

REGISTER_DOCUMENT_FORMAT("tokenized-text", TokenizedTextFormat);

// Reader for untokenized text. This reader expects every sentence to be on a
// single line. For each line in the input, a sentence proto will be created,
// where the tokens are the UTF-8 characters of that line.
//
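// For example (our illustration), the line "dog!" yields four single-character
// tokens, while multi-byte UTF-8 characters such as Chinese are kept intact as
// one token per character, with start/end recording byte offsets.
//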
class UntokenizedTextFormat : public TokenizedTextFormat {
 public:
  UntokenizedTextFormat() {}

  void ConvertFromString(const string &key, const string &value,
                         vector<Sentence *> *sentences) override {
    Sentence *sentence = new Sentence();
    vector<tensorflow::StringPiece> chars;
    SegmenterUtils::GetUTF8Chars(value, &chars);
    int start = 0;
    for (auto utf8char : chars) {
      Token *token = sentence->add_token();
      token->set_word(utf8char.ToString());
      token->set_start(start);
      start += utf8char.size();
      token->set_end(start - 1);
    }

    if (sentence->token_size() > 0) {
      sentence->set_docid(key);
      sentence->set_text(value);
      sentences->push_back(sentence);
    } else {
      // If the sentence was empty (e.g., blank lines at the beginning of a
      // file), then don't save it.
      delete sentence;
    }
  }

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(UntokenizedTextFormat);
};

REGISTER_DOCUMENT_FORMAT("untokenized-text", UntokenizedTextFormat);

// Text reader that attempts to perform Penn Treebank tokenization on arbitrary
// raw text. Adapted from https://www.cis.upenn.edu/~treebank/tokenizer.sed
// by Robert MacIntyre, University of Pennsylvania, late 1995.
// Expected input: raw text with one sentence per line.
//
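// For example, the line "I can't pay $50." (our illustration, not from the
// original script) comes out roughly as:
//   I ca n't pay $ 50 .
// following the Penn Treebank convention of splitting contractions at "n't"
// and detaching punctuation and currency symbols.
//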
class EnglishTextFormat : public TokenizedTextFormat {
 public:
  EnglishTextFormat() {}

  void ConvertFromString(const string &key, const string &value,
                         vector<Sentence *> *sentences) override {
    vector<pair<string, string>> preproc_rules = {
        // Punctuation.
        {"’", "'"},
        {"…", "..."},
        {"---", "--"},
        {"—", "--"},
        {"–", "--"},
        {"，", ","},
        {"。", "."},
        {"！", "!"},
        {"？", "?"},
        {"：", ":"},
        {"；", ";"},
        {"＆", "&"},

        // Brackets.
        {"\\[", "("},
        {"\\]", ")"},
        {"{", "("},
        {"}", ")"},
        {"【", "("},
        {"】", ")"},
        {"（", "("},
        {"）", ")"},

        // Quotation marks.
        {"＂", "\""},
        {"″", "\""},
        {"“", "\""},
        {"„", "\""},
        {"‵‵", "\""},
        {"”", "\""},
        {"’’", "\""},
        {"‘", "\""},
        {"′′", "\""},
        {"‹", "\""},
        {"›", "\""},
        {"«", "\""},
        {"»", "\""},

        // Discarded punctuation that breaks sentences.
        {"\\|", ""},
        {"·", ""},
        {"•", ""},
        {"●", ""},
        {"▪", ""},
        {"■", ""},
        {"□", ""},
        {"❑", ""},
        {"◆", ""},
        {"★", ""},
        {"\\*", ""},
        {"♦", ""},
    };

    vector<pair<string, string>> rules = {
        // attempt to get correct directional quotes
        {R"re(^")re", "`` "},
        {R"re(([ \([{<])")re", "\\1 `` "},
        // close quotes handled at end

        {R"re(\.\.\.)re", " ... "},
        {"[,;:@#$%&]", " \\0 "},

        // Assume sentence tokenization has been done first, so split FINAL
        // periods only.
        {R"re(([^.])(\.)([\]\)}>"']*)[ ]*$)re", "\\1 \\2\\3 "},

        // however, we may as well split ALL question marks and exclamation
        // points, since they shouldn't have the abbrev.-marker ambiguity
        // problem
        {"[?!]", " \\0 "},

        // parentheses, brackets, etc.
        {R"re([\]\[\(\){}<>])re", " \\0 "},

        // Like Adwait Ratnaparkhi's MXPOST, we use the parsed-file version of
        // these symbols.
        {"\\(", "-LRB-"},
        {"\\)", "-RRB-"},
        {"\\[", "-LSB-"},
        {"\\]", "-RSB-"},
        {"{", "-LCB-"},
        {"}", "-RCB-"},

        {"--", " -- "},

        // First off, add a space to the beginning and end of each line, to
        // reduce necessary number of regexps.
        {"$", " "},
        {"^", " "},

        {"\"", " '' "},
        // possessive or close-single-quote
        {"([^'])' ", "\\1 ' "},
        // as in it's, I'm, we'd
        {"'([sSmMdD]) ", " '\\1 "},
        {"'ll ", " 'll "},
        {"'re ", " 're "},
        {"'ve ", " 've "},
        {"n't ", " n't "},
        {"'LL ", " 'LL "},
        {"'RE ", " 'RE "},
        {"'VE ", " 'VE "},
        {"N'T ", " N'T "},
        {" ([Cc])annot ", " \\1an not "},
        {" ([Dd])'ye ", " \\1' ye "},
        {" ([Gg])imme ", " \\1im me "},
        {" ([Gg])onna ", " \\1on na "},
        {" ([Gg])otta ", " \\1ot ta "},
        {" ([Ll])emme ", " \\1em me "},
        {" ([Mm])ore'n ", " \\1ore 'n "},
        {" '([Tt])is ", " '\\1 is "},
        {" '([Tt])was ", " '\\1 was "},
        {" ([Ww])anna ", " \\1an na "},
        {" ([Ww])haddya ", " \\1ha dd ya "},
        {" ([Ww])hatcha ", " \\1ha t cha "},

        // clean out extra spaces
        {"  *", " "},
        {"^ *", ""},
    };

    string rewritten = value;
    for (const pair<string, string> &rule : preproc_rules) {
      RE2::GlobalReplace(&rewritten, rule.first, rule.second);
    }
    for (const pair<string, string> &rule : rules) {
      RE2::GlobalReplace(&rewritten, rule.first, rule.second);
    }
    TokenizedTextFormat::ConvertFromString(key, rewritten, sentences);
  }

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(EnglishTextFormat);
};

REGISTER_DOCUMENT_FORMAT("english-text", EnglishTextFormat);

}  // namespace syntaxnet