| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542 |
- /* Copyright 2016 Google Inc. All Rights Reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==============================================================================*/
- #include <memory>
- #include <string>
- #include <vector>
- #include "syntaxnet/document_format.h"
- #include "syntaxnet/sentence.pb.h"
- #include "syntaxnet/segmenter_utils.h"
- #include "syntaxnet/utils.h"
- #include "tensorflow/core/lib/io/inputbuffer.h"
- #include "tensorflow/core/lib/strings/strcat.h"
- #include "tensorflow/core/lib/strings/stringprintf.h"
- #include "tensorflow/core/platform/regexp.h"
- namespace syntaxnet {
- // CoNLL document format reader for dependency annotated corpora.
- // The expected format is described e.g. at http://ilk.uvt.nl/conll/#dataformat
- //
// Data should adhere to the following rules:
// - Data files contain sentences separated by a blank line.
// - A sentence consists of one or more tokens, each one starting on a new
//   line.
// - A token consists of ten fields described in the table below.
// - Fields are separated by a single tab character.
// - All data files will contain these ten fields, although only the ID
//   column is required to contain non-dummy (i.e. non-underscore) values.
// Data files should be UTF-8 encoded (Unicode).
- //
- // Fields:
- // 1 ID: Token counter, starting at 1 for each new sentence and increasing
- // by 1 for every new token.
- // 2 FORM: Word form or punctuation symbol.
- // 3 LEMMA: Lemma or stem.
- // 4 CPOSTAG: Coarse-grained part-of-speech tag or category.
- // 5 POSTAG: Fine-grained part-of-speech tag. Note that the same POS tag
- // cannot appear with multiple coarse-grained POS tags.
- // 6 FEATS: Unordered set of syntactic and/or morphological features.
- // 7 HEAD: Head of the current token, which is either a value of ID or '0'.
- // 8 DEPREL: Dependency relation to the HEAD.
- // 9 PHEAD: Projective head of current token.
- // 10 PDEPREL: Dependency relation to the PHEAD.
- //
- // This CoNLL reader is compatible with the CoNLL-U format described at
- // http://universaldependencies.org/format.html
- // Note that this reader skips CoNLL-U multiword tokens and ignores the last two
- // fields of every line, which are PHEAD and PDEPREL in CoNLL format, but are
- // replaced by DEPS and MISC in CoNLL-U.
- //
- class CoNLLSyntaxFormat : public DocumentFormat {
- public:
- CoNLLSyntaxFormat() {}
- void Setup(TaskContext *context) override {
- join_category_to_pos_ = context->GetBoolParameter("join_category_to_pos");
- add_pos_as_attribute_ = context->GetBoolParameter("add_pos_as_attribute");
- }
- // Reads up to the first empty line and returns false end of file is reached.
- bool ReadRecord(tensorflow::io::InputBuffer *buffer,
- string *record) override {
- string line;
- record->clear();
- tensorflow::Status status = buffer->ReadLine(&line);
- while (!line.empty() && status.ok()) {
- tensorflow::strings::StrAppend(record, line, "\n");
- status = buffer->ReadLine(&line);
- }
- return status.ok() || !record->empty();
- }
- void ConvertFromString(const string &key, const string &value,
- vector<Sentence *> *sentences) override {
- // Create new sentence.
- Sentence *sentence = new Sentence();
- // Each line corresponds to one token.
- string text;
- vector<string> lines = utils::Split(value, '\n');
- // Add each token to the sentence.
- vector<string> fields;
- int expected_id = 1;
- for (size_t i = 0; i < lines.size(); ++i) {
- // Split line into tab-separated fields.
- fields.clear();
- fields = utils::Split(lines[i], '\t');
- if (fields.empty()) continue;
- // Skip comment lines.
- if (fields[0][0] == '#') continue;
- // Skip CoNLLU lines for multiword tokens which are indicated by
- // hyphenated line numbers, e.g., "2-4".
- // http://universaldependencies.github.io/docs/format.html
- if (RE2::FullMatch(fields[0], "[0-9]+-[0-9]+")) continue;
- // Clear all optional fields equal to '_'.
- for (size_t j = 2; j < fields.size(); ++j) {
- if (fields[j].length() == 1 && fields[j][0] == '_') fields[j].clear();
- }
- // Check that the line is valid.
- CHECK_GE(fields.size(), 8)
- << "Every line has to have at least 8 tab separated fields.";
- // Check that the ids follow the expected format.
- const int id = utils::ParseUsing<int>(fields[0], 0, utils::ParseInt32);
- CHECK_EQ(expected_id++, id)
- << "Token ids start at 1 for each new sentence and increase by 1 "
- << "on each new token. Sentences are separated by an empty line.";
- // Get relevant fields.
- const string &word = fields[1];
- const string &cpostag = fields[3];
- const string &tag = fields[4];
- const string &attributes = fields[5];
- const int head = utils::ParseUsing<int>(fields[6], 0, utils::ParseInt32);
- const string &label = fields[7];
- // Add token to sentence text.
- if (!text.empty()) text.append(" ");
- const int start = text.size();
- const int end = start + word.size() - 1;
- text.append(word);
- // Add token to sentence.
- Token *token = sentence->add_token();
- token->set_word(word);
- token->set_start(start);
- token->set_end(end);
- if (head > 0) token->set_head(head - 1);
- if (!tag.empty()) token->set_tag(tag);
- if (!cpostag.empty()) token->set_category(cpostag);
- if (!label.empty()) token->set_label(label);
- if (!attributes.empty()) AddMorphAttributes(attributes, token);
- if (join_category_to_pos_) JoinCategoryToPos(token);
- if (add_pos_as_attribute_) AddPosAsAttribute(token);
- }
- if (sentence->token_size() > 0) {
- sentence->set_docid(key);
- sentence->set_text(text);
- sentences->push_back(sentence);
- } else {
- // If the sentence was empty (e.g., blank lines at the beginning of a
- // file), then don't save it.
- delete sentence;
- }
- }
- // Converts a sentence to a key/value pair.
- void ConvertToString(const Sentence &sentence, string *key,
- string *value) override {
- *key = sentence.docid();
- vector<string> lines;
- for (int i = 0; i < sentence.token_size(); ++i) {
- Token token = sentence.token(i);
- if (join_category_to_pos_) SplitCategoryFromPos(&token);
- if (add_pos_as_attribute_) RemovePosFromAttributes(&token);
- vector<string> fields(10);
- fields[0] = tensorflow::strings::Printf("%d", i + 1);
- fields[1] = UnderscoreIfEmpty(token.word());
- fields[2] = "_";
- fields[3] = UnderscoreIfEmpty(token.category());
- fields[4] = UnderscoreIfEmpty(token.tag());
- fields[5] = GetMorphAttributes(token);
- fields[6] = tensorflow::strings::Printf("%d", token.head() + 1);
- fields[7] = UnderscoreIfEmpty(token.label());
- fields[8] = "_";
- fields[9] = "_";
- lines.push_back(utils::Join(fields, "\t"));
- }
- *value = tensorflow::strings::StrCat(utils::Join(lines, "\n"), "\n\n");
- }
- private:
- // Replaces empty fields with an undescore.
- string UnderscoreIfEmpty(const string &field) {
- return field.empty() ? "_" : field;
- }
- // Creates a TokenMorphology object out of a list of attribute values of the
- // form: a1=v1|a2=v2|... or v1|v2|...
- void AddMorphAttributes(const string &attributes, Token *token) {
- TokenMorphology *morph =
- token->MutableExtension(TokenMorphology::morphology);
- vector<string> att_vals = utils::Split(attributes, '|');
- for (int i = 0; i < att_vals.size(); ++i) {
- vector<string> att_val = utils::SplitOne(att_vals[i], '=');
- // Format is either:
- // 1) a1=v1|a2=v2..., e.g., Czech CoNLL data, or,
- // 2) v1|v2|..., e.g., German CoNLL data.
- const pair<string, string> name_value =
- att_val.size() == 2 ? std::make_pair(att_val[0], att_val[1])
- : std::make_pair(att_val[0], "on");
- // We currently don't expect an empty attribute value, but might have an
- // empty attribute name due to data input errors.
- if (name_value.second.empty()) {
- LOG(WARNING) << "Invalid attributes string: " << attributes
- << " for token: " << token->ShortDebugString();
- continue;
- }
- if (!name_value.first.empty()) {
- TokenMorphology::Attribute *attribute = morph->add_attribute();
- attribute->set_name(name_value.first);
- attribute->set_value(name_value.second);
- }
- }
- }
- // Creates a list of attribute values of the form a1=v1|a2=v2|... or v1|v2|...
- // from a TokenMorphology object.
- string GetMorphAttributes(const Token &token) {
- const TokenMorphology &morph =
- token.GetExtension(TokenMorphology::morphology);
- if (morph.attribute_size() == 0) return "_";
- string attributes;
- for (const TokenMorphology::Attribute &attribute : morph.attribute()) {
- if (!attributes.empty()) tensorflow::strings::StrAppend(&attributes, "|");
- tensorflow::strings::StrAppend(&attributes, attribute.name());
- if (attribute.value() != "on") {
- tensorflow::strings::StrAppend(&attributes, "=", attribute.value());
- }
- }
- return attributes;
- }
- void JoinCategoryToPos(Token *token) {
- token->set_tag(
- tensorflow::strings::StrCat(token->category(), "++", token->tag()));
- token->clear_category();
- }
- void SplitCategoryFromPos(Token *token) {
- const string &tag = token->tag();
- const size_t pos = tag.find("++");
- if (pos != string::npos) {
- token->set_category(tag.substr(0, pos));
- token->set_tag(tag.substr(pos + 2));
- }
- }
- void AddPosAsAttribute(Token *token) {
- if (!token->tag().empty()) {
- TokenMorphology *morph =
- token->MutableExtension(TokenMorphology::morphology);
- TokenMorphology::Attribute *attribute = morph->add_attribute();
- attribute->set_name("fPOS");
- attribute->set_value(token->tag());
- }
- }
- void RemovePosFromAttributes(Token *token) {
- // Assumes the "fPOS" attribute, if present, is the last one.
- TokenMorphology *morph =
- token->MutableExtension(TokenMorphology::morphology);
- if (morph->attribute_size() > 0 &&
- morph->attribute().rbegin()->name() == "fPOS") {
- morph->mutable_attribute()->RemoveLast();
- }
- }
- bool join_category_to_pos_ = false;
- bool add_pos_as_attribute_ = false;
- TF_DISALLOW_COPY_AND_ASSIGN(CoNLLSyntaxFormat);
- };
- REGISTER_DOCUMENT_FORMAT("conll-sentence", CoNLLSyntaxFormat);
- // Reader for tokenized text. This reader expects every sentence to be on a
- // single line and tokens on that line to be separated by single spaces.
- //
- class TokenizedTextFormat : public DocumentFormat {
- public:
- TokenizedTextFormat() {}
- // Reads a line and returns false if end of file is reached.
- bool ReadRecord(tensorflow::io::InputBuffer *buffer,
- string *record) override {
- return buffer->ReadLine(record).ok();
- }
- void ConvertFromString(const string &key, const string &value,
- vector<Sentence *> *sentences) override {
- Sentence *sentence = new Sentence();
- string text;
- for (const string &word : utils::Split(value, ' ')) {
- if (word.empty()) continue;
- const int start = text.size();
- const int end = start + word.size() - 1;
- if (!text.empty()) text.append(" ");
- text.append(word);
- Token *token = sentence->add_token();
- token->set_word(word);
- token->set_start(start);
- token->set_end(end);
- }
- if (sentence->token_size() > 0) {
- sentence->set_docid(key);
- sentence->set_text(text);
- sentences->push_back(sentence);
- } else {
- // If the sentence was empty (e.g., blank lines at the beginning of a
- // file), then don't save it.
- delete sentence;
- }
- }
- void ConvertToString(const Sentence &sentence, string *key,
- string *value) override {
- *key = sentence.docid();
- value->clear();
- for (const Token &token : sentence.token()) {
- if (!value->empty()) value->append(" ");
- value->append(token.word());
- if (token.has_tag()) {
- value->append("_");
- value->append(token.tag());
- }
- if (token.has_head()) {
- value->append("_");
- value->append(tensorflow::strings::StrCat(token.head()));
- }
- }
- value->append("\n");
- }
- private:
- TF_DISALLOW_COPY_AND_ASSIGN(TokenizedTextFormat);
- };
- REGISTER_DOCUMENT_FORMAT("tokenized-text", TokenizedTextFormat);
- // Reader for un-tokenized text. This reader expects every sentence to be on a
- // single line. For each line in the input, a sentence proto will be created,
- // where tokens are utf8 characters of that line.
- //
- class UntokenizedTextFormat : public TokenizedTextFormat {
- public:
- UntokenizedTextFormat() {}
- void ConvertFromString(const string &key, const string &value,
- vector<Sentence *> *sentences) override {
- Sentence *sentence = new Sentence();
- vector<tensorflow::StringPiece> chars;
- SegmenterUtils::GetUTF8Chars(value, &chars);
- int start = 0;
- for (auto utf8char : chars) {
- Token *token = sentence->add_token();
- token->set_word(utf8char.ToString());
- token->set_start(start);
- start += utf8char.size();
- token->set_end(start - 1);
- }
- if (sentence->token_size() > 0) {
- sentence->set_docid(key);
- sentence->set_text(value);
- sentences->push_back(sentence);
- } else {
- // If the sentence was empty (e.g., blank lines at the beginning of a
- // file), then don't save it.
- delete sentence;
- }
- }
- private:
- TF_DISALLOW_COPY_AND_ASSIGN(UntokenizedTextFormat);
- };
- REGISTER_DOCUMENT_FORMAT("untokenized-text", UntokenizedTextFormat);
// Text reader that attempts to perform Penn Treebank tokenization on arbitrary
// raw text. Adapted from https://www.cis.upenn.edu/~treebank/tokenizer.sed
// by Robert MacIntyre, University of Pennsylvania, late 1995.
// Expected input: raw text with one sentence per line.
- //
- class EnglishTextFormat : public TokenizedTextFormat {
- public:
- EnglishTextFormat() {}
- void ConvertFromString(const string &key, const string &value,
- vector<Sentence *> *sentences) override {
- vector<pair<string, string>> preproc_rules = {
- // Punctuation.
- {"’", "'"},
- {"…", "..."},
- {"---", "--"},
- {"—", "--"},
- {"–", "--"},
- {",", ","},
- {"。", "."},
- {"!", "!"},
- {"?", "?"},
- {":", ":"},
- {";", ";"},
- {"&", "&"},
- // Brackets.
- {"\\[", "("},
- {"]", ")"},
- {"{", "("},
- {"}", ")"},
- {"【", "("},
- {"】", ")"},
- {"(", "("},
- {")", ")"},
- // Quotation marks.
- {"\"", "\""},
- {"″", "\""},
- {"“", "\""},
- {"„", "\""},
- {"‵‵", "\""},
- {"”", "\""},
- {"’", "\""},
- {"‘", "\""},
- {"′′", "\""},
- {"‹", "\""},
- {"›", "\""},
- {"«", "\""},
- {"»", "\""},
- // Discarded punctuation that breaks sentences.
- {"|", ""},
- {"·", ""},
- {"•", ""},
- {"●", ""},
- {"▪", ""},
- {"■", ""},
- {"□", ""},
- {"❑", ""},
- {"◆", ""},
- {"★", ""},
- {"*", ""},
- {"♦", ""},
- };
- vector<pair<string, string>> rules = {
- // attempt to get correct directional quotes
- {R"re(^")re", "`` "},
- {R"re(([ \([{<])")re", "\\1 `` "},
- // close quotes handled at end
- {R"re(\.\.\.)re", " ... "},
- {"[,;:@#$%&]", " \\0 "},
- // Assume sentence tokenization has been done first, so split FINAL
- // periods only.
- {R"re(([^.])(\.)([\]\)}>"']*)[ ]*$)re", "\\1 \\2\\3 "},
- // however, we may as well split ALL question marks and exclamation
- // points, since they shouldn't have the abbrev.-marker ambiguity
- // problem
- {"[?!]", " \\0 "},
- // parentheses, brackets, etc.
- {R"re([\]\[\(\){}<>])re", " \\0 "},
- // Like Adwait Ratnaparkhi's MXPOST, we use the parsed-file version of
- // these symbols.
- {"\\(", "-LRB-"},
- {"\\)", "-RRB-"},
- {"\\]", "-LSB-"},
- {"\\]", "-RSB-"},
- {"{", "-LCB-"},
- {"}", "-RCB-"},
- {"--", " -- "},
- // First off, add a space to the beginning and end of each line, to
- // reduce necessary number of regexps.
- {"$", " "},
- {"^", " "},
- {"\"", " '' "},
- // possessive or close-single-quote
- {"([^'])' ", "\\1 ' "},
- // as in it's, I'm, we'd
- {"'([sSmMdD]) ", " '\\1 "},
- {"'ll ", " 'll "},
- {"'re ", " 're "},
- {"'ve ", " 've "},
- {"n't ", " n't "},
- {"'LL ", " 'LL "},
- {"'RE ", " 'RE "},
- {"'VE ", " 'VE "},
- {"N'T ", " N'T "},
- {" ([Cc])annot ", " \\1an not "},
- {" ([Dd])'ye ", " \\1' ye "},
- {" ([Gg])imme ", " \\1im me "},
- {" ([Gg])onna ", " \\1on na "},
- {" ([Gg])otta ", " \\1ot ta "},
- {" ([Ll])emme ", " \\1em me "},
- {" ([Mm])ore'n ", " \\1ore 'n "},
- {" '([Tt])is ", " '\\1 is "},
- {" '([Tt])was ", " '\\1 was "},
- {" ([Ww])anna ", " \\1an na "},
- {" ([Ww])haddya ", " \\1ha dd ya "},
- {" ([Ww])hatcha ", " \\1ha t cha "},
- // clean out extra spaces
- {" *", " "},
- {"^ *", ""},
- };
- string rewritten = value;
- for (const pair<string, string> &rule : preproc_rules) {
- RE2::GlobalReplace(&rewritten, rule.first, rule.second);
- }
- for (const pair<string, string> &rule : rules) {
- RE2::GlobalReplace(&rewritten, rule.first, rule.second);
- }
- TokenizedTextFormat::ConvertFromString(key, rewritten, sentences);
- }
- private:
- TF_DISALLOW_COPY_AND_ASSIGN(EnglishTextFormat);
- };
- REGISTER_DOCUMENT_FORMAT("english-text", EnglishTextFormat);
- } // namespace syntaxnet
|