|
@@ -83,16 +83,16 @@ class CoNLLSyntaxFormat : public DocumentFormat {
|
|
|
}
|
|
|
|
|
|
void ConvertFromString(const string &key, const string &value,
|
|
|
- vector<Sentence *> *sentences) override {
|
|
|
+ std::vector<Sentence *> *sentences) override {
|
|
|
// Create new sentence.
|
|
|
Sentence *sentence = new Sentence();
|
|
|
|
|
|
// Each line corresponds to one token.
|
|
|
string text;
|
|
|
- vector<string> lines = utils::Split(value, '\n');
|
|
|
+ std::vector<string> lines = utils::Split(value, '\n');
|
|
|
|
|
|
// Add each token to the sentence.
|
|
|
- vector<string> fields;
|
|
|
+ std::vector<string> fields;
|
|
|
int expected_id = 1;
|
|
|
for (size_t i = 0; i < lines.size(); ++i) {
|
|
|
// Split line into tab-separated fields.
|
|
@@ -166,12 +166,12 @@ class CoNLLSyntaxFormat : public DocumentFormat {
|
|
|
void ConvertToString(const Sentence &sentence, string *key,
|
|
|
string *value) override {
|
|
|
*key = sentence.docid();
|
|
|
- vector<string> lines;
|
|
|
+ std::vector<string> lines;
|
|
|
for (int i = 0; i < sentence.token_size(); ++i) {
|
|
|
Token token = sentence.token(i);
|
|
|
if (join_category_to_pos_) SplitCategoryFromPos(&token);
|
|
|
if (add_pos_as_attribute_) RemovePosFromAttributes(&token);
|
|
|
- vector<string> fields(10);
|
|
|
+ std::vector<string> fields(10);
|
|
|
fields[0] = tensorflow::strings::Printf("%d", i + 1);
|
|
|
fields[1] = UnderscoreIfEmpty(token.word());
|
|
|
fields[2] = "_";
|
|
@@ -198,14 +198,14 @@ class CoNLLSyntaxFormat : public DocumentFormat {
|
|
|
void AddMorphAttributes(const string &attributes, Token *token) {
|
|
|
TokenMorphology *morph =
|
|
|
token->MutableExtension(TokenMorphology::morphology);
|
|
|
- vector<string> att_vals = utils::Split(attributes, '|');
|
|
|
+ std::vector<string> att_vals = utils::Split(attributes, '|');
|
|
|
for (int i = 0; i < att_vals.size(); ++i) {
|
|
|
- vector<string> att_val = utils::SplitOne(att_vals[i], '=');
|
|
|
+ std::vector<string> att_val = utils::SplitOne(att_vals[i], '=');
|
|
|
|
|
|
// Format is either:
|
|
|
// 1) a1=v1|a2=v2..., e.g., Czech CoNLL data, or,
|
|
|
// 2) v1|v2|..., e.g., German CoNLL data.
|
|
|
- const pair<string, string> name_value =
|
|
|
+ const std::pair<string, string> name_value =
|
|
|
att_val.size() == 2 ? std::make_pair(att_val[0], att_val[1])
|
|
|
: std::make_pair(att_val[0], "on");
|
|
|
|
|
@@ -282,7 +282,7 @@ class CoNLLSyntaxFormat : public DocumentFormat {
|
|
|
TF_DISALLOW_COPY_AND_ASSIGN(CoNLLSyntaxFormat);
|
|
|
};
|
|
|
|
|
|
-REGISTER_DOCUMENT_FORMAT("conll-sentence", CoNLLSyntaxFormat);
|
|
|
+REGISTER_SYNTAXNET_DOCUMENT_FORMAT("conll-sentence", CoNLLSyntaxFormat);
|
|
|
|
|
|
// Reader for segmentation training data format. This reader assumes the input
|
|
|
// format is similar to CoNLL format but with only two fields:
|
|
@@ -325,16 +325,16 @@ class SegmentationTrainingDataFormat : public CoNLLSyntaxFormat {
|
|
|
// to SPACE_BREAK to indicate that the corresponding gold transition for that
|
|
|
// character token is START. Otherwise NO_BREAK to indicate MERGE.
|
|
|
void ConvertFromString(const string &key, const string &value,
|
|
|
- vector<Sentence *> *sentences) override {
|
|
|
+ std::vector<Sentence *> *sentences) override {
|
|
|
// Create new sentence.
|
|
|
Sentence *sentence = new Sentence();
|
|
|
|
|
|
// Each line corresponds to one token.
|
|
|
string text;
|
|
|
- vector<string> lines = utils::Split(value, '\n');
|
|
|
+ std::vector<string> lines = utils::Split(value, '\n');
|
|
|
|
|
|
// Add each token to the sentence.
|
|
|
- vector<string> fields;
|
|
|
+ std::vector<string> fields;
|
|
|
for (size_t i = 0; i < lines.size(); ++i) {
|
|
|
// Split line into tab-separated fields.
|
|
|
fields.clear();
|
|
@@ -362,7 +362,7 @@ class SegmentationTrainingDataFormat : public CoNLLSyntaxFormat {
|
|
|
}
|
|
|
|
|
|
// Add character-based token to sentence.
|
|
|
- vector<tensorflow::StringPiece> chars;
|
|
|
+ std::vector<tensorflow::StringPiece> chars;
|
|
|
SegmenterUtils::GetUTF8Chars(word, &chars);
|
|
|
bool is_first_char = true;
|
|
|
for (auto utf8char : chars) {
|
|
@@ -398,7 +398,8 @@ class SegmentationTrainingDataFormat : public CoNLLSyntaxFormat {
|
|
|
}
|
|
|
};
|
|
|
|
|
|
-REGISTER_DOCUMENT_FORMAT("segment-train-data", SegmentationTrainingDataFormat);
|
|
|
+REGISTER_SYNTAXNET_DOCUMENT_FORMAT("segment-train-data",
|
|
|
+ SegmentationTrainingDataFormat);
|
|
|
|
|
|
// Reader for tokenized text. This reader expects every sentence to be on a
|
|
|
// single line and tokens on that line to be separated by single spaces.
|
|
@@ -414,7 +415,7 @@ class TokenizedTextFormat : public DocumentFormat {
|
|
|
}
|
|
|
|
|
|
void ConvertFromString(const string &key, const string &value,
|
|
|
- vector<Sentence *> *sentences) override {
|
|
|
+ std::vector<Sentence *> *sentences) override {
|
|
|
Sentence *sentence = new Sentence();
|
|
|
string text;
|
|
|
for (const string &word : utils::Split(value, ' ')) {
|
|
@@ -463,7 +464,7 @@ class TokenizedTextFormat : public DocumentFormat {
|
|
|
TF_DISALLOW_COPY_AND_ASSIGN(TokenizedTextFormat);
|
|
|
};
|
|
|
|
|
|
-REGISTER_DOCUMENT_FORMAT("tokenized-text", TokenizedTextFormat);
|
|
|
+REGISTER_SYNTAXNET_DOCUMENT_FORMAT("tokenized-text", TokenizedTextFormat);
|
|
|
|
|
|
// Reader for un-tokenized text. This reader expects every sentence to be on a
|
|
|
// single line. For each line in the input, a sentence proto will be created,
|
|
@@ -474,9 +475,9 @@ class UntokenizedTextFormat : public TokenizedTextFormat {
|
|
|
UntokenizedTextFormat() {}
|
|
|
|
|
|
void ConvertFromString(const string &key, const string &value,
|
|
|
- vector<Sentence *> *sentences) override {
|
|
|
+ std::vector<Sentence *> *sentences) override {
|
|
|
Sentence *sentence = new Sentence();
|
|
|
- vector<tensorflow::StringPiece> chars;
|
|
|
+ std::vector<tensorflow::StringPiece> chars;
|
|
|
SegmenterUtils::GetUTF8Chars(value, &chars);
|
|
|
int start = 0;
|
|
|
for (auto utf8char : chars) {
|
|
@@ -502,7 +503,7 @@ class UntokenizedTextFormat : public TokenizedTextFormat {
|
|
|
TF_DISALLOW_COPY_AND_ASSIGN(UntokenizedTextFormat);
|
|
|
};
|
|
|
|
|
|
-REGISTER_DOCUMENT_FORMAT("untokenized-text", UntokenizedTextFormat);
|
|
|
+REGISTER_SYNTAXNET_DOCUMENT_FORMAT("untokenized-text", UntokenizedTextFormat);
|
|
|
|
|
|
// Text reader that attempts to perform Penn Treebank tokenization on arbitrary
|
|
|
// raw text. Adapted from https://www.cis.upenn.edu/~treebank/tokenizer.sed
|
|
@@ -514,8 +515,8 @@ class EnglishTextFormat : public TokenizedTextFormat {
|
|
|
EnglishTextFormat() {}
|
|
|
|
|
|
void ConvertFromString(const string &key, const string &value,
|
|
|
- vector<Sentence *> *sentences) override {
|
|
|
- vector<pair<string, string>> preproc_rules = {
|
|
|
+ std::vector<Sentence *> *sentences) override {
|
|
|
+ std::vector<std::pair<string, string>> preproc_rules = {
|
|
|
// Punctuation.
|
|
|
{"’", "'"},
|
|
|
{"…", "..."},
|
|
@@ -570,7 +571,7 @@ class EnglishTextFormat : public TokenizedTextFormat {
|
|
|
{"♦", ""},
|
|
|
};
|
|
|
|
|
|
- vector<pair<string, string>> rules = {
|
|
|
+ std::vector<std::pair<string, string>> rules = {
|
|
|
// attempt to get correct directional quotes
|
|
|
{R"re(^")re", "`` "},
|
|
|
{R"re(([ \([{<])")re", "\\1 `` "},
|
|
@@ -639,10 +640,10 @@ class EnglishTextFormat : public TokenizedTextFormat {
|
|
|
};
|
|
|
|
|
|
string rewritten = value;
|
|
|
- for (const pair<string, string> &rule : preproc_rules) {
|
|
|
+ for (const std::pair<string, string> &rule : preproc_rules) {
|
|
|
RE2::GlobalReplace(&rewritten, rule.first, rule.second);
|
|
|
}
|
|
|
- for (const pair<string, string> &rule : rules) {
|
|
|
+ for (const std::pair<string, string> &rule : rules) {
|
|
|
RE2::GlobalReplace(&rewritten, rule.first, rule.second);
|
|
|
}
|
|
|
TokenizedTextFormat::ConvertFromString(key, rewritten, sentences);
|
|
@@ -652,6 +653,6 @@ class EnglishTextFormat : public TokenizedTextFormat {
|
|
|
TF_DISALLOW_COPY_AND_ASSIGN(EnglishTextFormat);
|
|
|
};
|
|
|
|
|
|
-REGISTER_DOCUMENT_FORMAT("english-text", EnglishTextFormat);
|
|
|
+REGISTER_SYNTAXNET_DOCUMENT_FORMAT("english-text", EnglishTextFormat);
|
|
|
|
|
|
} // namespace syntaxnet
|