Browse Source

New transition systems and features for syntaxnet (#301)

* Morpher and segmenter transition systems and new features (quotes, punctuation, capitalization, character ngrams, morphology attributes).
calberti 10 years ago
parent
commit
64675fc72f
37 changed files with 4257 additions and 62 deletions
  1. 3 2
      syntaxnet/README.md
  2. 105 23
      syntaxnet/syntaxnet/BUILD
  3. 102 0
      syntaxnet/syntaxnet/binary_segment_state.cc
  4. 99 0
      syntaxnet/syntaxnet/binary_segment_state.h
  5. 218 0
      syntaxnet/syntaxnet/binary_segment_state_test.cc
  6. 121 0
      syntaxnet/syntaxnet/binary_segment_transitions.cc
  7. 111 0
      syntaxnet/syntaxnet/binary_segment_transitions_test.cc
  8. 845 0
      syntaxnet/syntaxnet/char_properties.cc
  9. 362 0
      syntaxnet/syntaxnet/char_properties.h
  10. 364 0
      syntaxnet/syntaxnet/char_properties_test.cc
  11. 4 2
      syntaxnet/syntaxnet/document_filters.cc
  12. 2 0
      syntaxnet/syntaxnet/document_format.h
  13. 12 1
      syntaxnet/syntaxnet/lexicon_builder.cc
  14. 25 1
      syntaxnet/syntaxnet/lexicon_builder_test.py
  15. 298 0
      syntaxnet/syntaxnet/morpher_transitions.cc
  16. 91 0
      syntaxnet/syntaxnet/morphology_label_set.cc
  17. 110 0
      syntaxnet/syntaxnet/morphology_label_set.h
  18. 101 0
      syntaxnet/syntaxnet/morphology_label_set_test.cc
  19. 0 1
      syntaxnet/syntaxnet/parser_eval.py
  20. 18 0
      syntaxnet/syntaxnet/parser_features.cc
  21. 4 2
      syntaxnet/syntaxnet/proto_io.h
  22. 85 0
      syntaxnet/syntaxnet/segmenter_utils.cc
  23. 93 0
      syntaxnet/syntaxnet/segmenter_utils.h
  24. 149 0
      syntaxnet/syntaxnet/segmenter_utils_test.cc
  25. 15 0
      syntaxnet/syntaxnet/sentence.proto
  26. 1 1
      syntaxnet/syntaxnet/sentence_batch.cc
  27. 233 3
      syntaxnet/syntaxnet/sentence_features.cc
  28. 329 5
      syntaxnet/syntaxnet/sentence_features.h
  29. 123 4
      syntaxnet/syntaxnet/sentence_features_test.cc
  30. 40 5
      syntaxnet/syntaxnet/tagger_transitions.cc
  31. 6 0
      syntaxnet/syntaxnet/testdata/context.pbtxt
  32. 107 7
      syntaxnet/syntaxnet/text_formats.cc
  33. 59 1
      syntaxnet/syntaxnet/utils.h
  34. 2 0
      syntaxnet/syntaxnet/workspace.h
  35. 6 0
      syntaxnet/util/utf8/unicodetext.h
  36. 0 4
      syntaxnet/util/utf8/unicodetext_unittest.cc
  37. 14 0
      syntaxnet/util/utf8/unilib_utf8_utils.h

+ 3 - 2
syntaxnet/README.md

@@ -107,8 +107,8 @@ Bazel should complete reporting all tests passed.
 You can also compile SyntaxNet in a [Docker](https://www.docker.com/what-docker)
 container using this [Dockerfile](Dockerfile).
 
-**Note:** If you are running Docker on OSX, make sure that you have enough memory allocated
-for your Docker VM.
+**Note:** If you are running Docker on OSX, make sure that you have enough
+memory allocated for your Docker VM.
 
 ## Getting Started
 
@@ -612,6 +612,7 @@ Original authors of the code in this package include (in alphabetical order):
 *   David Weiss
 *   Emily Pitler
 *   Greg Coppola
+*   Ji Ma
 *   Keith Hall
 *   Kuzman Ganchev
 *   Michael Collins

+ 105 - 23
syntaxnet/syntaxnet/BUILD

@@ -159,6 +159,31 @@ cc_library(
 )
 
 cc_library(
+    name = "char_properties",
+    srcs = ["char_properties.cc"],
+    hdrs = ["char_properties.h"],
+    deps = [
+        ":registry",
+        ":utils",
+        "//util/utf8:unicodetext",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "segmenter_utils",
+    srcs = ["segmenter_utils.cc"],
+    hdrs = ["segmenter_utils.h"],
+    deps = [
+        ":base",
+        ":char_properties",
+        ":sentence_proto",
+        "//util/utf8:unicodetext",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
     name = "feature_extractor",
     srcs = ["feature_extractor.cc"],
     hdrs = [
@@ -199,6 +224,7 @@ cc_library(
         ":affix",
         ":feature_extractor",
         ":registry",
+        ":segmenter_utils",
     ],
 )
 
@@ -251,24 +277,50 @@ cc_library(
 )
 
 cc_library(
+    name = "morphology_label_set",
+    srcs = ["morphology_label_set.cc"],
+    hdrs = ["morphology_label_set.h"],
+    deps = [
+        ":document_format",
+        ":feature_extractor",
+        ":proto_io",
+        ":registry",
+        ":sentence_proto",
+        ":utils",
+    ],
+)
+
+cc_library(
     name = "parser_transitions",
     srcs = [
         "arc_standard_transitions.cc",
+        "binary_segment_state.cc",
+        "binary_segment_transitions.cc",
+        "morpher_transitions.cc",
+        "parser_features.cc",
         "parser_state.cc",
         "parser_transitions.cc",
         "tagger_transitions.cc",
     ],
     hdrs = [
+        "binary_segment_state.h",
+        "parser_features.h",
         "parser_state.h",
         "parser_transitions.h",
     ],
     deps = [
+        ":affix",
+        ":feature_extractor",
         ":kbest_syntax_proto",
+        ":morphology_label_set",
         ":registry",
+        ":segmenter_utils",
+        ":sentence_features",
         ":sentence_proto",
         ":shared_store",
         ":task_context",
         ":term_frequency_map",
+        ":workspace",
     ],
     alwayslink = 1,
 )
@@ -289,29 +341,11 @@ cc_library(
 )
 
 cc_library(
-    name = "parser_features",
-    srcs = ["parser_features.cc"],
-    hdrs = ["parser_features.h"],
-    deps = [
-        ":affix",
-        ":feature_extractor",
-        ":parser_transitions",
-        ":registry",
-        ":sentence_features",
-        ":task_context",
-        ":term_frequency_map",
-        ":workspace",
-    ],
-    alwayslink = 1,
-)
-
-cc_library(
     name = "embedding_feature_extractor",
     srcs = ["embedding_feature_extractor.cc"],
     hdrs = ["embedding_feature_extractor.h"],
     deps = [
         ":feature_extractor",
-        ":parser_features",
         ":parser_transitions",
         ":sparse_proto",
         ":task_context",
@@ -326,7 +360,6 @@ cc_library(
     deps = [
         ":embedding_feature_extractor",
         ":feature_extractor",
-        ":parser_features",
         ":parser_transitions",
         ":sentence_proto",
         ":sparse_proto",
@@ -344,7 +377,6 @@ cc_library(
         "reader_ops.cc",
     ],
     deps = [
-        ":parser_features",
         ":parser_transitions",
         ":sentence_batch",
         ":sentence_proto",
@@ -360,7 +392,6 @@ cc_library(
     srcs = ["document_filters.cc"],
     deps = [
         ":document_format",
-        ":parser_features",
         ":parser_transitions",
         ":sentence_batch",
         ":sentence_proto",
@@ -376,8 +407,8 @@ cc_library(
     deps = [
         ":dictionary_proto",
         ":document_format",
-        ":parser_features",
         ":parser_transitions",
+        ":segmenter_utils",
         ":sentence_batch",
         ":sentence_proto",
         ":task_context",
@@ -439,6 +470,18 @@ filegroup(
 )
 
 cc_test(
+    name = "binary_segment_state_test",
+    size = "small",
+    srcs = ["binary_segment_state_test.cc"],
+    deps = [
+        ":base",
+        ":parser_transitions",
+        ":term_frequency_map",
+        ":test_main",
+    ],
+)
+
+cc_test(
     name = "shared_store_test",
     size = "small",
     srcs = ["shared_store_test.cc"],
@@ -449,6 +492,26 @@ cc_test(
 )
 
 cc_test(
+    name = "char_properties_test",
+    srcs = ["char_properties_test.cc"],
+    deps = [
+        ":char_properties",
+        ":test_main",
+    ],
+)
+
+cc_test(
+    name = "segmenter_utils_test",
+    srcs = ["segmenter_utils_test.cc"],
+    deps = [
+        ":base",
+        ":segmenter_utils",
+        ":sentence_proto",
+        ":test_main",
+    ],
+)
+
+cc_test(
     name = "sentence_features_test",
     size = "medium",
     srcs = ["sentence_features_test.cc"],
@@ -466,6 +529,15 @@ cc_test(
 )
 
 cc_test(
+    name = "morphology_label_set_test",
+    srcs = ["morphology_label_set_test.cc"],
+    deps = [
+        ":morphology_label_set",
+        ":test_main",
+    ],
+)
+
+cc_test(
     name = "arc_standard_transitions_test",
     size = "small",
     srcs = ["arc_standard_transitions_test.cc"],
@@ -480,6 +552,17 @@ cc_test(
 )
 
 cc_test(
+    name = "binary_segment_transitions_test",
+    size = "small",
+    srcs = ["binary_segment_transitions_test.cc"],
+    deps = [
+        ":parser_transitions",
+        ":sentence_proto",
+        ":test_main",
+    ],
+)
+
+cc_test(
     name = "tagger_transitions_test",
     size = "small",
     srcs = ["tagger_transitions_test.cc"],
@@ -499,7 +582,6 @@ cc_test(
     srcs = ["parser_features_test.cc"],
     deps = [
         ":feature_extractor",
-        ":parser_features",
         ":parser_transitions",
         ":populate_test_inputs",
         ":sentence_proto",

+ 102 - 0
syntaxnet/syntaxnet/binary_segment_state.cc

@@ -0,0 +1,102 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/binary_segment_state.h"
+
+#include <string>
+#include "syntaxnet/segmenter_utils.h"
+#include "syntaxnet/sentence.pb.h"
+
+namespace syntaxnet {
+
+ParserTransitionState *BinarySegmentState::Clone() const {
+  return new BinarySegmentState();
+}
+
+string BinarySegmentState::ToString(const ParserState &state) const {
+  string str("[");  // Segmented words go inside the brackets.
+  for (int i = NumStarts(state) - 1; i >=0; --i) {  // Oldest start first.
+    int start = LastStart(i, state);
+    int end = 0;  // Index of the last character of this word.
+    if (i - 1 >= 0) {
+      end = LastStart(i - 1, state) - 1;  // Stops before the next start.
+    } else if (state.EndOfInput()) {
+      end = state.sentence().token_size() - 1;  // Runs to the last token.
+    } else {
+      end = state.Next() - 1;  // Stops before the next input token.
+    }
+    for (int k = start; k <= end; ++k) {
+      str.append(state.GetToken(k).word());
+    }
+    if (i >= 1) str.append(" ");  // Separate words with a space.
+  }
+
+  str.append("] ");  // Unsegmented input follows the bracket.
+  for (int i = state.Next(); i < state.NumTokens(); ++i) {
+    str.append(state.GetToken(i).word());
+  }
+  return str;
+}
+
+void BinarySegmentState::AddParseToDocument(const ParserState &state,
+                                            bool rewrite_root_labels,
+                                            Sentence *sentence) const {
+  if (sentence->token_size() == 0) return;  // Nothing to segment.
+  vector<bool> is_starts(sentence->token_size(), false);
+  for (int i = 0; i < NumStarts(state); ++i) {
+    is_starts[LastStart(i, state)] = true;  // Mark each recorded start.
+  }
+
+  // The break level of a new word is taken from the input token before it.
+  Token::BreakLevel break_level = Token::NO_BREAK;
+  bool is_first_token = true;
+  Sentence new_sentence;
+  for (int i = 0; i < sentence->token_size(); ++i) {
+    const Token &token = sentence->token(i);
+    const string &word = token.word();
+    bool is_break = SegmenterUtils::IsBreakChar(word);
+    if (is_starts[i] || is_first_token) {
+      if (!is_break) {
+        // The current character is the first char of a new token/word.
+        Token *new_token = new_sentence.add_token();
+        new_token->set_start(token.start());
+        new_token->set_end(token.end());
+        new_token->set_word(word);
+
+        // For the first token, keep the old break level to make sure that the
+        // number of sentences stays unchanged.
+        new_token->set_break_level(break_level);
+        is_first_token = false;
+      }
+    } else {
+      // Append the character to the previous token.
+      if (!is_break) {
+        int index = new_sentence.token_size() - 1;
+        auto *last_token = new_sentence.mutable_token(index);
+        last_token->mutable_word()->append(word);
+        last_token->set_end(token.end());
+      }
+    }
+
+    // Update break level. Note we do not introduce new sentences in the
+    // transition system, thus anything beyond a line break is reduced to a
+    // line break.
+    break_level = is_break ? SegmenterUtils::BreakLevel(word) : Token::NO_BREAK;
+    if (break_level >= Token::LINE_BREAK) break_level = Token::LINE_BREAK;
+  }
+  sentence->mutable_token()->Swap(new_sentence.mutable_token());
+}
+
+}  // namespace syntaxnet

+ 99 - 0
syntaxnet/syntaxnet/binary_segment_state.h

@@ -0,0 +1,99 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef SYNTAXNET_BINARY_SEGMENT_STATE_H_
+#define SYNTAXNET_BINARY_SEGMENT_STATE_H_
+
+#include "syntaxnet/parser_state.h"
+#include "syntaxnet/parser_transitions.h"
+
+namespace syntaxnet {
+
+class Sentence;
+
+// Parser state for binary segmentation transition system. The input of the
+// system is a sequence of utf8 characters that are to be segmented into tokens.
+// The system contains two types of transitions/actions:
+//  -START: the token at input is the first character of a new word.
+//  -MERGE: the token at input is to be merged with its previous token.
+//
+// A BinarySegmentState is used to store segmentation histories that can be used
+// as features. In addition, it also provides the functionality to add
+// segmentation results to the document. The function assumes that sentences in
+// a document are processed in left-to-right order. See also the comments of
+// the FinishDocument function for explanation.
+//
+// Note on spaces:
+// Spaces, or more generally break-characters, should never be any part of a
+// word, and the START/MERGE of spaces would be ignored. In addition, if a space
+// starts a new word, then the actual first char of that word is the first
+// non-space token following the space.
+// Some examples:
+//  -chars:  ' ' A B
+//  -tags:    S  M M
+//  -result: 'AB'
+//
+//  -chars:  A ' ' B
+//  -tags:   S  M  M
+//  -result: 'AB'
+//
+//  -chars:  A ' ' B
+//  -tags:   S  S  M
+//  -result: 'AB'
+//
+//  -chars:  A  B  ' '
+//  -tags:   S  S  M
+//  -result: 'A', 'B'
+class BinarySegmentState : public ParserTransitionState {
+ public:
+  ParserTransitionState *Clone() const override;
+  void Init(ParserState *state) override {}
+
+  // Returns the number of start tokens that have already been identified. In
+  // other words, number of start tokens between the first token of the sentence
+  // and state.Input(), with state.Input() excluded.
+  static int NumStarts(const ParserState &state) {
+    return state.StackSize();
+  }
+
+  // Returns the index of the k-th most recent start token.
+  static int LastStart(int k, const ParserState &state) {
+    DCHECK_GE(k, 0);
+    DCHECK_LT(k, NumStarts(state));
+    return state.Stack(k);
+  }
+
+  // Adds the token at given index as a new start token.
+  static void AddStart(int index, ParserState *state) {
+    state->Push(index);
+  }
+
+  // Adds segmentation results to the given sentence.
+  void AddParseToDocument(const ParserState &state,
+                          bool rewrite_root_labels,
+                          Sentence *sentence) const override;
+
+  // Whether a parsed token should be considered correct for evaluation.
+  bool IsTokenCorrect(const ParserState &state, int index) const override {
+    return true;
+  }
+
+  // Returns a human readable string representation of this state.
+  string ToString(const ParserState &state) const override;
+};
+
+}  // namespace syntaxnet
+
+#endif  // SYNTAXNET_BINARY_SEGMENT_STATE_H_

+ 218 - 0
syntaxnet/syntaxnet/binary_segment_state_test.cc

@@ -0,0 +1,218 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/binary_segment_state.h"
+
+#include <memory>
+
+#include "syntaxnet/base.h"
+#include "syntaxnet/sentence.pb.h"
+#include "syntaxnet/term_frequency_map.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace syntaxnet {
+
+class BinarySegmentStateTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Prepare a sentence.
+    const char *str_sentence = "text: '测试 的 句子' "
+        "token { word: '测' start: 0 end: 2 } "
+        "token { word: '试' start: 3 end: 5 } "
+        "token { word: ' ' start: 6 end: 6 } "
+        "token { word: '的' start: 7 end: 9 } "
+        "token { word: ' ' start: 10 end: 10 } "
+        "token { word: '句' start: 11 end: 13 } "
+        "token { word: '子' start: 14 end: 16 } ";
+    sentence_ = std::unique_ptr<Sentence>(new Sentence());
+    TextFormat::ParseFromString(str_sentence, sentence_.get());
+  }
+
+  // The test sentence and the label map used to construct parser states.
+  std::unique_ptr<Sentence> sentence_;
+  TermFrequencyMap label_map_;
+};
+
+TEST_F(BinarySegmentStateTest, AddStartLastStartNumStartsTest) {
+  BinarySegmentState *segment_state = new BinarySegmentState();
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+
+  // Test segment_state initialized with zero starts.
+  EXPECT_EQ(0, segment_state->NumStarts(state));
+
+  // Adding the first token as a start token.
+  segment_state->AddStart(0, &state);
+  ASSERT_EQ(1, segment_state->NumStarts(state));
+  EXPECT_EQ(0, segment_state->LastStart(0, state));
+
+  // Adding more starts.
+  segment_state->AddStart(2, &state);
+  segment_state->AddStart(3, &state);
+  segment_state->AddStart(4, &state);
+  segment_state->AddStart(5, &state);
+  ASSERT_EQ(5, segment_state->NumStarts(state));
+  EXPECT_EQ(5, segment_state->LastStart(0, state));
+  EXPECT_EQ(4, segment_state->LastStart(1, state));
+  EXPECT_EQ(3, segment_state->LastStart(2, state));
+  EXPECT_EQ(2, segment_state->LastStart(3, state));
+  EXPECT_EQ(0, segment_state->LastStart(4, state));
+}
+
+TEST_F(BinarySegmentStateTest, AddParseToDocumentTest) {
+  BinarySegmentState *segment_state = new BinarySegmentState();
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+
+  // Test gold segmentation.
+  // 0   1   2    3   4   5   6
+  // 测  试  ' '  的  ' '  句  子
+  // S   M   S    S   S   S   M
+  segment_state->AddStart(0, &state);
+  segment_state->AddStart(2, &state);
+  segment_state->AddStart(3, &state);
+  segment_state->AddStart(4, &state);
+  segment_state->AddStart(5, &state);
+  Sentence sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+
+  // Test the number of tokens as well as the start/end byte-offsets of each
+  // token.
+  ASSERT_EQ(3, sentence_with_annotation.token_size());
+
+  // The first token is 测试.
+  EXPECT_EQ(0, sentence_with_annotation.token(0).start());
+  EXPECT_EQ(5, sentence_with_annotation.token(0).end());
+
+  // The second token is 的.
+  EXPECT_EQ(7, sentence_with_annotation.token(1).start());
+  EXPECT_EQ(9, sentence_with_annotation.token(1).end());
+
+  // The third token is 句子.
+  EXPECT_EQ(11, sentence_with_annotation.token(2).start());
+  EXPECT_EQ(16, sentence_with_annotation.token(2).end());
+
+  // Test merging spaces into other tokens. Since spaces, or more generally
+  // break characters, should never be a part of any word, they are skipped no
+  // matter how they are tagged.
+  // 0   1   2    3   4   5   6
+  // 测  试  ' '  的  ' '  句  子
+  // S   M   M    S   M   M   M
+  while (!state.StackEmpty()) state.Pop();
+  segment_state->AddStart(0, &state);
+  segment_state->AddStart(3, &state);
+  sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+
+  ASSERT_EQ(2, sentence_with_annotation.token_size());
+
+  // The first token is 测试. Note that even if a space is tagged as "merge",
+  // it is not attached to its previous word.
+  EXPECT_EQ(0, sentence_with_annotation.token(0).start());
+  EXPECT_EQ(5, sentence_with_annotation.token(0).end());
+
+  // The second token is 的句子.
+  EXPECT_EQ(7, sentence_with_annotation.token(1).start());
+  EXPECT_EQ(16, sentence_with_annotation.token(1).end());
+
+  // Test merging a token into space tokens. In such a case, the current token
+  // is merged into the first non-space token on its left side.
+  // 0   1   2    3   4   5   6
+  // 测  试  ' '  的  ' '  句  子
+  // S   M   S    M   S   M   M
+  while (!state.StackEmpty()) state.Pop();
+  segment_state->AddStart(0, &state);
+  segment_state->AddStart(2, &state);
+  segment_state->AddStart(4, &state);
+  sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+  ASSERT_EQ(1, sentence_with_annotation.token_size());
+  EXPECT_EQ(0, sentence_with_annotation.token(0).start());
+  EXPECT_EQ(16, sentence_with_annotation.token(0).end());
+}
+
+TEST_F(BinarySegmentStateTest, SpaceDocumentTest) {
+  const char *str_sentence = "text: ' \t\t' "
+      "token { word: ' ' start: 0 end: 0 } "
+      "token { word: '\t' start: 1 end: 1 } "
+      "token { word: '\t' start: 2 end: 2 } ";
+  TextFormat::ParseFromString(str_sentence, sentence_.get());
+  BinarySegmentState *segment_state = new BinarySegmentState();
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+
+  // Break-chars should always be skipped, no matter how they are tagged.
+  // 0    1     2
+  //' '   '\t'  '\t'
+  // M    M     M
+  Sentence sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+  ASSERT_EQ(0, sentence_with_annotation.token_size());
+
+  // 0    1     2
+  //' '   '\t'  '\t'
+  // S    S     S
+  segment_state->AddStart(0, &state);
+  segment_state->AddStart(1, &state);
+  segment_state->AddStart(2, &state);
+  sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+  ASSERT_EQ(0, sentence_with_annotation.token_size());
+}
+
+TEST_F(BinarySegmentStateTest, DocumentBeginWithSpaceTest) {
+  const char *str_sentence = "text: ' 空格' "
+      "token { word: ' ' start: 0 end: 0 } "
+      "token { word: '空' start: 1 end: 3 } "
+      "token { word: '格' start: 4 end: 6 } ";
+  TextFormat::ParseFromString(str_sentence, sentence_.get());
+  BinarySegmentState *segment_state = new BinarySegmentState();
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+
+  // 0    1    2
+  //' '   空   格
+  // M    M    M
+  Sentence sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+
+  ASSERT_EQ(1, sentence_with_annotation.token_size());
+
+  // The first token is 空格.
+  EXPECT_EQ(1, sentence_with_annotation.token(0).start());
+  EXPECT_EQ(6, sentence_with_annotation.token(0).end());
+
+  // 0    1    2
+  //' '   空   格
+  // S    M    M
+  while (!state.StackEmpty()) state.Pop();
+  segment_state->AddStart(0, &state);
+  sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+
+  ASSERT_EQ(1, sentence_with_annotation.token_size());
+
+  // The first token is 空格.
+  EXPECT_EQ(1, sentence_with_annotation.token(0).start());
+  EXPECT_EQ(6, sentence_with_annotation.token(0).end());
+}
+
+TEST_F(BinarySegmentStateTest, EmptyDocumentTest) {
+  const char *str_sentence = "text: '' ";
+  TextFormat::ParseFromString(str_sentence, sentence_.get());
+  BinarySegmentState *segment_state = new BinarySegmentState();
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+  Sentence sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+  ASSERT_EQ(0, sentence_with_annotation.token_size());
+}
+
+}  // namespace syntaxnet

+ 121 - 0
syntaxnet/syntaxnet/binary_segment_transitions.cc

@@ -0,0 +1,121 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/binary_segment_state.h"
+#include "syntaxnet/parser_state.h"
+#include "syntaxnet/parser_transitions.h"
+
+namespace syntaxnet {
+
+// Given an input of utf8 characters, the BinarySegmentTransitionSystem
+// conducts word segmentation by performing one of the following two actions:
+//  -START: starts a new word with the token at state.input, and also advances
+//          the state.input.
+//  -MERGE: adds the token at state.input to its previous word, and also
+//          advances state.input.
+//
+// Also see syntaxnet/binary_segment_state.h for examples on handling
+// spaces.
+class BinarySegmentTransitionSystem : public ParserTransitionSystem {
+ public:
+  BinarySegmentTransitionSystem() {}
+  ParserTransitionState *NewTransitionState(bool train_mode) const override {
+    return new BinarySegmentState();
+  }
+
+  // Action types for the segmentation-transition system.
+  enum ParserActionType {
+    START = 0,
+    MERGE = 1,
+    CARDINAL = 2
+  };
+
+  static int StartAction() { return 0; }
+  static int MergeAction() { return 1; }
+
+  // The system always starts a new word by default.
+  ParserAction GetDefaultAction(const ParserState &state) const override {
+    return START;
+  }
+
+  // Returns the number of action types.
+  int NumActionTypes() const override {
+    return CARDINAL;
+  }
+
+  // Returns the number of possible actions.
+  int NumActions(int num_labels) const override {
+    return CARDINAL;
+  }
+
+  // Returns the next gold action for a given state according to the underlying
+  // annotated sentence. The training data for the transition system is created
+  // by the binary-segmenter-data task. If a token's break_level is NO_BREAK,
+  // then it is a MERGE, START otherwise. The only exception is that the first
+  // token in a sentence for the transition system is always a START.
+  ParserAction GetNextGoldAction(const ParserState &state) const override {
+    if (state.Next() == 0) return StartAction();
+    const Token &token = state.GetToken(state.Next());
+    return (token.break_level() != Token::NO_BREAK ?
+        StartAction() : MergeAction());
+  }
+
+  // Both START and MERGE can be applied to any tokens in the sentence.
+  bool IsAllowedAction(
+      ParserAction action, const ParserState &state) const override {
+    return true;
+  }
+
+  // Performs the specified action on a given parser state, without adding the
+  // action to the state's history.
+  void PerformActionWithoutHistory(
+      ParserAction action, ParserState *state) const override {
+    // Note when the action is less than 0, it is treated as a START.
+    if (action < 0 || action == StartAction()) {
+      MutableTransitionState(state)->AddStart(state->Next(), state);
+    }
+    state->Advance();
+  }
+
+  // Allows backoff to best allowable transition.
+  bool BackOffToBestAllowableTransition() const override { return true; }
+
+  // A state is a deterministic state iff no tokens have been consumed.
+  bool IsDeterministicState(const ParserState &state) const override {
+    return state.Next() == 0;
+  }
+
+  // For binary segmentation, a state is a final state iff all tokens have been
+  // consumed.
+  bool IsFinalState(const ParserState &state) const override {
+    return state.EndOfInput();
+  }
+
+  // Returns a string representation of a parser action.
+  string ActionAsString(
+      ParserAction action, const ParserState &state) const override {
+    return action == StartAction() ? "START" : "MERGE";
+  }
+
+  // Downcasts the TransitionState in ParserState to a BinarySegmentState.
+  static BinarySegmentState *MutableTransitionState(ParserState *state) {
+    return static_cast<BinarySegmentState *>(state->mutable_transition_state());
+  }
+};
+
+REGISTER_TRANSITION_SYSTEM("binary-segment-transitions",
+                           BinarySegmentTransitionSystem);
+
+}  // namespace syntaxnet

+ 111 - 0
syntaxnet/syntaxnet/binary_segment_transitions_test.cc

@@ -0,0 +1,111 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/binary_segment_state.h"
+#include "syntaxnet/parser_state.h"
+#include "syntaxnet/parser_transitions.h"
+#include "syntaxnet/term_frequency_map.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace syntaxnet {
+
+// Test fixture for the "binary-segment-transitions" transition system. It owns
+// the transition system under test and a character-level test sentence.
+class SegmentationTransitionTest : public ::testing::Test {
+ protected:
+  // Instantiates the transition system by its registered name and parses the
+  // test sentence, which has one token per character (spaces included).
+  void SetUp() override {
+    transition_system_ = std::unique_ptr<ParserTransitionSystem>(
+        ParserTransitionSystem::Create("binary-segment-transitions"));
+
+    // Prepare a sentence.
+    const char *str_sentence = "text: '因为 有 这样' "
+        "token { word: '因' start: 0 end: 2 break_level: SPACE_BREAK } "
+        "token { word: '为' start: 3 end: 5 break_level: NO_BREAK } "
+        "token { word: ' ' start: 6 end: 6 break_level: SPACE_BREAK } "
+        "token { word: '有' start: 7 end: 9 break_level: SPACE_BREAK } "
+        "token { word: ' ' start: 10 end: 10 break_level: SPACE_BREAK } "
+        "token { word: '这' start: 11 end: 13 break_level: SPACE_BREAK } "
+        "token { word: '样' start: 14 end: 16 break_level: NO_BREAK } ";
+    sentence_ = std::unique_ptr<Sentence>(new Sentence());
+    // Fail the test right away if the fixture text does not parse, rather
+    // than running the assertions against a partially filled sentence.
+    ASSERT_TRUE(TextFormat::ParseFromString(str_sentence, sentence_.get()));
+  }
+
+  // Verifies that the parser stack of |state| holds exactly the token indices
+  // in |target|, compared position by position with state.Stack(i).
+  void CheckStarts(const ParserState &state, const vector<int> &target) {
+    ASSERT_EQ(state.StackSize(), target.size());
+    for (int i = 0; i < state.StackSize(); ++i) {
+      EXPECT_EQ(state.Stack(i), target[i]);
+    }
+  }
+
+  // The character-level test sentence built in SetUp().
+  std::unique_ptr<Sentence> sentence_;
+  // The transition system under test.
+  std::unique_ptr<ParserTransitionSystem> transition_system_;
+  // Label map handed to every ParserState; left empty by this fixture.
+  TermFrequencyMap label_map_;
+};
+
+// Drives the transition system with gold actions and checks both the recorded
+// starts and the words written back into the sentence.
+TEST_F(SegmentationTransitionTest, GoldNextActionTest) {
+  BinarySegmentState *segment_state = static_cast<BinarySegmentState *>(
+      transition_system_->NewTransitionState(true));
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+
+  // Apply the gold action at every step until the final state is reached.
+  while (!transition_system_->IsFinalState(state)) {
+    transition_system_->PerformActionWithoutHistory(
+        transition_system_->GetNextGoldAction(state), &state);
+  }
+
+  // The stack should list the gold start positions.
+  CheckStarts(state, {5, 4, 3, 2, 0});
+
+  // Write the segmentation back and check each word with its start/end
+  // annotation.
+  segment_state->AddParseToDocument(state, false, sentence_.get());
+  const char *const kExpectedWords[] = {"因为", "有", "这样"};
+  const int kExpectedStarts[] = {0, 7, 11};
+  const int kExpectedEnds[] = {5, 9, 16};
+  ASSERT_EQ(sentence_->token_size(), 3);
+  for (int i = 0; i < 3; ++i) {
+    EXPECT_EQ(sentence_->token(i).word(), kExpectedWords[i]);
+    EXPECT_EQ(sentence_->token(i).start(), kExpectedStarts[i]);
+    EXPECT_EQ(sentence_->token(i).end(), kExpectedEnds[i]);
+  }
+}
+
+// Drives the transition system with its default action only and checks the
+// resulting one-word-per-character segmentation.
+TEST_F(SegmentationTransitionTest, DefaultActionTest) {
+  BinarySegmentState *segment_state = static_cast<BinarySegmentState *>(
+      transition_system_->NewTransitionState(true));
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+
+  // Segment the sentence by applying the default action at every step.
+  while (!transition_system_->IsFinalState(state)) {
+    transition_system_->PerformActionWithoutHistory(
+        transition_system_->GetDefaultAction(state), &state);
+  }
+
+  // Every character should be START.
+  CheckStarts(state, {6, 5, 4, 3, 2, 1, 0});
+
+  // Every non-space character should be a word.
+  segment_state->AddParseToDocument(state, false, sentence_.get());
+  const char *const kExpectedWords[] = {"因", "为", "有", "这", "样"};
+  ASSERT_EQ(sentence_->token_size(), 5);
+  for (int i = 0; i < 5; ++i) {
+    EXPECT_EQ(sentence_->token(i).word(), kExpectedWords[i]);
+  }
+}
+
+}  // namespace syntaxnet

+ 845 - 0
syntaxnet/syntaxnet/char_properties.cc

@@ -0,0 +1,845 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// char_properties.cc - define is_X() tests for various character properties
+//
+// See char_properties.h for how to write a character property.
+//
+// References for the char sets below:
+//
+// . http://www.unicode.org/Public/UNIDATA/PropList.txt
+//
+//   Large (but not exhaustive) list of Unicode chars and their "properties"
+//   (e.g., the property "Pi" = an initial quote punctuation char).
+//
+// . http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
+//
+//   Defines the list of properties, such as "Pi", used in the above list.
+//
+// . http://www.unipad.org/unimap/index.php?param_char=XXXX&page=detail
+//
+//   Gives detail about a particular character code.
+//   XXXX is a 4-hex-digit Unicode character code.
+//
+// . http://www.unicode.org/Public/UNIDATA/UCD.html
+//
+//   General reference for Unicode characters.
+//
+
+#include "syntaxnet/char_properties.h"
+
+#include <ctype.h>  // for ispunct, isspace
+#include <memory>
+#include <utility>
+#include <vector>  // for vector
+
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "third_party/utf/utf.h"      // for runetochar, ::UTFmax, Rune
+#include "util/utf8/unilib.h"  // for IsValidCodepoint, etc
+#include "util/utf8/unilib_utf8_utils.h"
+
+//============================================================
+// CharPropertyImplementation
+//
+
+// A CharPropertyImplementation stores a set of Unicode characters,
+// encoded in UTF-8, as a trie.  The trie is represented as a vector
+// of nodes.  Each node is a 256-element array that specifies what to
+// do with one byte of the UTF-8 sequence.  Each element n of a node
+// is one of:
+//  n = 0,  indicating that the Property is not true of any
+//          character whose UTF-8 encoding includes this byte at
+//          this position
+//  n = -1, indicating that the Property is true for the UTF-8 sequence
+//          that ends with this byte.
+//  n > 0,  indicating the index of the row that describes the
+//          remaining bytes in the UTF-8 sequence.
+//
+// The only operation that needs to be fast is HoldsFor, which tests
+// whether a character has a given property. We use each byte of the
+// character's UTF-8 encoding to index into a row. If the value is 0,
+// then the property is not true for the character. (We might discover
+// this even before getting to the end of the sequence.) If the value
+// is -1, then the property is true for this character. Otherwise,
+// the value is the index of another row, which we index using the next
+// byte in the sequence, and so on. The design of UTF-8 prevents
+// ambiguities here; no prefix of a UTF-8 sequence is a valid UTF-8
+// sequence.
+//
+// While it is possible to implement an iterator for this representation,
+// it is much easier to keep a separate unordered_set<char32> of the code
+// points for this purpose. In fact, we would use that as the entire
+// representation, were it not for concerns that HoldsFor might be slower.
+
+namespace syntaxnet {
+
+// Storage behind a CharProperty: the member code points plus a byte-indexed
+// trie over their UTF-8 encodings (see the description above).
+struct CharPropertyImplementation {
+  // The code points of the property; used for enumeration, not for lookup.
+  unordered_set<char32> chars;
+  // Trie rows. Each row has 256 entries, one per possible byte value:
+  // 0 = no match, -1 = sequence ends here and matches, >0 = index of next row.
+  vector<vector<int> > rows;
+
+  // Creates the trie with a single all-zero root row (row 0).
+  CharPropertyImplementation() {
+    rows.reserve(10);
+    rows.resize(1);
+    rows[0].resize(256, 0);
+  }
+
+  // Inserts the |len|-byte UTF-8 sequence at |buf| into the trie, appending
+  // new rows as needed. CHECK-fails if the sequence is a proper prefix of, or
+  // a proper extension of, a sequence that was inserted earlier. Re-inserting
+  // an identical sequence just rewrites the same -1 marker.
+  void AddChar(char *buf, int len) {
+    int n = 0;  // row index
+    for (int i = 0; i < len; ++i) {
+      int ch = reinterpret_cast<unsigned char *>(buf)[i];
+      int m = rows[n][ch];
+      if (m > 0) {
+        // A longer sequence already passes through this entry, so this must
+        // not be our final byte.
+        CHECK_LT(i, len - 1)
+            << " : " << (i + 1) << "-byte UTF-8 sequence "
+            << "(" << tensorflow::str_util::CEscape(string(buf, i + 1)) << ")"
+            << " is prefix of previously-seen UTF-8 sequence(s)";
+        n = m;
+      } else if (i == len - 1) {
+        // Final byte: mark the sequence as a member.
+        rows[n][ch] = -1;
+      } else {
+        // Interior byte with no row yet. The entry must be unused (0); a -1
+        // here would mean a shorter sequence already ends at this byte.
+        CHECK_EQ(m, 0) << " : UTF-8 sequence is extension of previously-seen "
+                       << (i + 1) << "-byte UTF-8 sequence "
+                       << "("
+                       << tensorflow::str_util::CEscape(string(buf, i + 1))
+                       << ")";
+        // Append a fresh all-zero row and link it from the current entry.
+        int a = rows.size();
+        rows.resize(a + 1);
+        rows[a].resize(256, 0);
+        rows[n][ch] = a;
+        n = a;
+      }
+    }
+  }
+
+  // Returns true iff the UTF-8 sequence starting at |buf| is in the trie.
+  // No length is passed: the walk reads as many bytes (at most four) as the
+  // trie asks for, so |buf| must hold a complete UTF-8 character. The
+  // CharProperty::HoldsFor overloads in this file check validity before
+  // calling this.
+  bool HoldsFor(const char *buf) const {
+    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(buf);
+
+    // Lookup each byte of the UTF-8 sequence, starting in row 0.
+    int n = rows[0][*bytes];
+    if (n == 0) return false;
+    if (n == -1) return true;
+
+    // If the value is not 0 or -1, then it is the index of the row for the
+    // second byte in the sequence.
+    n = rows[n][*++bytes];
+    if (n == 0) return false;
+    if (n == -1) return true;
+    n = rows[n][*++bytes];  // Likewise for the third byte.
+    if (n == 0) return false;
+    if (n == -1) return true;
+    n = rows[n][*++bytes];  // Likewise for the fourth byte.
+    if (n == 0) return false;
+
+    // Since there can be at most 4 bytes in the sequence, n must be -1.
+    return true;
+
+    // Implementation note: it is possible (and perhaps clearer) to write this
+    // code as a loop, "for (int i = 0; i < 4; ++i) ...", but the TestHoldsFor
+    // benchmark results indicate that doing so produces slower code for
+    // anything other than short 7-bit ASCII strings (< 512 bytes). This is
+    // mysterious, since the compiler unrolls the loop, producing code that
+    // is almost the same as what we have here, except for the shortcut on
+    // the 4th byte.
+  }
+};
+
+//============================================================
+// CharProperty - a property that holds for selected Unicode chars
+//
+
+// Builds the property |name| from a char spec: |unicodes| holds
+// |num_unicodes| ints that AddCharSpec() interprets as single code points and
+// marker-delimited inclusive ranges. Allocates the implementation object that
+// the destructor later frees.
+CharProperty::CharProperty(const char *name,
+                           const int *unicodes,
+                           int num_unicodes)
+    : name_(name),
+      impl_(new CharPropertyImplementation) {
+  // Initialize CharProperty to its char set.
+  AddCharSpec(unicodes, num_unicodes);
+}
+
+// Builds the property |name| by calling |init_fn| on the new, still empty
+// object; |init_fn| is expected to add the desired chars through the Add*()
+// methods. Allocates the implementation object that the destructor later
+// frees.
+CharProperty::CharProperty(const char *name, CharPropertyInitializer *init_fn)
+    : name_(name),
+      impl_(new CharPropertyImplementation) {
+  (*init_fn)(this);
+}
+
+// Frees the implementation object allocated by the constructors.
+CharProperty::~CharProperty() {
+  delete impl_;
+}
+
+// Adds the single code point |c| to the property. CHECK-fails (through
+// CheckUnicodeVal) when |c| is not a valid code point. The char is stored
+// both in the code point set and, as UTF-8 bytes, in the lookup trie.
+void CharProperty::AddChar(int c) {
+  CheckUnicodeVal(c);
+
+  // Encode |c| as UTF-8 for the trie.
+  Rune rune = c;
+  char utf8[UTFmax];
+  const int num_bytes = runetochar(utf8, &rune);
+
+  impl_->chars.insert(c);
+  impl_->AddChar(utf8, num_bytes);
+}
+
+// Adds every code point in the inclusive range [c1, c2], in ascending order.
+// Nothing is added when c1 > c2.
+void CharProperty::AddCharRange(int c1, int c2) {
+  int code = c1;
+  while (code <= c2) {
+    AddChar(code);
+    ++code;
+  }
+}
+
+// Adds every value in [0, 255] for which |pred| returns non-zero.
+// NOTE(review): the scan covers 128..255 as well as 7-bit ASCII, and accepted
+// values are added as code points; whether a <ctype.h> predicate accepts
+// anything in that upper half presumably depends on the locale -- confirm.
+void CharProperty::AddAsciiPredicate(AsciiPredicate *pred) {
+  int code = 0;
+  while (code < 256) {
+    if ((*pred)(code)) AddChar(code);
+    ++code;
+  }
+}
+
+// Adds every char of the registered property named |propname| to this one.
+// CHECK-fails if no property of that name can be looked up.
+void CharProperty::AddCharProperty(const char *propname) {
+  const CharProperty *source = CharProperty::Lookup(propname);
+  CHECK(source != nullptr) << ": unknown char property \"" << propname
+                           << "\" in " << name_;
+  // Walk the source property's elements; -1 both starts and ends the walk.
+  for (int code = source->NextElementAfter(-1); code >= 0;
+       code = source->NextElementAfter(code)) {
+    AddChar(code);
+  }
+}
+
+// Adds the chars described by the |num_unicodes| ints at |unicodes|. The
+// array is a flat list in which a range occupies four consecutive slots --
+// kPreUnicodeRange, lower, upper, kPostUnicodeRange -- and stands for the
+// inclusive range [lower, upper]. Every other slot is a single code point.
+// CHECK-fails on a range whose lower bound exceeds its upper bound.
+void CharProperty::AddCharSpec(const int *unicodes, int num_unicodes) {
+  int pos = 0;
+  while (pos < num_unicodes) {
+    const bool is_range = pos + 3 < num_unicodes &&
+                          unicodes[pos] == kPreUnicodeRange &&
+                          unicodes[pos + 3] == kPostUnicodeRange;
+    if (!is_range) {
+      // Plain code point.
+      AddChar(unicodes[pos]);
+      ++pos;
+      continue;
+    }
+
+    // Range of unicode values: consume all four slots.
+    const int lower = unicodes[pos + 1];
+    const int upper = unicodes[pos + 2];
+    CHECK(lower <= upper) << ": invalid char range in " << name_
+                          << ": [" << UnicodeToString(lower) << ", "
+                          << UnicodeToString(upper) << "]";
+    AddCharRange(lower, upper);
+    pos += 4;
+  }
+}
+
+// Returns true iff the property holds for the code point |c|. Values that are
+// not valid code points never have the property.
+bool CharProperty::HoldsFor(int c) const {
+  if (UniLib::IsValidCodepoint(c)) {
+    // Encode as UTF-8 and look the bytes up in the trie.
+    Rune rune = c;
+    char utf8[UTFmax];
+    runetochar(utf8, &rune);
+    return impl_->HoldsFor(utf8);
+  }
+  return false;
+}
+
+// Returns true iff the property holds for the UTF-8 char in the |len| bytes
+// at |str|. Empty input and input rejected by UniLib::IsUTF8ValidCodepoint
+// (which also checks for structural validity) yield false.
+bool CharProperty::HoldsFor(const char *str, int len) const {
+  if (len <= 0) return false;
+  if (!UniLib::IsUTF8ValidCodepoint(StringPiece(str, len))) return false;
+  return impl_->HoldsFor(str);
+}
+
+// Enumerates the chars of the property. Pass -1 to get the first element,
+// then pass each returned value to get the next one; -1 is returned once the
+// elements are exhausted (or when |c| is not an element). Expects c == -1 or
+// HoldsFor(c).
+// NOTE(review): the elements live in an unordered_set, so they are visited in
+// that container's iteration order, which is not guaranteed to be ascending
+// code point order.
+int CharProperty::NextElementAfter(int c) const {
+  DCHECK(c == -1 || HoldsFor(c));
+  const unordered_set<char32> &elements = impl_->chars;
+  unordered_set<char32>::const_iterator pos;
+  if (c < 0) {
+    pos = elements.begin();
+  } else {
+    const char32 current = c;
+    pos = elements.find(current);
+    if (pos != elements.end()) ++pos;
+  }
+  if (pos == elements.end()) return -1;
+  return *pos;
+}
+
+REGISTER_CLASS_REGISTRY("char property wrapper", CharPropertyWrapper);
+
+// Returns the CharProperty registered under the name |subclass|, or a null
+// pointer (after logging an error) when no such wrapper is registered.
+const CharProperty *CharProperty::Lookup(const char *subclass) {
+  // The wrapper is only a means of reaching the CharProperty; it is destroyed
+  // when this function returns.
+  // NOTE(review): the returned pointer outlives the wrapper, so
+  // GetCharProperty() presumably hands out storage the wrapper does not own
+  // -- confirm against char_properties.h.
+  std::unique_ptr<CharPropertyWrapper> wrapper(
+      CharPropertyWrapper::Create(subclass));
+  if (wrapper) {
+    return wrapper->GetCharProperty();
+  }
+  LOG(ERROR) << "CharPropertyWrapper not found for subclass: "
+             << "\"" << subclass << "\"";
+  return nullptr;
+}
+
+// Check that a given Unicode value is in range: CHECK-fails, naming this
+// property and the offending value, unless |c| passes
+// UniLib::IsValidCodepoint().
+void CharProperty::CheckUnicodeVal(int c) const {
+  CHECK(UniLib::IsValidCodepoint(c))
+      << "Unicode in " << name_ << " out of range: " << UnicodeToString(c);
+}
+
+// Converts a Unicode value to a string (for error messages).
+string CharProperty::UnicodeToString(int c) {
+  const char *fmt;
+
+  if (c < 0) {
+    fmt = "%d";      // out-of-range
+  } else if (c <= 0x7f) {
+    fmt = "'%c'";    // ascii
+  } else if (c <= 0xffff) {
+    fmt = "0x%04X";  // 4 hex digits
+  } else {
+    fmt = "0x%X";    // also out-of-range
+  }
+
+  return tensorflow::strings::Printf(fmt, c);
+}
+
+//======================================================================
+// Expression-level punctuation
+//
+
+// Punctuation that starts a sentence.
+DEFINE_CHAR_PROPERTY_AS_SET(start_sentence_punc,
+  0x00A1,  // Spanish inverted exclamation mark
+  0x00BF,  // Spanish inverted question mark
+)
+
+// Punctuation that ends a sentence.
+// Based on: http://www.unicode.org/unicode/reports/tr29/#Sentence_Boundaries
+DEFINE_CHAR_PROPERTY_AS_SET(end_sentence_punc,
+  '.',
+  '!',
+  '?',
+  0x055C,  // Armenian exclamation mark
+  0x055E,  // Armenian question mark
+  0x0589,  // Armenian full stop
+  0x061F,  // Arabic question mark
+  0x06D4,  // Arabic full stop
+  0x0700,  // Syriac end of paragraph
+  0x0701,  // Syriac supralinear full stop
+  0x0702,  // Syriac sublinear full stop
+  RANGE(0x0964, 0x0965),  // Devanagari danda..Devanagari double danda
+  0x1362,  // Ethiopic full stop
+  0x1367,  // Ethiopic question mark
+  0x1368,  // Ethiopic paragraph separator
+  0x104A,  // Myanmar sign little section
+  0x104B,  // Myanmar sign section
+  0x166E,  // Canadian syllabics full stop
+  0x17d4,  // Khmer sign khan
+  0x1803,  // Mongolian full stop
+  0x1809,  // Mongolian Manchu full stop
+  0x1944,  // Limbu exclamation mark
+  0x1945,  // Limbu question mark
+  0x203C,  // double exclamation mark
+  0x203D,  // interrobang
+  0x2047,  // double question mark
+  0x2048,  // question exclamation mark
+  0x2049,  // exclamation question mark
+  0x3002,  // ideographic full stop
+  0x037E,  // Greek question mark
+  0xFE52,  // small full stop
+  0xFE56,  // small question mark
+  0xFE57,  // small exclamation mark
+  0xFF01,  // fullwidth exclamation mark
+  0xFF0E,  // fullwidth full stop
+  0xFF1F,  // fullwidth question mark
+  0xFF61,  // halfwidth ideographic full stop
+  0x2026,  // ellipsis
+)
+
+// Punctuation, such as parens, that opens a "nested expression" of text.
+DEFINE_CHAR_PROPERTY_AS_SET(open_expr_punc,
+  '(',
+  '[',
+  '<',
+  '{',
+  0x207D,  // superscript left parenthesis
+  0x208D,  // subscript left parenthesis
+  0x27E6,  // mathematical left white square bracket
+  0x27E8,  // mathematical left angle bracket
+  0x27EA,  // mathematical left double angle bracket
+  0x2983,  // left white curly bracket
+  0x2985,  // left white parenthesis
+  0x2987,  // Z notation left image bracket
+  0x2989,  // Z notation left binding bracket
+  0x298B,  // left square bracket with underbar
+  0x298D,  // left square bracket with tick in top corner
+  0x298F,  // left square bracket with tick in bottom corner
+  0x2991,  // left angle bracket with dot
+  0x2993,  // left arc less-than bracket
+  0x2995,  // double left arc greater-than bracket
+  0x2997,  // left black tortoise shell bracket
+  0x29D8,  // left wiggly fence
+  0x29DA,  // left double wiggly fence
+  0x29FC,  // left-pointing curved angle bracket
+  0x3008,  // CJK left angle bracket
+  0x300A,  // CJK left double angle bracket
+  0x3010,  // CJK left black lenticular bracket
+  0x3014,  // CJK left tortoise shell bracket
+  0x3016,  // CJK left white lenticular bracket
+  0x3018,  // CJK left white tortoise shell bracket
+  0x301A,  // CJK left white square bracket
+  0xFD3E,  // Ornate left parenthesis
+  0xFE59,  // small left parenthesis
+  0xFE5B,  // small left curly bracket
+  0xFF08,  // fullwidth left parenthesis
+  0xFF3B,  // fullwidth left square bracket
+  0xFF5B,  // fullwidth left curly bracket
+)
+
+// Punctuation, such as parens, that closes a "nested expression" of text.
+DEFINE_CHAR_PROPERTY_AS_SET(close_expr_punc,
+  ')',
+  ']',
+  '>',
+  '}',
+  0x207E,  // superscript right parenthesis
+  0x208E,  // subscript right parenthesis
+  0x27E7,  // mathematical right white square bracket
+  0x27E9,  // mathematical right angle bracket
+  0x27EB,  // mathematical right double angle bracket
+  0x2984,  // right white curly bracket
+  0x2986,  // right white parenthesis
+  0x2988,  // Z notation right image bracket
+  0x298A,  // Z notation right binding bracket
+  0x298C,  // right square bracket with underbar
+  0x298E,  // right square bracket with tick in top corner
+  0x2990,  // right square bracket with tick in bottom corner
+  0x2992,  // right angle bracket with dot
+  0x2994,  // right arc greater-than bracket
+  0x2996,  // double right arc less-than bracket
+  0x2998,  // right black tortoise shell bracket
+  0x29D9,  // right wiggly fence
+  0x29DB,  // right double wiggly fence
+  0x29FD,  // right-pointing curved angle bracket
+  0x3009,  // CJK right angle bracket
+  0x300B,  // CJK right double angle bracket
+  0x3011,  // CJK right black lenticular bracket
+  0x3015,  // CJK right tortoise shell bracket
+  0x3017,  // CJK right white lenticular bracket
+  0x3019,  // CJK right white tortoise shell bracket
+  0x301B,  // CJK right white square bracket
+  0xFD3F,  // Ornate right parenthesis
+  0xFE5A,  // small right parenthesis
+  0xFE5C,  // small right curly bracket
+  0xFF09,  // fullwidth right parenthesis
+  0xFF3D,  // fullwidth right square bracket
+  0xFF5D,  // fullwidth right curly bracket
+)
+
+// Chars that open a quotation.
+// Based on: http://www.unicode.org/uni2book/ch06.pdf
+// Several chars (e.g. 0x2019, 0x201D, 0x00BB) are listed here and in
+// close_quote as well, because their role differs between languages.
+DEFINE_CHAR_PROPERTY_AS_SET(open_quote,
+  '"',
+  '\'',
+  '`',
+  0xFF07,  // fullwidth apostrophe
+  0xFF02,  // fullwidth quotation mark
+  0x2018,  // left single quotation mark (English, others)
+  0x201C,  // left double quotation mark (English, others)
+  0x201B,  // single high-reversed-9 quotation mark (PropList.txt)
+  0x201A,  // single low-9 quotation mark (Czech, German, Slovak)
+  0x201E,  // double low-9 quotation mark (Czech, German, Slovak)
+  0x201F,  // double high-reversed-9 quotation mark (PropList.txt)
+  0x2019,  // right single quotation mark (Danish, Finnish, Swedish, Norw.)
+  0x201D,  // right double quotation mark (Danish, Finnish, Swedish, Norw.)
+  0x2039,  // single left-pointing angle quotation mark (French, others)
+  0x00AB,  // left-pointing double angle quotation mark (French, others)
+  0x203A,  // single right-pointing angle quotation mark (Slovenian, others)
+  0x00BB,  // right-pointing double angle quotation mark (Slovenian, others)
+  0x300C,  // left corner bracket (East Asian languages)
+  0xFE41,  // presentation form for vertical left corner bracket
+  0xFF62,  // halfwidth left corner bracket (East Asian languages)
+  0x300E,  // left white corner bracket (East Asian languages)
+  0xFE43,  // presentation form for vertical left white corner bracket
+  0x301D,  // reversed double prime quotation mark (East Asian langs, horiz.)
+)
+
+// Chars that close a quotation.
+// Based on: http://www.unicode.org/uni2book/ch06.pdf
+DEFINE_CHAR_PROPERTY_AS_SET(close_quote,
+  '\'',
+  '"',
+  '`',
+  0xFF07,  // fullwidth apostrophe
+  0xFF02,  // fullwidth quotation mark
+  0x2019,  // right single quotation mark (English, others)
+  0x201D,  // right double quotation mark (English, others)
+  0x2018,  // left single quotation mark (Czech, German, Slovak)
+  0x201C,  // left double quotation mark (Czech, German, Slovak)
+  0x203A,  // single right-pointing angle quotation mark (French, others)
+  0x00BB,  // right-pointing double angle quotation mark (French, others)
+  0x2039,  // single left-pointing angle quotation mark (Slovenian, others)
+  0x00AB,  // left-pointing double angle quotation mark (Slovenian, others)
+  0x300D,  // right corner bracket (East Asian languages)
+  0xfe42,  // presentation form for vertical right corner bracket
+  0xFF63,  // halfwidth right corner bracket (East Asian languages)
+  0x300F,  // right white corner bracket (East Asian languages)
+  0xfe44,  // presentation form for vertical right white corner bracket
+  0x301F,  // low double prime quotation mark (East Asian languages)
+  0x301E,  // close double prime (East Asian languages written horizontally)
+)
+
+// Punctuation chars that open an expression or a quotation.
+DEFINE_CHAR_PROPERTY(open_punc, prop) {
+  prop->AddCharProperty("open_expr_punc");
+  prop->AddCharProperty("open_quote");
+}
+
+// Punctuation chars that close an expression or a quotation.
+DEFINE_CHAR_PROPERTY(close_punc, prop) {
+  prop->AddCharProperty("close_expr_punc");
+  prop->AddCharProperty("close_quote");
+}
+
+// Punctuation chars that can come at the beginning of a sentence.
+DEFINE_CHAR_PROPERTY(leading_sentence_punc, prop) {
+  prop->AddCharProperty("open_punc");
+  prop->AddCharProperty("start_sentence_punc");
+}
+
+// Punctuation chars that can come at the end of a sentence.
+DEFINE_CHAR_PROPERTY(trailing_sentence_punc, prop) {
+  prop->AddCharProperty("close_punc");
+  prop->AddCharProperty("end_sentence_punc");
+}
+
+//======================================================================
+// Special symbols
+//
+
+// Currency symbols.
+// From: http://www.unicode.org/charts/PDF/U20A0.pdf
+// The cents sign (0x00A2) is deliberately left out here; it is listed in
+// token_suffix_symbol instead.
+DEFINE_CHAR_PROPERTY_AS_SET(currency_symbol,
+  '$',
+  // 0x00A2,  // cents (NB: typically FOLLOWS the amount)
+  0x00A3,  // pounds and liras
+  0x00A4,  // general currency sign
+  0x00A5,  // yen or yuan
+  0x0192,  // Dutch florin (latin small letter "f" with hook)
+  0x09F2,  // Bengali rupee mark
+  0x09F3,  // Bengali rupee sign
+  0x0AF1,  // Gujarati rupee sign
+  0x0BF9,  // Tamil rupee sign
+  0x0E3F,  // Thai baht
+  0x17DB,  // Khmer riel
+  0x20A0,  // alternative euro sign
+  0x20A1,  // Costa Rica, El Salvador (colon sign)
+  0x20A2,  // Brazilian cruzeiro
+  0x20A3,  // French Franc
+  0x20A4,  // alternative lira sign
+  0x20A5,  // mill sign (USA 1/10 cent)
+  0x20A6,  // Nigerian Naira
+  0x20A7,  // Spanish peseta
+  0x20A8,  // Indian rupee
+  0x20A9,  // Korean won
+  0x20AA,  // Israeli new sheqel
+  0x20AB,  // Vietnam dong
+  0x20AC,  // euro sign
+  0x20AD,  // Laotian kip
+  0x20AE,  // Mongolian tugrik
+  0x20AF,  // Greek drachma
+  0x20B0,  // German penny
+  0x20B1,  // Philippine peso (Mexican peso uses "$")
+  0x2133,  // Old German mark (script capital M)
+  0xFDFC,  // rial sign
+  0xFFE0,  // fullwidth cents
+  0xFFE1,  // fullwidth pounds
+  0xFFE5,  // fullwidth Japanese yen
+  0xFFE6,  // fullwidth Korean won
+)
+
+// Chinese bookquotes.
+// They look like "<<" and ">>" except that they are single UTF8 chars
+// (U+300A, U+300B). These are used in Chinese as special
+// punctuation, referring to the title of a book, an article, a movie,
+// etc.  For example: "cellphone" means cellphone, but <<cellphone>>
+// means (exclusively) the movie.
+DEFINE_CHAR_PROPERTY_AS_SET(open_bookquote,
+ 0x300A
+)
+
+DEFINE_CHAR_PROPERTY_AS_SET(close_bookquote,
+ 0x300B
+)
+
+//======================================================================
+// Token-level punctuation
+//
+
+// Token-prefix symbols, excluding currency symbols -- glom on
+// to following token (esp. if no space after)
+DEFINE_CHAR_PROPERTY_AS_SET(noncurrency_token_prefix_symbol,
+  '#',
+  0x2116,  // numero sign ("No")
+)
+
+// Token-prefix symbols -- glom on to following token (esp. if no space after)
+DEFINE_CHAR_PROPERTY(token_prefix_symbol, prop) {
+  prop->AddCharProperty("currency_symbol");
+  prop->AddCharProperty("noncurrency_token_prefix_symbol");
+}
+
+// Token-suffix symbols -- glom on to preceding token (esp. if no space before)
+DEFINE_CHAR_PROPERTY_AS_SET(token_suffix_symbol,
+  '%',
+  0x066A,  // Arabic percent sign
+  0x2030,  // per mille
+  0x2031,  // per ten thousand
+  0x00A2,  // cents sign
+  0x2125,  // ounces sign
+  0x00AA,  // feminine ordinal indicator (Spanish)
+  0x00BA,  // masculine ordinal indicator (Spanish)
+  0x00B0,  // degrees
+  0x2109,  // degrees Fahrenheit
+  0x2103,  // degrees Celsius
+  0x2126,  // ohms
+  0x212A,  // Kelvin
+  0x212B,  // Angstroms ("A" with circle on top)
+  0x00A9,  // copyright
+  0x2117,  // sound recording copyright (circled "P")
+  0x2122,  // trade mark
+  0x00AE,  // registered trade mark
+  0x2120,  // service mark
+  0x2106,  // cada una ("c/a" == "each" in Spanish)
+  0x2020,  // dagger (can be used for footnotes)
+  0x2021,  // double dagger (can be used for footnotes)
+)
+
+// Subscripts
+DEFINE_CHAR_PROPERTY_AS_SET(subscript_symbol,
+  0x2080,  // subscript 0
+  0x2081,  // subscript 1
+  0x2082,  // subscript 2
+  0x2083,  // subscript 3
+  0x2084,  // subscript 4
+  0x2085,  // subscript 5
+  0x2086,  // subscript 6
+  0x2087,  // subscript 7
+  0x2088,  // subscript 8
+  0x2089,  // subscript 9
+  0x208A,  // subscript "+"
+  0x208B,  // subscript "-"
+  0x208C,  // subscript "="
+  0x208D,  // subscript "("
+  0x208E,  // subscript ")"
+)
+
+// Superscripts
+DEFINE_CHAR_PROPERTY_AS_SET(superscript_symbol,
+  0x2070,  // superscript 0
+  0x00B9,  // superscript 1
+  0x00B2,  // superscript 2
+  0x00B3,  // superscript 3
+  0x2074,  // superscript 4
+  0x2075,  // superscript 5
+  0x2076,  // superscript 6
+  0x2077,  // superscript 7
+  0x2078,  // superscript 8
+  0x2079,  // superscript 9
+  0x2071,  // superscript Latin small "i"
+  0x207A,  // superscript "+"
+  0x207B,  // superscript "-"
+  0x207C,  // superscript "="
+  0x207D,  // superscript "("
+  0x207E,  // superscript ")"
+  0x207F,  // superscript Latin small "n"
+)
+
+//======================================================================
+// General punctuation
+//
+
+// Connector punctuation
+// Code Pc from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+DEFINE_CHAR_PROPERTY_AS_SET(connector_punc,
+  0x30fb,  // Katakana middle dot
+  0xff65,  // halfwidth Katakana middle dot
+  0x2040,  // character tie
+)
+
+// Dashes
+// Code Pd from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+DEFINE_CHAR_PROPERTY_AS_SET(dash_punc,
+  '-',
+  '~',
+  0x058a,  // Armenian hyphen
+  0x1806,  // Mongolian todo soft hyphen
+  RANGE(0x2010, 0x2015),  // hyphen..horizontal bar
+  0x2053,  // swung dash -- from Table 6-3 of Unicode book
+  0x207b,  // superscript minus
+  0x208b,  // subscript minus
+  0x2212,  // minus sign
+  0x301c,  // wave dash
+  0x3030,  // wavy dash
+  RANGE(0xfe31, 0xfe32),  // presentation form for vertical em dash..en dash
+  0xfe58,  // small em dash
+  0xfe63,  // small hyphen-minus
+  0xff0d,  // fullwidth hyphen-minus
+)
+
+// Other punctuation
+// Code Po from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
+// NB: This list is not exhaustive.
+DEFINE_CHAR_PROPERTY_AS_SET(other_punc,
+  ',',
+  ':',
+  ';',
+  0x00b7,  // middle dot
+  0x0387,  // Greek ano teleia
+  0x05c3,  // Hebrew punctuation sof pasuq
+  0x060c,  // Arabic comma
+  0x061b,  // Arabic semicolon
+  0x066b,  // Arabic decimal separator
+  0x066c,  // Arabic thousands separator
+  RANGE(0x0703, 0x70a),  // Syriac contraction and others
+  0x070c,  // Syriac Harklean metobelus
+  0x0e5a,  // Thai character angkhankhu
+  0x0e5b,  // Thai character khomut
+  0x0f08,  // Tibetan mark sbrul shad
+  RANGE(0x0f0d, 0x0f12),  // Tibetan mark shad..Tibetan mark rgya gram shad
+  0x1361,  // Ethiopic wordspace
+  RANGE(0x1363, 0x1366),  // other Ethiopic chars
+  0x166d,  // Canadian syllabics chi sign
+  RANGE(0x16eb, 0x16ed),  // Runic single punctuation..Runic cross punctuation
+  RANGE(0x17d5, 0x17d6),  // Khmer sign camnuc pii huuh and other
+  0x17da,  // Khmer sign koomut
+  0x1802,  // Mongolian comma
+  RANGE(0x1804, 0x1805),  // Mongolian four dots and other
+  0x1808,  // Mongolian manchu comma
+  0x3001,  // ideographic comma
+  RANGE(0xfe50, 0xfe51),  // small comma and others
+  RANGE(0xfe54, 0xfe55),  // small semicolon and other
+  0xff0c,  // fullwidth comma
+  RANGE(0xff0e, 0xff0f),  // fullwidth stop..fullwidth solidus
+  RANGE(0xff1a, 0xff1b),  // fullwidth colon..fullwidth semicolon
+  0xff64,  // halfwidth ideographic comma
+  0x2016,  // double vertical line
+  RANGE(0x2032, 0x2034),  // prime..triple prime
+  0xfe61,  // small asterisk
+  0xfe68,  // small reverse solidus
+  0xff3c,  // fullwidth reverse solidus
+)
+
+// All punctuation.
+// Code P from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+//
+// This is the union of the punctuation properties defined above plus every
+// value in 0..255 accepted by ispunct(). leading_sentence_punc and
+// trailing_sentence_punc already contain open_punc and close_punc, so some
+// chars are added more than once; adding a char again has no further effect.
+DEFINE_CHAR_PROPERTY(punctuation, prop) {
+  prop->AddCharProperty("open_punc");
+  prop->AddCharProperty("close_punc");
+  prop->AddCharProperty("leading_sentence_punc");
+  prop->AddCharProperty("trailing_sentence_punc");
+  prop->AddCharProperty("connector_punc");
+  prop->AddCharProperty("dash_punc");
+  prop->AddCharProperty("other_punc");
+  prop->AddAsciiPredicate(&ispunct);
+}
+
+//======================================================================
+// Separators
+//
+
+// Line separators
+// Code Zl from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+DEFINE_CHAR_PROPERTY_AS_SET(line_separator,
+  0x2028,                           // line separator
+)
+
+// Paragraph separators
+// Code Zp from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+DEFINE_CHAR_PROPERTY_AS_SET(paragraph_separator,
+  0x2029,                           // paragraph separator
+)
+
+// Space separators
+// Code Zs from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+DEFINE_CHAR_PROPERTY_AS_SET(space_separator,
+  0x0020,                           // space
+  0x00a0,                           // no-break space
+  0x1680,                           // Ogham space mark
+  0x180e,                           // Mongolian vowel separator
+  RANGE(0x2000, 0x200a),            // en quad..hair space
+  0x202f,                           // narrow no-break space
+  0x205f,                           // medium mathematical space
+  0x3000,                           // ideographic space
+
+  // Google additions
+  0xe5e5,                           // "private" char used as space in Chinese
+)
+
+// Separators -- all line, paragraph, and space separators.
+// Code Z from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+//
+// On top of the three separator properties this also adds every value in
+// 0..255 accepted by isspace().
+DEFINE_CHAR_PROPERTY(separator, prop) {
+  prop->AddCharProperty("line_separator");
+  prop->AddCharProperty("paragraph_separator");
+  prop->AddCharProperty("space_separator");
+  prop->AddAsciiPredicate(&isspace);
+}
+
+//======================================================================
+// Alphanumeric Characters
+//
+
+// Digits
+DEFINE_CHAR_PROPERTY_AS_SET(digit,
+  RANGE('0', '9'),
+  RANGE(0x0660, 0x0669),  // Arabic-Indic digits
+  RANGE(0x06F0, 0x06F9),  // Eastern Arabic-Indic digits
+)
+
+//======================================================================
+// Japanese Katakana
+//
+
+DEFINE_CHAR_PROPERTY_AS_SET(katakana,
+  0x3099,  // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
+  0x309A,  // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+  0x309B,  // KATAKANA-HIRAGANA VOICED SOUND MARK
+  0x309C,  // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+  RANGE(0x30A0, 0x30FF),  // Fullwidth Katakana
+  RANGE(0xFF65, 0xFF9F),  // Halfwidth Katakana
+)
+
+//======================================================================
+// BiDi Directional Formatting Codes
+//
+
+// See http://www.unicode.org/reports/tr9/ for a description of Bidi
+// and http://www.unicode.org/charts/PDF/U2000.pdf for the character codes.
+DEFINE_CHAR_PROPERTY_AS_SET(directional_formatting_code,
+  0x200E,  // LRM (Left-to-Right Mark)
+  0x200F,  // RLM (Right-to-Left Mark)
+  0x202A,  // LRE (Left-to-Right Embedding)
+  0x202B,  // RLE (Right-to-Left Embedding)
+  0x202C,  // PDF (Pop Directional Format)
+  0x202D,  // LRO (Left-to-Right Override)
+  0x202E,  // RLO (Right-to-Left Override)
+)
+
+//======================================================================
+// Special collections
+//
+
+// Union of the punctuation property and the symbol properties (subscripts,
+// superscripts, token prefix symbols and token suffix symbols).
+// NB: This does not check for all punctuation and symbols in the
+// standard; just those listed in our code. See the definitions earlier in
+// this file.
+DEFINE_CHAR_PROPERTY(punctuation_or_symbol, prop) {
+  prop->AddCharProperty("punctuation");
+  prop->AddCharProperty("subscript_symbol");
+  prop->AddCharProperty("superscript_symbol");
+  prop->AddCharProperty("token_prefix_symbol");
+  prop->AddCharProperty("token_suffix_symbol");
+}
+
+}  // namespace syntaxnet

+ 362 - 0
syntaxnet/syntaxnet/char_properties.h

@@ -0,0 +1,362 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// char_properties.h - define is_X() tests for various character properties
+//
+// Character properties can be defined in two ways:
+//
+// (1) Set-based:
+//
+//     Enumerate the chars that have the property.  Example:
+//
+//       DEFINE_CHAR_PROPERTY_AS_SET(my_fave,
+//         RANGE('0', '9'),
+//         '\'',
+//         0x00BF,   // Spanish inverted question mark
+//       )
+//
+//     Characters are expressed as Unicode code points; note that ascii codes
+//     are a subset.  RANGE() specifies an inclusive range of code points.
+//
+//     This defines two functions:
+//
+//       bool is_my_fave(const char *str, int len)
+//       bool is_my_fave(int c)
+//
+//     Each returns true for precisely the 12 characters specified above.
+//     Each takes a *single* UTF8 char as its argument -- the first expresses
+//     it as a char * and a length, the second as a Unicode code point.
+//     Please do not pass a string of multiple UTF8 chars to the first one.
+//
+//     To make is_my_fave() externally accessible, put in your .h file:
+//
+//       DECLARE_CHAR_PROPERTY(my_fave)
+//
+// (2) Function-based:
+//
+//     Specify a function that assigns the desired chars to a CharProperty
+//     object.  Example:
+//
+//       DEFINE_CHAR_PROPERTY(my_other_fave, prop) {
+//         for (int i = '0'; i <= '9'; i += 2) {
+//           prop->AddChar(i);
+//         }
+//         prop->AddAsciiPredicate(&ispunct);
+//         prop->AddCharProperty("currency_symbol");
+//       }
+//
+//     This defines a function of one arg: CharProperty *prop.  The function
+//     calls various CharProperty methods to populate the prop.  The last call
+//     above, AddCharProperty(), adds the chars from another char property
+//     ("currency_symbol").
+//
+//     As in the set-based case, put a DECLARE_CHAR_PROPERTY(my_other_fave)
+//     in your .h if you want is_my_other_fave() to be externally accessible.
+//
+
+#ifndef SYNTAXNET_CHAR_PROPERTIES_H_
+#define SYNTAXNET_CHAR_PROPERTIES_H_
+
+#include <string>  // for string
+
+#include "syntaxnet/registry.h"
+#include "syntaxnet/utils.h"
+
+// =====================================================================
+// Registry for accessing CharProperties by name
+//
+// This is for internal use by the CharProperty class and macros; callers
+// should not use it explicitly.
+//
+
+namespace syntaxnet {
+
+class CharProperty;   // forward declaration
+
+// Registry entry that hands out a CharProperty.  The registry stores
+// CharPropertyWrapper subclasses (one per property name) rather than
+// CharProperty objects themselves.
+struct CharPropertyWrapper : RegisterableClass<CharPropertyWrapper> {
+  virtual ~CharPropertyWrapper() = default;
+
+  // Returns the CharProperty this wrapper stands for.
+  virtual CharProperty *GetCharProperty() = 0;
+};
+
+// Registers |component| (a CharPropertyWrapper subclass) in the
+// CharPropertyWrapper registry under the name |type|.  Thin alias for
+// REGISTER_CLASS_COMPONENT.
+#define REGISTER_CHAR_PROPERTY_WRAPPER(type, component) \
+  REGISTER_CLASS_COMPONENT(CharPropertyWrapper, type, component)
+
+// Defines a CharPropertyWrapper subclass named <name>CharPropertyWrapper
+// whose GetCharProperty() returns lsp.get(), and registers it under the
+// string #name.  |lsp| must be an object with a get() method returning
+// CharProperty * (the DEFINE_* macros pass a utils::LazyStaticPtr).
+//
+// GetCharProperty() is marked `override` so that a signature drift in the
+// base class becomes a compile error instead of a silent new virtual.
+#define REGISTER_CHAR_PROPERTY(lsp, name)                          \
+  struct name##CharPropertyWrapper : public CharPropertyWrapper {  \
+    CharProperty *GetCharProperty() override { return lsp.get(); } \
+  };                                                               \
+  REGISTER_CHAR_PROPERTY_WRAPPER(#name, name##CharPropertyWrapper)
+
+// =====================================================================
+// Macros for defining character properties
+//
+
+// Define is_X() functions to test whether a single UTF8 character has
+// the 'X' char prop.
+//
+// Expands to two overloads of is_<name>() that forward to lsp->HoldsFor():
+// one taking a UTF-8 encoded char as (str, len), one taking a Unicode code
+// point.  The functions are defined without `static` or `inline`, so the
+// macro must be expanded only once per name in a program.
+#define DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(lsp, name) \
+  bool is_##name(const char *str, int len) {                                 \
+    return lsp->HoldsFor(str, len);                                          \
+  }                                                                          \
+  bool is_##name(int c) {                                                    \
+    return lsp->HoldsFor(c);                                                 \
+  }
+
+// Define a char property by enumerating the unicode char points,
+// or RANGE()s thereof, for which it holds.  Example:
+//
+//   DEFINE_CHAR_PROPERTY_AS_SET(my_fave,
+//     'q',
+//     RANGE('0', '9'),
+//     0x20AB,
+//   )
+//
+// The expansion:
+//   - stores the listed values in a static int array,
+//   - creates a utils::LazyStaticPtr<CharProperty, ...> holding the name,
+//     the array and its length,
+//   - registers the property under #name, and
+//   - defines the two is_<name>() overloads.
+//
+// Uses the standard variadic form (... / __VA_ARGS__) rather than the
+// GNU-only named variadic parameter, so it also builds with compilers that
+// do not implement that extension.
+#define DEFINE_CHAR_PROPERTY_AS_SET(name, ...)                                 \
+  static const int k_##name##_unicodes[] = {__VA_ARGS__};                      \
+  static utils::LazyStaticPtr<CharProperty, const char *, const int *, size_t> \
+      name##_char_property = {#name, k_##name##_unicodes,                      \
+                              arraysize(k_##name##_unicodes)};                 \
+  REGISTER_CHAR_PROPERTY(name##_char_property, name);                          \
+  DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(name##_char_property, name)
+
+// Specify a range (inclusive) of Unicode character values.
+// Example: RANGE('0', '9') specifies the 10 digits.
+// For use as an element in a DEFINE_CHAR_PROPERTY_AS_SET() list.
+//
+// RANGE(lower, upper) expands to four ints in the list:
+//   kPreUnicodeRange, lower, upper, kPostUnicodeRange
+// The two sentinels are negative, so they cannot be mistaken for a code
+// point in the same array.
+static const int kPreUnicodeRange = -1;
+static const int kPostUnicodeRange = -2;
+#define RANGE(lower, upper) \
+  kPreUnicodeRange, lower, upper, kPostUnicodeRange
+
+// A function to initialize a CharProperty.  A pointer to such a function is
+// what the function-based CharProperty constructor takes; the
+// DEFINE_CHAR_PROPERTY macro generates one per property.
+typedef void CharPropertyInitializer(CharProperty *prop);
+
+// Define a char property by specifying a block of code that initializes it.
+// Example:
+//
+//   DEFINE_CHAR_PROPERTY(my_other_fave, prop) {
+//     for (int i = '0'; i <= '9'; i += 2) {
+//       prop->AddChar(i);
+//     }
+//     prop->AddAsciiPredicate(&ispunct);
+//     prop->AddCharProperty("currency_symbol");
+//   }
+//
+// The expansion, in order:
+//   - forward-declares static void init_<name>_char_property(CharProperty *),
+//   - creates a utils::LazyStaticPtr<CharProperty, ...> holding the name and
+//     a pointer to that function,
+//   - registers the property under #name,
+//   - defines the two is_<name>() overloads, and
+//   - ends with the header of init_<name>_char_property, so the braces that
+//     follow the macro invocation become that function's body and
+//     |charpropvar| is the name of its CharProperty * parameter.
+#define DEFINE_CHAR_PROPERTY(name, charpropvar)                       \
+  static void init_##name##_char_property(CharProperty *charpropvar); \
+  static utils::LazyStaticPtr<CharProperty, const char *,             \
+                              CharPropertyInitializer *>              \
+      name##_char_property = {#name, &init_##name##_char_property};   \
+  REGISTER_CHAR_PROPERTY(name##_char_property, name);                 \
+  DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(name##_char_property, name)     \
+  static void init_##name##_char_property(CharProperty *charpropvar)
+
+// =====================================================================
+// Macro for declaring character properties
+//
+
+// Declares the two is_<name>() overloads generated by
+// DEFINE_CHAR_PROPERTY(name, ...) / DEFINE_CHAR_PROPERTY_AS_SET(name, ...),
+// so they can be called from other translation units.
+//
+// The last line deliberately has no trailing backslash: a dangling
+// continuation would splice whatever non-blank line follows the macro into
+// its body.
+#define DECLARE_CHAR_PROPERTY(name)                \
+  extern bool is_##name(const char *str, int len); \
+  extern bool is_##name(int c);
+
+// ===========================================================
+// CharProperty - a property that holds for selected Unicode chars
+//
+// A CharProperty is semantically equivalent to set<char32>.
+//
+// The characters for which a CharProperty holds are represented as a trie,
+// i.e., a tree that is indexed by successive bytes of the UTF-8 encoding
+// of the characters.  This permits fast lookup (HoldsFor).
+//
+
+// A function that defines a subset of [0..255], e.g., isspace.
+// Same shape as the <ctype.h> classification functions: takes an int and
+// returns an int that is treated as a truth value (nonzero == member).
+typedef int AsciiPredicate(int c);
+
+// A named set of Unicode chars.  Instances are normally created through the
+// DEFINE_CHAR_PROPERTY / DEFINE_CHAR_PROPERTY_AS_SET macros rather than
+// constructed directly.
+class CharProperty {
+ public:
+  // Constructor for set-based char properties.
+  // |unicodes| points to |num_unicodes| ints in the format produced by a
+  // DEFINE_CHAR_PROPERTY_AS_SET() list: plain code points, with each RANGE()
+  // expanded to kPreUnicodeRange, lower, upper, kPostUnicodeRange.
+  // NOTE(review): DEFINE_CHAR_PROPERTY_AS_SET forwards the count as size_t
+  // while this parameter is int -- confirm the narrowing is intended.
+  CharProperty(const char *name, const int *unicodes, int num_unicodes);
+
+  // Constructor for function-based char properties.
+  // |init_fn| is the initializer generated by DEFINE_CHAR_PROPERTY.
+  CharProperty(const char *name, CharPropertyInitializer *init_fn);
+
+  virtual ~CharProperty();
+
+  // Various ways of adding chars to a CharProperty; for use only in
+  // CharPropertyInitializer functions.
+  //
+  // AddChar:           adds the single code point |c|.
+  // AddCharRange:      adds the code points from |c1| to |c2|; the unit test
+  //                    relies on both ends being included.
+  // AddAsciiPredicate: adds chars selected by |pred| (e.g. &ispunct).
+  // AddCharProperty:   adds the chars of the char property registered under
+  //                    |name|.
+  // AddCharSpec:       adds chars from an int array in the same format as
+  //                    the set-based constructor (RANGE() entries allowed).
+  void AddChar(int c);
+  void AddCharRange(int c1, int c2);
+  void AddAsciiPredicate(AsciiPredicate *pred);
+  void AddCharProperty(const char *name);
+  void AddCharSpec(const int *unicodes, int num_unicodes);
+
+  // Return true iff the CharProperty holds for a single given UTF8 char.
+  bool HoldsFor(const char *str, int len) const;
+
+  // Return true iff the CharProperty holds for a single given Unicode char.
+  bool HoldsFor(int c) const;
+
+  // You can use this to enumerate the set elements (it was easier
+  // than defining a real iterator).  Returns -1 if there are no more.
+  // Call with -1 to get the first element.  Expects c == -1 or HoldsFor(c).
+  int NextElementAfter(int c) const;
+
+  // Return NULL or the CharProperty with the given name.  Looks up the name
+  // in a CharProperty registry.
+  static const CharProperty *Lookup(const char *name);
+
+ private:
+  // NOTE(review): helpers defined in the .cc; presumably validate a code
+  // point and format it for messages -- confirm against the implementation.
+  void CheckUnicodeVal(int c) const;
+  static string UnicodeToString(int c);
+
+  // Name passed to the constructor.
+  const char *name_;
+
+  // Opaque implementation; the struct is not defined in this header.
+  struct CharPropertyImplementation *impl_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CharProperty);
+};
+
+//======================================================================
+// Expression-level punctuation
+//
+
+// Punctuation that starts a sentence.
+// (The unit test spot-checks U+00A1 and U+00BF.)
+DECLARE_CHAR_PROPERTY(start_sentence_punc);
+
+// Punctuation that ends a sentence.
+// (The unit test spot-checks '.', '!' and '?'.)
+DECLARE_CHAR_PROPERTY(end_sentence_punc);
+
+// Punctuation, such as parens, that opens a "nested expression" of text.
+// (The unit test spot-checks '(' and '['.)
+DECLARE_CHAR_PROPERTY(open_expr_punc);
+
+// Punctuation, such as parens, that closes a "nested expression" of text.
+// (The unit test spot-checks ')' and ']'.)
+DECLARE_CHAR_PROPERTY(close_expr_punc);
+
+// Chars that open a quotation.
+// (The unit test expects the ASCII quotes ' and " to be members.)
+DECLARE_CHAR_PROPERTY(open_quote);
+
+// Chars that close a quotation.
+// (The unit test expects the ASCII quotes ' and " to be members here too.)
+DECLARE_CHAR_PROPERTY(close_quote);
+
+// Punctuation chars that open an expression or a quotation.
+DECLARE_CHAR_PROPERTY(open_punc);
+
+// Punctuation chars that close an expression or a quotation.
+DECLARE_CHAR_PROPERTY(close_punc);
+
+// Punctuation chars that can come at the beginning of a sentence.
+// (The unit test expects the open_expr_punc, open_quote and
+// start_sentence_punc samples above to be members.)
+DECLARE_CHAR_PROPERTY(leading_sentence_punc);
+
+// Punctuation chars that can come at the end of a sentence.
+// (The unit test expects the close_expr_punc, close_quote and
+// end_sentence_punc samples above to be members.)
+DECLARE_CHAR_PROPERTY(trailing_sentence_punc);
+
+//======================================================================
+// Token-level punctuation
+//
+
+// Token-prefix symbols -- glom on to following token
+// (esp. if no space after) -- except for currency symbols.
+// (The unit test spot-checks '#'.)
+DECLARE_CHAR_PROPERTY(noncurrency_token_prefix_symbol);
+
+// Token-prefix symbols -- glom on to following token (esp. if no space after).
+// (The unit test spot-checks '#', '$', U+00A5 and U+20AC.)
+DECLARE_CHAR_PROPERTY(token_prefix_symbol);
+
+// Token-suffix symbols -- glom on to preceding token (esp. if no space
+// before).
+// (The unit test spot-checks '%', U+2122, U+00A9 and U+00B0.)
+DECLARE_CHAR_PROPERTY(token_suffix_symbol);
+
+// Subscripts.
+// (The unit test spot-checks U+2082 and U+2083.)
+DECLARE_CHAR_PROPERTY(subscript_symbol);
+
+// Superscripts.
+// (The unit test spot-checks U+00B2 and U+00B3.)
+DECLARE_CHAR_PROPERTY(superscript_symbol);
+
+//======================================================================
+// General punctuation
+//
+
+// Connector punctuation.
+DECLARE_CHAR_PROPERTY(connector_punc);
+
+// Dashes.
+DECLARE_CHAR_PROPERTY(dash_punc);
+
+// Other punctuation.
+DECLARE_CHAR_PROPERTY(other_punc);
+
+// All punctuation.
+// (The unit test expects every char in [0, 256) for which ispunct() holds
+// to be a member.)
+DECLARE_CHAR_PROPERTY(punctuation);
+
+//======================================================================
+// Special symbols
+//
+
+// Currency symbols.
+// (The unit test spot-checks '$', U+00A5 and U+20AC.)
+DECLARE_CHAR_PROPERTY(currency_symbol);
+
+// Chinese bookquotes: opening (the unit test spot-checks U+300A) and
+// closing (the unit test spot-checks U+300B).
+DECLARE_CHAR_PROPERTY(open_bookquote);
+DECLARE_CHAR_PROPERTY(close_bookquote);
+
+//======================================================================
+// Separators
+//
+
+// Line separators.
+DECLARE_CHAR_PROPERTY(line_separator);
+
+// Paragraph separators.
+DECLARE_CHAR_PROPERTY(paragraph_separator);
+
+// Space separators.
+DECLARE_CHAR_PROPERTY(space_separator);
+
+// Separators -- all line, paragraph, and space separators.
+// (The unit test expects every char in [0, 256) for which isspace() holds
+// to be a member.)
+DECLARE_CHAR_PROPERTY(separator);
+
+//======================================================================
+// Alphanumeric Characters
+//
+
+// Digits: ASCII '0'-'9', U+0660-U+0669 and U+06F0-U+06F9.
+DECLARE_CHAR_PROPERTY(digit);
+
+// Japanese Katakana: U+30A0-U+30FF (fullwidth), U+FF65-U+FF9F (halfwidth)
+// and the sound marks U+3099-U+309C.
+DECLARE_CHAR_PROPERTY(katakana);
+
+//======================================================================
+// BiDi Directional Formatting Codes
+//
+
+// Explicit directional formatting codes (LRM, RLM, LRE, RLE, PDF, LRO, RLO)
+// used by the bidirectional algorithm.
+//
+// Note: Use this only to classify characters. To actually determine
+// directionality of BiDi text, look under i18n/bidi.
+// NOTE(review): i18n/bidi is not among the files touched here -- confirm
+// that the path exists in this repository.
+//
+// See http://www.unicode.org/reports/tr9/ for a description of the algorithm
+// and http://www.unicode.org/charts/PDF/U2000.pdf for the character codes.
+DECLARE_CHAR_PROPERTY(directional_formatting_code);
+
+//======================================================================
+// Special collections
+//
+
+// Union of punctuation, subscript_symbol, superscript_symbol,
+// token_prefix_symbol and token_suffix_symbol.
+//
+// NB: This does not check for all punctuation and symbols in the standard;
+// just those listed in our code. See the definitions in char_properties.cc.
+DECLARE_CHAR_PROPERTY(punctuation_or_symbol);
+
+}  // namespace syntaxnet
+
+#endif  // SYNTAXNET_CHAR_PROPERTIES_H_

+ 364 - 0
syntaxnet/syntaxnet/char_properties_test.cc

@@ -0,0 +1,364 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Tests for char_properties.cc:
+//
+// (1) Test the DEFINE_CHAR_PROPERTY_AS_SET and DEFINE_CHAR_PROPERTY macros
+//     by defining a few fake char properties and verifying their contents.
+//
+// (2) Test the char properties defined in char_properties.cc by spot-checking
+//     a few chars.
+//
+
+#include "syntaxnet/char_properties.h"
+
+#include <ctype.h>  // for ispunct, isspace
+#include <string.h>  // for memcpy
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include <gmock/gmock.h>  // for ContainerEq, EXPECT_THAT
+#include "tensorflow/core/platform/test.h"
+#include "third_party/utf/utf.h"
+#include "util/utf8/unilib.h"  // for IsValidCodepoint, etc
+#include "util/utf8/unilib_utf8_utils.h"
+
+using ::testing::ContainerEq;
+
+namespace syntaxnet {
+
+// Invalid UTF-8 bytes are decoded as the Replacement Character, U+FFFD
+// (which is also Runeerror). Invalid code points are encoded in UTF-8
+// with the UTF-8 representation of the Replacement Character.
+//
+// Exactly three bytes, with no terminating NUL: copy it by length, do not
+// treat it as a C string.
+static const char ReplacementCharacterUTF8[3] = {'\xEF', '\xBF', '\xBD'};
+
+// ====================================================================
+// CharPropertiesTest
+//
+
+// Test fixture that accumulates an expected set of chars ("the collected
+// set") through the Collect*() helpers and then compares a named
+// CharProperty against it through the Expect*() helpers.
+class CharPropertiesTest : public testing::Test {
+ protected:
+  // Adds every char in |chars| to the collected set.
+  void CollectChars(const std::set<char32> &chars) {
+    collected_set_.insert(chars.begin(), chars.end());
+  }
+
+  // Adds the |len| chars of |arr| to the collected set.
+  void CollectArray(const char32 arr[], int len) {
+    collected_set_.insert(arr, arr + len);
+  }
+
+  // Adds every valid code point for which the named CharProperty holds.
+  // Fails the test if no property is registered under |name|.
+  void CollectCharProperty(const char *name) {
+    const CharProperty *prop = CharProperty::Lookup(name);
+    ASSERT_TRUE(prop != nullptr) << "for " << name;
+
+    for (char32 c = 0; c <= 0x10FFFF; ++c) {
+      if (UniLib::IsValidCodepoint(c) && prop->HoldsFor(c)) {
+        collected_set_.insert(c);
+      }
+    }
+  }
+
+  // Adds every c in [0, 256) for which the ascii predicate returns nonzero.
+  void CollectAsciiPredicate(AsciiPredicate *pred) {
+    for (char32 c = 0; c < 256; ++c) {
+      if ((*pred)(c)) {
+        collected_set_.insert(c);
+      }
+    }
+  }
+
+  // Expect the named char property to be true for precisely the chars in
+  // the collected set.
+  void ExpectCharPropertyEqualsCollectedSet(const char *name) {
+    const CharProperty *prop = CharProperty::Lookup(name);
+    ASSERT_TRUE(prop != nullptr) << "for " << name;
+
+    // Test that the char property holds for all collected chars.  Exercises
+    // both signatures of CharProperty::HoldsFor().  Each expectation names
+    // the property and the code point so that a failure inside the loop can
+    // be attributed to a specific char.
+    for (const char32 c : collected_set_) {
+      // UTF-8 (str, len) overload.
+      const string utf8_char = EncodeAsUTF8(&c, 1);
+      EXPECT_TRUE(prop->HoldsFor(utf8_char.c_str(), utf8_char.size()))
+          << "for " << name << ", code point " << static_cast<int>(c);
+
+      // Unicode code point overload.
+      EXPECT_TRUE(prop->HoldsFor(static_cast<int>(c)))
+          << "for " << name << ", code point " << static_cast<int>(c);
+    }
+
+    // Test that the char property holds for precisely the collected chars.
+    // Somewhat redundant with previous test, but exercises
+    // CharProperty::NextElementAfter().
+    std::set<char32> actual_chars;
+    int c = -1;
+    while ((c = prop->NextElementAfter(c)) >= 0) {
+      actual_chars.insert(static_cast<char32>(c));
+    }
+    EXPECT_THAT(actual_chars, ContainerEq(collected_set_))
+        << " for " << name;
+  }
+
+  // Expect the named char property to be true for at least the chars in
+  // the collected set.
+  void ExpectCharPropertyContainsCollectedSet(const char *name) {
+    const CharProperty *prop = CharProperty::Lookup(name);
+    ASSERT_TRUE(prop != nullptr) << "for " << name;
+
+    for (const char32 c : collected_set_) {
+      EXPECT_TRUE(prop->HoldsFor(static_cast<int>(c)))
+          << "for " << name << ", code point " << static_cast<int>(c);
+    }
+  }
+
+  // Returns the concatenated UTF-8 encodings of the |size| chars at |in|.
+  string EncodeAsUTF8(const char32 *in, int size) {
+    string out;
+    out.reserve(size);
+    for (int i = 0; i < size; ++i) {
+      char buf[UTFmax];
+      int len = EncodeAsUTF8Char(*in++, buf);
+      out.append(buf, len);
+    }
+    return out;
+  }
+
+  // Writes the UTF-8 encoding of |in| to |out| and returns the number of
+  // bytes written.  Invalid code points are written as the Replacement
+  // Character.
+  int EncodeAsUTF8Char(char32 in, char *out) {
+    if (UniLib::IsValidCodepoint(in)) {
+      return runetochar(out, &in);
+    }
+    // Length is taken from the array itself so the two cannot drift apart.
+    memcpy(out, ReplacementCharacterUTF8, sizeof(ReplacementCharacterUTF8));
+    return static_cast<int>(sizeof(ReplacementCharacterUTF8));
+  }
+
+ private:
+  // Chars gathered by the Collect*() helpers for the current test.
+  std::set<char32> collected_set_;
+};
+
+//======================================================================
+// Declarations of the sample character sets below
+// (to test the DECLARE_CHAR_PROPERTY() macro)
+//
+
+// Each line declares the is_test_*() overloads for a sample property that is
+// defined further down in this file.
+DECLARE_CHAR_PROPERTY(test_digit);
+DECLARE_CHAR_PROPERTY(test_wavy_dash);
+DECLARE_CHAR_PROPERTY(test_digit_or_wavy_dash);
+DECLARE_CHAR_PROPERTY(test_punctuation_plus);
+
+//======================================================================
+// Definitions of sample character sets
+//
+
+// Digits.  Set-based definition using a single RANGE().
+DEFINE_CHAR_PROPERTY_AS_SET(test_digit,
+  RANGE('0', '9'),
+)
+
+// Wavy dashes.  Set-based definition using individual code points.
+DEFINE_CHAR_PROPERTY_AS_SET(test_wavy_dash,
+  '~',
+  0x301C,  // wave dash
+  0x3030,  // wavy dash
+)
+
+// Digits or wavy dashes.  Function-based definition built from the two
+// properties above.
+DEFINE_CHAR_PROPERTY(test_digit_or_wavy_dash, prop) {
+  prop->AddCharProperty("test_digit");
+  prop->AddCharProperty("test_wavy_dash");
+}
+
+// Punctuation plus a few extraneous chars.  Together the calls add 'a'
+// through 'j' via AddChar, AddCharRange (single-char and multi-char ranges)
+// and AddCharSpec (plain values mixed with a RANGE()), then the real
+// "punctuation" property via AddCharProperty.
+DEFINE_CHAR_PROPERTY(test_punctuation_plus, prop) {
+  prop->AddChar('a');
+  prop->AddCharRange('b', 'b');
+  prop->AddCharRange('c', 'e');
+  static const int kUnicodes[] = {'f', RANGE('g', 'i'), 'j'};
+  prop->AddCharSpec(kUnicodes, arraysize(kUnicodes));
+  prop->AddCharProperty("punctuation");
+}
+
+//====================================================================
+// Another form of the character sets above -- for verification
+//
+
+// Expected contents of test_digit.
+const char32 kTestDigit[] = {
+  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
+};
+
+// Expected contents of test_wavy_dash.
+const char32 kTestWavyDash[] = {
+  '~',
+  0x301C,  // wave dash
+  0x3030,  // wavy dash
+};
+
+// The chars test_punctuation_plus adds on top of "punctuation".
+const char32 kTestPunctuationPlusExtras[] = {
+  'a',
+  'b',
+  'c',
+  'd',
+  'e',
+  'f',
+  'g',
+  'h',
+  'i',
+  'j',
+};
+
+// ====================================================================
+// Tests
+//
+
+// test_digit must hold for exactly the chars in kTestDigit.
+TEST_F(CharPropertiesTest, TestDigit) {
+  CollectArray(kTestDigit, arraysize(kTestDigit));
+  ExpectCharPropertyEqualsCollectedSet("test_digit");
+}
+
+// test_wavy_dash must hold for exactly the chars in kTestWavyDash.
+TEST_F(CharPropertiesTest, TestWavyDash) {
+  CollectArray(kTestWavyDash, arraysize(kTestWavyDash));
+  ExpectCharPropertyEqualsCollectedSet("test_wavy_dash");
+}
+
+// test_digit_or_wavy_dash must be exactly the union of the two sets above.
+TEST_F(CharPropertiesTest, TestDigitOrWavyDash) {
+  CollectArray(kTestDigit, arraysize(kTestDigit));
+  CollectArray(kTestWavyDash, arraysize(kTestWavyDash));
+  ExpectCharPropertyEqualsCollectedSet("test_digit_or_wavy_dash");
+}
+
+// test_punctuation_plus must be exactly "punctuation" plus 'a' through 'j'.
+TEST_F(CharPropertiesTest, TestPunctuationPlus) {
+  CollectCharProperty("punctuation");
+  CollectArray(kTestPunctuationPlusExtras,
+               arraysize(kTestPunctuationPlusExtras));
+  ExpectCharPropertyEqualsCollectedSet("test_punctuation_plus");
+}
+
+// ====================================================================
+// Spot-check predicates in char_properties.cc
+//
+
+// The spot checks below only require the property to CONTAIN the listed
+// chars; they do not pin down its full contents.
+
+TEST_F(CharPropertiesTest, StartSentencePunc) {
+  // U+00A1 inverted exclamation mark, U+00BF inverted question mark.
+  CollectChars({0x00A1, 0x00BF});
+  ExpectCharPropertyContainsCollectedSet("start_sentence_punc");
+}
+
+TEST_F(CharPropertiesTest, EndSentencePunc) {
+  CollectChars({'.', '!', '?'});
+  ExpectCharPropertyContainsCollectedSet("end_sentence_punc");
+}
+
+TEST_F(CharPropertiesTest, OpenExprPunc) {
+  CollectChars({'(', '['});
+  ExpectCharPropertyContainsCollectedSet("open_expr_punc");
+}
+
+TEST_F(CharPropertiesTest, CloseExprPunc) {
+  CollectChars({')', ']'});
+  ExpectCharPropertyContainsCollectedSet("close_expr_punc");
+}
+
+// The ASCII quotes are expected in both open_quote and close_quote.
+TEST_F(CharPropertiesTest, OpenQuote) {
+  CollectChars({'\'', '"'});
+  ExpectCharPropertyContainsCollectedSet("open_quote");
+}
+
+TEST_F(CharPropertiesTest, CloseQuote) {
+  CollectChars({'\'', '"'});
+  ExpectCharPropertyContainsCollectedSet("close_quote");
+}
+
+TEST_F(CharPropertiesTest, OpenBookquote) {
+  // U+300A left double angle bracket.
+  CollectChars({0x300A});
+  ExpectCharPropertyContainsCollectedSet("open_bookquote");
+}
+
+TEST_F(CharPropertiesTest, CloseBookquote) {
+  // U+300B right double angle bracket.
+  CollectChars({0x300B});
+  ExpectCharPropertyContainsCollectedSet("close_bookquote");
+}
+
+// open_punc: the open_expr_punc samples plus the open_quote samples.
+TEST_F(CharPropertiesTest, OpenPunc) {
+  CollectChars({'(', '['});
+  CollectChars({'\'', '"'});
+  ExpectCharPropertyContainsCollectedSet("open_punc");
+}
+
+// close_punc: the close_expr_punc samples plus the close_quote samples.
+TEST_F(CharPropertiesTest, ClosePunc) {
+  CollectChars({')', ']'});
+  CollectChars({'\'', '"'});
+  ExpectCharPropertyContainsCollectedSet("close_punc");
+}
+
+// leading_sentence_punc: the open_punc samples plus the start_sentence_punc
+// samples.
+TEST_F(CharPropertiesTest, LeadingSentencePunc) {
+  CollectChars({'(', '['});
+  CollectChars({'\'', '"'});
+  CollectChars({0x00A1, 0x00BF});
+  ExpectCharPropertyContainsCollectedSet("leading_sentence_punc");
+}
+
+// trailing_sentence_punc: the close_punc samples plus the end_sentence_punc
+// samples.
+TEST_F(CharPropertiesTest, TrailingSentencePunc) {
+  CollectChars({')', ']'});
+  CollectChars({'\'', '"'});
+  CollectChars({'.', '!', '?'});
+  ExpectCharPropertyContainsCollectedSet("trailing_sentence_punc");
+}
+
+TEST_F(CharPropertiesTest, NoncurrencyTokenPrefixSymbol) {
+  CollectChars({'#'});
+  ExpectCharPropertyContainsCollectedSet("noncurrency_token_prefix_symbol");
+}
+
+TEST_F(CharPropertiesTest, TokenSuffixSymbol) {
+  // U+2122 trade mark sign, U+00A9 copyright sign, U+00B0 degree sign.
+  CollectChars({'%', 0x2122, 0x00A9, 0x00B0});
+  ExpectCharPropertyContainsCollectedSet("token_suffix_symbol");
+}
+
+// token_prefix_symbol: the non-currency sample plus the currency samples.
+TEST_F(CharPropertiesTest, TokenPrefixSymbol) {
+  CollectChars({'#'});
+  // U+00A5 yen sign, U+20AC euro sign.
+  CollectChars({'$', 0x00A5, 0x20AC});
+  ExpectCharPropertyContainsCollectedSet("token_prefix_symbol");
+}
+
+TEST_F(CharPropertiesTest, SubscriptSymbol) {
+  // U+2082 subscript two, U+2083 subscript three.
+  CollectChars({0x2082, 0x2083});
+  ExpectCharPropertyContainsCollectedSet("subscript_symbol");
+}
+
+TEST_F(CharPropertiesTest, SuperscriptSymbol) {
+  // U+00B2 superscript two, U+00B3 superscript three.
+  CollectChars({0x00B2, 0x00B3});
+  ExpectCharPropertyContainsCollectedSet("superscript_symbol");
+}
+
+TEST_F(CharPropertiesTest, CurrencySymbol) {
+  // U+00A5 yen sign, U+20AC euro sign.
+  CollectChars({'$', 0x00A5, 0x20AC});
+  ExpectCharPropertyContainsCollectedSet("currency_symbol");
+}
+
+TEST_F(CharPropertiesTest, DirectionalFormattingCode) {
+  // LRM, RLM, LRE, RLE, PDF, LRO, RLO.
+  CollectChars({0x200E, 0x200F, 0x202A, 0x202B, 0x202C, 0x202D, 0x202E});
+  ExpectCharPropertyContainsCollectedSet("directional_formatting_code");
+}
+
+// "punctuation" must contain every c in [0, 256) for which ispunct(c) holds.
+TEST_F(CharPropertiesTest, Punctuation) {
+  CollectAsciiPredicate(ispunct);
+  ExpectCharPropertyContainsCollectedSet("punctuation");
+}
+
+// "separator" must contain every c in [0, 256) for which isspace(c) holds.
+TEST_F(CharPropertiesTest, Separator) {
+  CollectAsciiPredicate(isspace);
+  ExpectCharPropertyContainsCollectedSet("separator");
+}
+
+}  // namespace syntaxnet

+ 4 - 2
syntaxnet/syntaxnet/document_filters.cc

@@ -77,7 +77,8 @@ class DocumentSource : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("batch_size", &batch_size_));
     OP_REQUIRES(context, batch_size_ > 0,
                 InvalidArgument("invalid batch_size provided"));
-    corpus_.reset(new TextReader(*task_context_.GetInput(corpus_name)));
+    corpus_.reset(
+        new TextReader(*task_context_.GetInput(corpus_name), &task_context_));
   }
 
   void Compute(OpKernelContext *context) override {
@@ -124,7 +125,8 @@ class DocumentSink : public OpKernel {
     GetTaskContext(context, &task_context_);
     string corpus_name;
     OP_REQUIRES_OK(context, context->GetAttr("corpus_name", &corpus_name));
-    writer_.reset(new TextWriter(*task_context_.GetInput(corpus_name)));
+    writer_.reset(
+        new TextWriter(*task_context_.GetInput(corpus_name), &task_context_));
   }
 
   void Compute(OpKernelContext *context) override {

+ 2 - 0
syntaxnet/syntaxnet/document_format.h

@@ -38,6 +38,8 @@ class DocumentFormat : public RegisterableClass<DocumentFormat> {
   DocumentFormat() {}
   virtual ~DocumentFormat() {}
 
+  // Optional hook giving the format access to |context|; the default
+  // implementation does nothing.
+  // NOTE(review): presumably invoked by TextReader/TextWriter, which are now
+  // constructed with a TaskContext -- confirm against their implementation.
+  virtual void Setup(TaskContext *context) {}
+
   // Reads a record from the given input buffer with format specific logic.
   // Returns false if no record could be read because we reached end of file.
   virtual bool ReadRecord(tensorflow::io::InputBuffer *buffer,

+ 12 - 1
syntaxnet/syntaxnet/lexicon_builder.cc

@@ -19,6 +19,7 @@ limitations under the License.
 #include "syntaxnet/affix.h"
 #include "syntaxnet/dictionary.pb.h"
 #include "syntaxnet/feature_extractor.h"
+#include "syntaxnet/segmenter_utils.h"
 #include "syntaxnet/sentence.pb.h"
 #include "syntaxnet/sentence_batch.h"
 #include "syntaxnet/term_frequency_map.h"
@@ -75,6 +76,7 @@ class LexiconBuilder : public OpKernel {
     TermFrequencyMap tags;
     TermFrequencyMap categories;
     TermFrequencyMap labels;
+    TermFrequencyMap chars;
 
     // Affix tables to be populated by the corpus.
     AffixTable prefixes(AffixTable::PREFIX, max_prefix_length_);
@@ -87,7 +89,7 @@ class LexiconBuilder : public OpKernel {
     int64 num_tokens = 0;
     int64 num_documents = 0;
     Sentence *document;
-    TextReader corpus(*task_context_.GetInput(corpus_name_));
+    TextReader corpus(*task_context_.GetInput(corpus_name_), &task_context_);
     while ((document = corpus.Read()) != nullptr) {
       // Gather token information.
       for (int t = 0; t < document->token_size(); ++t) {
@@ -114,6 +116,14 @@ class LexiconBuilder : public OpKernel {
         // Add mapping from tag to category.
         tag_to_category.SetCategory(token.tag(), token.category());
 
+        // Add characters.
+        vector<tensorflow::StringPiece> char_sp;
+        SegmenterUtils::GetUTF8Chars(word, &char_sp);
+        for (const auto &c : char_sp) {
+          const string c_str = c.ToString();
+          if (!c_str.empty() && !HasSpaces(c_str)) chars.Increment(c_str);
+        }
+
         // Update the number of processed tokens.
         ++num_tokens;
       }
@@ -131,6 +141,7 @@ class LexiconBuilder : public OpKernel {
     categories.Save(
         TaskContext::InputFile(*task_context_.GetInput("category-map")));
     labels.Save(TaskContext::InputFile(*task_context_.GetInput("label-map")));
+    chars.Save(TaskContext::InputFile(*task_context_.GetInput("char-map")));
 
     // Write affixes to disk.
     WriteAffixTable(prefixes, TaskContext::InputFile(

+ 25 - 1
syntaxnet/syntaxnet/lexicon_builder_test.py

@@ -69,6 +69,8 @@ TOKENIZED_DOCS = u'''बात गलत हो तो गुस्सा से
 लेकिन अभिनेत्री के इस कदम से वहां रंग में भंग पड़ गया ।
 '''
 
+# Space-separated characters that ValidateCharMap expects to find in the
+# generated char-map.
+CHARS = u'''अ इ आ क ग ज ट त द न प भ ब य म र ल व ह स ि ा ु ी े ै ो ् ड़ । ं'''
+
 COMMENTS = u'# Line with fake comments.'
 
 
@@ -93,7 +95,7 @@ class LexiconBuilderTest(test_util.TensorFlowTestCase):
     self.AddInput('documents', self.corpus_file, corpus_format, context)
     for name in ('word-map', 'lcword-map', 'tag-map',
                  'category-map', 'label-map', 'prefix-table',
-                 'suffix-table', 'tag-to-category'):
+                 'suffix-table', 'tag-to-category', 'char-map'):
       self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
     logging.info('Writing context to: %s', self.context_file)
     with open(self.context_file, 'w') as f:
@@ -133,6 +135,26 @@ class LexiconBuilderTest(test_util.TensorFlowTestCase):
       self.assertIn(tag, TAGS)
       self.assertIn(category, CATEGORIES)
 
+  def LoadMap(self, map_name):
+    loaded_map = {}
+    with file(os.path.join(FLAGS.test_tmpdir, map_name), 'r') as f:
+      for line in f:
+        entries = line.strip().split(' ')
+        if len(entries) == 2:
+          loaded_map[entries[0]] = entries[1]
+    return loaded_map
+
+  def ValidateCharMap(self):
+    char_map = self.LoadMap('char-map')
+    self.assertEqual(len(char_map), len(CHARS.split(' ')))
+    for char in CHARS.split(' '):
+      self.assertIn(char.encode('utf-8'), char_map)
+
+  def ValidateWordMap(self):
+    word_map = self.LoadMap('word-map')
+    for word in filter(None, TOKENIZED_DOCS.replace('\n', ' ').split(' ')):
+      self.assertIn(word.encode('utf-8'), word_map)
+
   def BuildLexicon(self):
     with self.test_session():
       gen_parser_ops.lexicon_builder(task_context=self.context_file).run()
@@ -146,6 +168,8 @@ class LexiconBuilderTest(test_util.TensorFlowTestCase):
     self.ValidateDocuments()
     self.BuildLexicon()
     self.ValidateTagToCategoryMap()
+    self.ValidateCharMap()
+    self.ValidateWordMap()
 
   def testCoNLLFormatExtraNewlinesAndComments(self):
     self.WriteContext('conll-sentence')

+ 298 - 0
syntaxnet/syntaxnet/morpher_transitions.cc

@@ -0,0 +1,298 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Morpher transition system.
+//
+// This transition system has one type of action:
+//  - The SHIFT action pushes the next input token to the stack and
+//    advances to the next input token, assigning a morphology label to the
+//    token that was shifted.
+//
+// The transition system operates with parser actions encoded as integers:
+//  - A SHIFT action is encoded as the index (>= 0) of the label it assigns.
+
+#include <string>
+
+#include "syntaxnet/morphology_label_set.h"
+#include "syntaxnet/parser_features.h"
+#include "syntaxnet/parser_state.h"
+#include "syntaxnet/parser_transitions.h"
+#include "syntaxnet/sentence_features.h"
+#include "syntaxnet/shared_store.h"
+#include "syntaxnet/task_context.h"
+#include "syntaxnet/term_frequency_map.h"
+#include "syntaxnet/utils.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace syntaxnet {
+
+// Transition state of the morpher: stores, for every token of the sentence, the
+// predicted and the gold morphology label as indices into a MorphologyLabelSet.
+class MorphologyTransitionState : public ParserTransitionState {
+ public:
+  // Creates an empty state that resolves labels through label_set (not owned).
+  explicit MorphologyTransitionState(const MorphologyLabelSet *label_set)
+      : label_set_(label_set) {}
+
+  // Creates a copy of the given state; the label set pointer is shared.
+  explicit MorphologyTransitionState(const MorphologyTransitionState *state)
+      : MorphologyTransitionState(state->label_set_) {
+    tag_ = state->tag_;
+    gold_tag_ = state->gold_tag_;
+  }
+
+  // Clones the transition state by returning a new object.
+  ParserTransitionState *Clone() const override {
+    return new MorphologyTransitionState(this);
+  }
+
+  // Sizes the tag vectors to the sentence length (new entries are -1) and
+  // reads gold tags for each token.
+  void Init(ParserState *state) override {
+    tag_.resize(state->sentence().token_size(), -1);
+    gold_tag_.resize(state->sentence().token_size(), -1);
+    for (int pos = 0; pos < state->sentence().token_size(); ++pos) {
+      const Token &token = state->GetToken(pos);
+
+      // NOTE: we allow token to not have a TokenMorphology extension or for the
+      // TokenMorphology to be absent from the label_set_ because this can
+      // happen at test time. In that case the gold tag is -1.
+      gold_tag_[pos] = label_set_->LookupExisting(
+          token.GetExtension(TokenMorphology::morphology));
+    }
+  }
+
+  // Returns the tag assigned to a given token, or -1 if index is -1.
+  int Tag(int index) const {
+    // Handle the sentinel before the bounds checks so that -1 is never
+    // compared against the unsigned vector size.
+    if (index == -1) return -1;
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, tag_.size());
+    return tag_[index];
+  }
+
+  // Sets this tag on the token at index.
+  void SetTag(int index, int tag) {
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, tag_.size());
+    tag_[index] = tag;
+  }
+
+  // Returns the gold tag for a given token, or -1 if index is -1.
+  int GoldTag(int index) const {
+    // Same sentinel handling as Tag().
+    if (index == -1) return -1;
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, gold_tag_.size());
+    return gold_tag_[index];
+  }
+
+  // Returns the proto corresponding to the tag, or an empty proto if the tag is
+  // not found.
+  const TokenMorphology &TagAsProto(int tag) const {
+    if (tag >= 0 && tag < label_set_->Size()) {
+      return label_set_->Lookup(tag);
+    }
+    return TokenMorphology::default_instance();
+  }
+
+  // Adds transition state specific annotations to the document: every token
+  // gets the TokenMorphology of its assigned tag (empty if none was assigned).
+  void AddParseToDocument(const ParserState &state, bool rewrite_root_labels,
+                          Sentence *sentence) const override {
+    for (int i = 0; i < tag_.size(); ++i) {
+      Token *token = sentence->mutable_token(i);
+      *token->MutableExtension(TokenMorphology::morphology) =
+          TagAsProto(Tag(i));
+    }
+  }
+
+  // Whether a parsed token should be considered correct for evaluation.
+  bool IsTokenCorrect(const ParserState &state, int index) const override {
+    return GoldTag(index) == Tag(index);
+  }
+
+  // Returns a human readable string representation of this state: the stack
+  // entries as "word[morphology]" followed by the remaining input words.
+  string ToString(const ParserState &state) const override {
+    string str;
+    for (int i = state.StackSize(); i > 0; --i) {
+      const string &word = state.GetToken(state.Stack(i - 1)).word();
+      // Separate stack entries with a single space, without a leading one.
+      if (i != state.StackSize()) str.append(" ");
+      // NOTE(review): the tag index assumes the stack holds tokens
+      // 0..StackSize()-1 in order, as produced by SHIFT-only histories.
+      tensorflow::strings::StrAppend(
+          &str, word, "[",
+          TagAsProto(Tag(state.StackSize() - i)).ShortDebugString(), "]");
+    }
+    for (int i = state.Next(); i < state.NumTokens(); ++i) {
+      tensorflow::strings::StrAppend(&str, " ", state.GetToken(i).word());
+    }
+    return str;
+  }
+
+ private:
+  // Currently assigned morphological analysis for each token in this sentence.
+  vector<int> tag_;
+
+  // Gold morphological analysis from the input document.
+  vector<int> gold_tag_;
+
+  // Label set used for conversions between integer indices and
+  // TokenMorphology protos. Not owned.
+  const MorphologyLabelSet *label_set_ = nullptr;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(MorphologyTransitionState);
+};
+
+// Transition system that assigns one morphology label to each token through a
+// sequence of SHIFT actions.
+class MorphologyTransitionSystem : public ParserTransitionSystem {
+ public:
+  // Releases the reference on the shared label set, if Init() acquired one.
+  ~MorphologyTransitionSystem() override {
+    if (label_set_ != nullptr) SharedStore::Release(label_set_);
+  }
+
+  // Determines label set location.
+  void Setup(TaskContext *context) override {
+    context->GetInput("morph-label-set");
+  }
+
+  // Reads the morphology label set through the shared store.
+  void Init(TaskContext *context) override {
+    const string fname =
+        TaskContext::InputFile(*context->GetInput("morph-label-set"));
+    label_set_ =
+        SharedStoreUtils::GetWithDefaultName<MorphologyLabelSet>(fname);
+  }
+
+  // The SHIFT action uses the same value as the corresponding label index.
+  static ParserAction ShiftAction(int tag) { return tag; }
+
+  // The morpher transition system doesn't look at the dependency tree, so it
+  // allows non-projective trees.
+  bool AllowsNonProjective() const override { return true; }
+
+  // Returns the number of action types.
+  int NumActionTypes() const override { return 1; }
+
+  // Returns the number of possible actions: one SHIFT per label. The
+  // num_labels argument is ignored.
+  int NumActions(int num_labels) const override { return label_set_->Size(); }
+
+  // The default action for a given state is assigning the first label.
+  ParserAction GetDefaultAction(const ParserState &state) const override {
+    return ShiftAction(0);
+  }
+
+  // Returns the next gold action for a given state according to the
+  // underlying annotated sentence. The result is -1 when the gold label of the
+  // next token is not in the label set.
+  ParserAction GetNextGoldAction(const ParserState &state) const override {
+    if (!state.EndOfInput()) {
+      return ShiftAction(TransitionState(state).GoldTag(state.Next()));
+    }
+    return ShiftAction(0);
+  }
+
+  // Checks if the action is allowed in a given parser state.
+  bool IsAllowedAction(ParserAction action,
+                       const ParserState &state) const override {
+    return !state.EndOfInput();
+  }
+
+  // Makes a shift by pushing the next input token on the stack and moving to
+  // the next position.
+  void PerformActionWithoutHistory(ParserAction action,
+                                   ParserState *state) const override {
+    DCHECK(!state->EndOfInput());
+    if (!state->EndOfInput()) {
+      MutableTransitionState(state)->SetTag(state->Next(), action);
+      state->Push(state->Next());
+      state->Advance();
+    }
+  }
+
+  // We are in a final state when we reached the end of the input.
+  bool IsFinalState(const ParserState &state) const override {
+    return state.EndOfInput();
+  }
+
+  // Returns a string representation of a parser action. Actions outside the
+  // label set (e.g. the -1 gold action of an unknown label) print as an empty
+  // analysis instead of failing the bounds CHECK in Lookup().
+  string ActionAsString(ParserAction action,
+                        const ParserState &state) const override {
+    const bool known = action >= 0 && action < label_set_->Size();
+    const TokenMorphology &morph = known
+                                       ? label_set_->Lookup(action)
+                                       : TokenMorphology::default_instance();
+    return tensorflow::strings::StrCat(
+        "SHIFT(", morph.ShortDebugString(), ")");
+  }
+
+  // No state is deterministic in this transition system.
+  bool IsDeterministicState(const ParserState &state) const override {
+    return false;
+  }
+
+  // Returns a new transition state to be used to enhance the parser state.
+  ParserTransitionState *NewTransitionState(bool training_mode) const override {
+    return new MorphologyTransitionState(label_set_);
+  }
+
+  // Downcasts the const ParserTransitionState in ParserState to a const
+  // MorphologyTransitionState.
+  static const MorphologyTransitionState &TransitionState(
+      const ParserState &state) {
+    return *static_cast<const MorphologyTransitionState *>(
+        state.transition_state());
+  }
+
+  // Downcasts the ParserTransitionState in ParserState to an
+  // MorphologyTransitionState.
+  static MorphologyTransitionState *MutableTransitionState(ParserState *state) {
+    return static_cast<MorphologyTransitionState *>(
+        state->mutable_transition_state());
+  }
+
+  // Input for the label set. Not owned. Not used by the code in this class.
+  TaskInput *input_label_set_ = nullptr;
+
+  // Label set used for conversions between integer indices and morphology
+  // labels. Owned through SharedStore; null until Init() has run.
+  const MorphologyLabelSet *label_set_ = nullptr;
+};
+
+REGISTER_TRANSITION_SYSTEM("morpher", MorphologyTransitionSystem);
+
+// Feature function for retrieving the label assigned to a token by the morpher
+// transition system.
+class PredictedMorphTagFeatureFunction : public ParserIndexFeatureFunction {
+ public:
+  PredictedMorphTagFeatureFunction() {}
+
+  // Releases the shared-store reference acquired in Init(), if any.
+  ~PredictedMorphTagFeatureFunction() override {
+    if (label_set_ != nullptr) SharedStore::Release(label_set_);
+  }
+
+  // Determines label set location.
+  void Setup(TaskContext *context) override {
+    context->GetInput("morph-label-set", "recordio", "token-morphology");
+  }
+
+  // Reads the label set and installs a feature type with one value per label.
+  void Init(TaskContext *context) override {
+    const string fname =
+        TaskContext::InputFile(*context->GetInput("morph-label-set"));
+    label_set_ = SharedStore::Get<MorphologyLabelSet>(fname, fname);
+    set_feature_type(new FullLabelFeatureType(name(), label_set_));
+  }
+
+  // Gets the MorphologyTransitionState from the parser state and reads the
+  // assigned tag at the focus index. Returns -1 if the focus is not within the
+  // sentence.
+  FeatureValue Compute(const WorkspaceSet &workspaces, const ParserState &state,
+                       int focus, const FeatureVector *result) const override {
+    if (focus < 0 || focus >= state.sentence().token_size()) return -1;
+    return static_cast<const MorphologyTransitionState *>(
+               state.transition_state())
+        ->Tag(focus);
+  }
+
+ private:
+  // Label set used for conversions between integer indices and morphology
+  // labels. Owned through SharedStore; null until Init() has run.
+  const MorphologyLabelSet *label_set_ = nullptr;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(PredictedMorphTagFeatureFunction);
+};
+
+REGISTER_PARSER_IDX_FEATURE_FUNCTION("pred-morph-tag",
+                                     PredictedMorphTagFeatureFunction);
+
+}  // namespace syntaxnet

+ 91 - 0
syntaxnet/syntaxnet/morphology_label_set.cc

@@ -0,0 +1,91 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/morphology_label_set.h"
+
+namespace syntaxnet {
+
+const char MorphologyLabelSet::kSeparator[] = "\t";
+
+int MorphologyLabelSet::Add(const TokenMorphology &morph) {
+  // An already-known analysis keeps the index it was first given.
+  const string key = StringForMatch(morph);
+  const auto found = fast_lookup_.find(key);
+  if (found != fast_lookup_.end()) {
+    return found->second;
+  }
+
+  // Otherwise append it; its index is its position in label_set_.
+  const int index = label_set_.size();
+  fast_lookup_[key] = index;
+  label_set_.push_back(morph);
+  return index;
+}
+
+// Returns the index of an already-added TokenMorphology, or -1 if absent.
+int MorphologyLabelSet::LookupExisting(const TokenMorphology &morph) const {
+  const auto found = fast_lookup_.find(StringForMatch(morph));
+  return found == fast_lookup_.end() ? -1 : found->second;
+}
+
+// Return the TokenMorphology at position i.  The input i must be in the range
+// [0, Size()); out-of-range values are fatal (see the CHECKs below).
+const TokenMorphology &MorphologyLabelSet::Lookup(int i) const {
+  CHECK_GE(i, 0);
+  CHECK_LT(i, label_set_.size());
+  return label_set_[i];
+}
+
+void MorphologyLabelSet::Read(const string &filename) {
+  ProtoRecordReader reader(filename);  // Source of the label records.
+  Read(&reader);  // Adds every record to this set.
+}
+
+void MorphologyLabelSet::Read(ProtoRecordReader *reader) {
+  TokenMorphology morph;
+  while (reader->Read(&morph).ok()) {  // Stops at the first non-OK read.
+    CHECK_EQ(-1, LookupExisting(morph));  // A duplicate record is fatal.
+    Add(morph);  // Indices follow the order of the records.
+  }
+}
+
+void MorphologyLabelSet::Write(const string &filename) const {
+  ProtoRecordWriter writer(filename);  // Destination for the label records.
+  Write(&writer);  // Emits one record per label.
+}
+
+void MorphologyLabelSet::Write(ProtoRecordWriter *writer) const {
+  for (const TokenMorphology &morph : label_set_) {  // In index order.
+    writer->Write(morph);
+  }
+}
+
+string MorphologyLabelSet::StringForMatch(const TokenMorphology &morph) const {
+  // One "name<sep>value" entry per attribute.
+  vector<string> pairs;
+  for (int i = 0; i < morph.attribute_size(); ++i) {
+    const auto &attr = morph.attribute(i);
+    pairs.push_back(
+        tensorflow::strings::StrCat(attr.name(), kSeparator, attr.value()));
+  }
+
+  // Sorting makes the key independent of the attribute order in the proto.
+  std::sort(pairs.begin(), pairs.end());
+  return utils::Join(pairs, kSeparator);
+}
+
+string FullLabelFeatureType::GetFeatureValueName(FeatureValue value) const {
+  const TokenMorphology &morph = label_set_->Lookup(value);  // CHECKs range.
+  vector<string> attributes;
+  for (const auto &a : morph.attribute()) {
+    attributes.push_back(tensorflow::strings::StrCat(a.name(), ":", a.value()));
+  }
+  std::sort(attributes.begin(), attributes.end());  // Order-independent name.
+  return utils::Join(attributes, ",");  // E.g. "Number:Singular,POS:IMP".
+}
+
+}  // namespace syntaxnet

+ 110 - 0
syntaxnet/syntaxnet/morphology_label_set.h

@@ -0,0 +1,110 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// A class to store the set of possible TokenMorphology objects.  This includes
+// lookup, iteration and serialization.
+
+#ifndef SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
+#define SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
+
+#include <unordered_map>
+#include <string>
+#include <vector>
+
+#include "syntaxnet/proto_io.h"
+#include "syntaxnet/sentence.pb.h"
+
+namespace syntaxnet {
+
+class MorphologyLabelSet {
+ public:
+  // Initializes as an empty label set.
+  MorphologyLabelSet() {}
+
+  // Initializes by reading the given file, which has been saved by Write().
+  // This makes using the shared store easier.
+  explicit MorphologyLabelSet(const string &fname) { Read(fname); }
+
+  // Adds a TokenMorphology to the set if it is not present. In any case, return
+  // its position in the list. Note: This is slow, and should not be called
+  // outside of training or init.
+  int Add(const TokenMorphology &morph);
+
+  // Look up an existing TokenMorphology. If it is not present, return -1.
+  // Note: This is slow, and should not be called outside of training workflow
+  // or init.
+  int LookupExisting(const TokenMorphology &morph) const;
+
+  // Return the TokenMorphology at position i. The input i must be in the
+  // range [0, Size()). Note: this will be called at inference time and needs
+  // to be kept fast.
+  const TokenMorphology &Lookup(int i) const;
+
+  // Return the number of elements.
+  int Size() const { return label_set_.size(); }
+
+  // Deserialization and serialization.
+  void Read(const string &filename);
+  void Write(const string &filename) const;
+
+ private:
+  string StringForMatch(const TokenMorphology &morhp) const;  // Lookup key.
+
+  // Deserialization and serialization implementation.
+  void Read(ProtoRecordReader *reader);
+  void Write(ProtoRecordWriter *writer) const;
+
+  // List of all possible annotations.  This is a unique list, where equality is
+  // defined as follows:
+  //
+  //   a == b iff the set of attribute pairs (attribute, value) is identical.
+  vector<TokenMorphology> label_set_;
+
+  // Because protocol buffer equality is complicated, we implement our own
+  // equality operator based on strings. This unordered_map allows us to do the
+  // lookup more quickly.
+  unordered_map<string, int> fast_lookup_;
+
+  // A separator string that should not occur in any of the attribute names.
+  // This should never be serialized, so that it can be changed in the code if
+  // we change attribute names and it occurs in the new names.
+  static const char kSeparator[];
+};
+
+// A feature type with one value for each complete morphological analysis
+// (analogous to the fulltag analyzer).
+class FullLabelFeatureType : public FeatureType {
+ public:
+  FullLabelFeatureType(const string &name, const MorphologyLabelSet *label_set)
+      : FeatureType(name), label_set_(label_set) {}
+
+  ~FullLabelFeatureType() override {}
+
+  // Converts a feature value to a name.  We don't use StringForMatch, since the
+  // goal of these names is to be readable, even if they might occasionally be
+  // non-unique.
+  string GetFeatureValueName(FeatureValue value) const override;
+
+  // Returns the size of the feature values domain.
+  FeatureValue GetDomainSize() const override { return label_set_->Size(); }
+
+ private:
+  // Not owned.
+  const MorphologyLabelSet *label_set_ = nullptr;
+};
+
+}  // namespace syntaxnet
+
+#endif  // SYNTAXNET_MORPHOLOGY_LABEL_SET_H_

+ 101 - 0
syntaxnet/syntaxnet/morphology_label_set_test.cc

@@ -0,0 +1,101 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/morphology_label_set.h"
+#include "syntaxnet/sentence.pb.h"
+#include <gmock/gmock.h>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace syntaxnet {
+
+class MorphologyLabelSetTest : public ::testing::Test {
+ protected:
+  MorphologyLabelSet label_set_;  // Default-constructed, i.e. empty.
+};
+
+// Test that Add and LookupExisting work as expected.
+TEST_F(MorphologyLabelSetTest, AddLookupExisting) {
+  TokenMorphology si1, si2;  // singular, imperative
+  TokenMorphology pi;        // plural, imperative
+  TokenMorphology six;       // plural (sic), imperative, plus attribute "x"
+  TextFormat::ParseFromString(R"(
+      attribute {name: "Number" value: "Singular"}
+      attribute {name: "POS" value: "IMP"})",
+                                      &si1);
+  TextFormat::ParseFromString(R"(
+      attribute {name: "POS" value: "IMP"}
+      attribute {name: "Number" value: "Singular"})",
+                                      &si2);
+  TextFormat::ParseFromString(R"(
+      attribute {name: "Number" value: "Plural"}
+      attribute {name: "POS" value: "IMP"})",
+                                      &pi);
+  TextFormat::ParseFromString(R"(
+      attribute {name: "Number" value: "Plural"}
+      attribute {name: "POS" value: "IMP"}
+      attribute {name: "x" value: "x"})",
+                                      &six);
+
+  // Check Lookup existing returns -1 for non-existing entries.
+  EXPECT_EQ(-1, label_set_.LookupExisting(si1));
+  EXPECT_EQ(-1, label_set_.LookupExisting(si2));
+  EXPECT_EQ(0, label_set_.Size());
+
+  // Check that adding returns 0 (this is the only possibility given Size())
+  EXPECT_EQ(0, label_set_.Add(si1));
+  EXPECT_EQ(0, label_set_.Add(si1));  // calling Add twice adds only once
+  EXPECT_EQ(1, label_set_.Size());
+
+  // Check that order of attributes does not matter.
+  EXPECT_EQ(0, label_set_.LookupExisting(si2));
+
+  // Check that un-added entries still are not present.
+  EXPECT_EQ(-1, label_set_.LookupExisting(pi));
+  EXPECT_EQ(-1, label_set_.LookupExisting(six));
+
+  // Check that we can add them.
+  EXPECT_EQ(1, label_set_.Add(pi));
+  EXPECT_EQ(2, label_set_.Add(six));
+  EXPECT_EQ(3, label_set_.Size());
+}
+
+// Tests that Write() followed by the file constructor preserves all indices.
+TEST_F(MorphologyLabelSetTest, Serialization) {
+  TokenMorphology si;  // singular, imperative
+  TokenMorphology pi;  // plural, imperative
+  TextFormat::ParseFromString(R"(
+      attribute {name: "Number" value: "Singular"}
+      attribute {name: "POS" value: "IMP"})",
+                                      &si);
+  TextFormat::ParseFromString(R"(
+      attribute {name: "Number" value: "Plural"}
+      attribute {name: "POS" value: "IMP"})",
+                                      &pi);
+  EXPECT_EQ(0, label_set_.Add(si));
+  EXPECT_EQ(1, label_set_.Add(pi));
+
+  // Serialize to a temp file and deserialize into a second label set.
+  string fname = utils::JoinPath({tensorflow::testing::TmpDir(), "label-set"});
+  label_set_.Write(fname);
+  MorphologyLabelSet label_set2(fname);
+  EXPECT_EQ(0, label_set2.LookupExisting(si));
+  EXPECT_EQ(1, label_set2.LookupExisting(pi));
+  EXPECT_EQ(2, label_set2.Size());
+}
+
+}  // namespace syntaxnet

+ 0 - 1
syntaxnet/syntaxnet/parser_eval.py

@@ -22,7 +22,6 @@ import time
 
 import tensorflow as tf
 
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from syntaxnet import sentence_pb2
 from syntaxnet import graph_builder

+ 18 - 0
syntaxnet/syntaxnet/parser_features.cc

@@ -166,6 +166,9 @@ REGISTER_PARSER_IDX_FEATURE_FUNCTION("label", LabelFeatureFunction);
 typedef BasicParserSentenceFeatureFunction<Word> WordFeatureFunction;
 REGISTER_PARSER_IDX_FEATURE_FUNCTION("word", WordFeatureFunction);
 
+typedef BasicParserSentenceFeatureFunction<Char> CharFeatureFunction;
+REGISTER_PARSER_IDX_FEATURE_FUNCTION("char", CharFeatureFunction);
+
 typedef BasicParserSentenceFeatureFunction<Tag> TagFeatureFunction;
 REGISTER_PARSER_IDX_FEATURE_FUNCTION("tag", TagFeatureFunction);
 
@@ -175,6 +178,21 @@ REGISTER_PARSER_IDX_FEATURE_FUNCTION("digit", DigitFeatureFunction);
 typedef BasicParserSentenceFeatureFunction<Hyphen> HyphenFeatureFunction;
 REGISTER_PARSER_IDX_FEATURE_FUNCTION("hyphen", HyphenFeatureFunction);
 
+typedef BasicParserSentenceFeatureFunction<Capitalization>
+    CapitalizationFeatureFunction;
+REGISTER_PARSER_IDX_FEATURE_FUNCTION("capitalization",
+                                     CapitalizationFeatureFunction);
+
+typedef BasicParserSentenceFeatureFunction<PunctuationAmount>
+    PunctuationAmountFeatureFunction;
+REGISTER_PARSER_IDX_FEATURE_FUNCTION("punctuation-amount",
+                                     PunctuationAmountFeatureFunction);
+
+typedef BasicParserSentenceFeatureFunction<Quote>
+    QuoteFeatureFunction;
+REGISTER_PARSER_IDX_FEATURE_FUNCTION("quote",
+                                     QuoteFeatureFunction);
+
 typedef BasicParserSentenceFeatureFunction<PrefixFeature> PrefixFeatureFunction;
 REGISTER_PARSER_IDX_FEATURE_FUNCTION("prefix", PrefixFeatureFunction);
 

+ 4 - 2
syntaxnet/syntaxnet/proto_io.h

@@ -144,7 +144,7 @@ class StdIn : public tensorflow::RandomAccessFile {
 // Reads sentence protos from a text file.
 class TextReader {
  public:
-  explicit TextReader(const TaskInput &input) {
+  explicit TextReader(const TaskInput &input, TaskContext *context) {
     CHECK_EQ(input.record_format_size(), 1)
         << "TextReader only supports inputs with one record format: "
         << input.DebugString();
@@ -153,6 +153,7 @@ class TextReader {
         << input.DebugString();
     filename_ = TaskContext::InputFile(input);
     format_.reset(DocumentFormat::Create(input.record_format(0)));
+    format_->Setup(context);
     Reset();
   }
 
@@ -202,7 +203,7 @@ class TextReader {
 // Writes sentence protos to a text conll file.
 class TextWriter {
  public:
-  explicit TextWriter(const TaskInput &input) {
+  explicit TextWriter(const TaskInput &input, TaskContext *context) {
     CHECK_EQ(input.record_format_size(), 1)
         << "TextWriter only supports files with one record format: "
         << input.DebugString();
@@ -211,6 +212,7 @@ class TextWriter {
         << input.DebugString();
     filename_ = TaskContext::InputFile(input);
     format_.reset(DocumentFormat::Create(input.record_format(0)));
+    format_->Setup(context);
     if (filename_ != "-") {
       TF_CHECK_OK(
           tensorflow::Env::Default()->NewWritableFile(filename_, &file_));

+ 85 - 0
syntaxnet/syntaxnet/segmenter_utils.cc

@@ -0,0 +1,85 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/segmenter_utils.h"
+#include "util/utf8/unicodetext.h"
+#include "util/utf8/unilib.h"
+#include "util/utf8/unilib_utf8_utils.h"
+
+namespace syntaxnet {
+
+// Separators, code Zs from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+const std::unordered_set<int> SegmenterUtils::kBreakChars({
+  0x2028,  // line separator
+  0x2029,  // paragraph separator
+  0x0020,  // space
+  0x00a0,  // no-break space
+  0x1680,  // Ogham space mark
+  0x180e,  // Mongolian vowel separator
+  0x202f,  // narrow no-break space
+  0x205f,  // medium mathematical space
+  0x3000,  // ideographic space
+  0xe5e5,  // Google addition (a private use area code point)
+  0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008,
+  0x2009, 0x200a  // U+2000..U+200A: en quad through hair space
+});
+
+// Splits text into UTF-8 characters and appends one StringPiece per character
+// to chars. The pieces point into text, so text must outlive them.
+void SegmenterUtils::GetUTF8Chars(const string &text,
+                                  vector<tensorflow::StringPiece> *chars) {
+  const char *start = text.c_str();
+  const char *end = text.c_str() + text.size();
+  while (start < end) {
+    int char_length = UniLib::OneCharLen(start);
+    // A truncated multi-byte sequence at the end of malformed input announces
+    // more bytes than are left; clamp so the piece never extends past the
+    // text.
+    const int remaining = end - start;
+    if (char_length > remaining) char_length = remaining;
+    chars->emplace_back(start, char_length);
+    start += char_length;
+  }
+}
+
+// Replaces the tokens of sentence with one token per character of chars and
+// records each character's first and last byte offset within text.
+void SegmenterUtils::SetCharsAsTokens(
+    const string &text,
+    const vector<tensorflow::StringPiece> &chars,
+    Sentence *sentence) {
+  sentence->clear_token();
+  sentence->set_text(text);
+  for (const tensorflow::StringPiece &piece : chars) {
+    Token *token = sentence->add_token();
+    token->set_word(piece.ToString());  // NOLINT
+    int first_byte, last_byte;
+    GetCharStartEndBytes(text, piece, &first_byte, &last_byte);
+    token->set_start(first_byte);
+    token->set_end(last_byte);
+  }
+}
+
+bool SegmenterUtils::IsValidSegment(const Sentence &sentence,
+                                    const Token &token) {
+  // Check that the token is not empty, both by string and by bytes.
+  if (token.word().empty()) return false;
+  if (token.start() > token.end()) return false;
+
+  // Check token boundaries inside of text (end is an inclusive byte offset).
+  if (token.start() < 0) return false;
+  if (token.end() >= sentence.text().size()) return false;
+
+  // Check that the token starts and ends on UTF8 character boundaries.
+  const char s = sentence.text()[token.start()];
+  const char e = sentence.text()[token.end() + 1];  // May index text().size().
+  if (UniLib::IsTrailByte(s)) return false;
+  if (UniLib::IsTrailByte(e)) return false;
+  return true;
+}
+
+}  // namespace syntaxnet

+ 93 - 0
syntaxnet/syntaxnet/segmenter_utils.h

@@ -0,0 +1,93 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef SYNTAXNET_SEGMENTER_UTILS_H_
+#define SYNTAXNET_SEGMENTER_UTILS_H_
+
+#include <string>
+#include <vector>
+#include <unordered_set>
+
+#include "syntaxnet/sentence.pb.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "util/utf8/unicodetext.h"
+
+namespace syntaxnet {
+
+// A set of common convenience functions.
+class SegmenterUtils {
+ public:
+  // Splits text into UTF-8 characters. Each element of chars is a view into
+  // text, so text must outlive chars.
+  static void GetUTF8Chars(const string &text,
+                           vector<tensorflow::StringPiece> *chars);
+
+  // Sets tokens in the sentence so that each token is a single character.
+  // Assigns the start/end byte offsets.
+  //
+  // If the sentence is not empty, the current tokens will be cleared.
+  static void SetCharsAsTokens(const string &text,
+                               const vector<tensorflow::StringPiece> &chars,
+                               Sentence *sentence);
+
+  // Returns true for UTF-8 characters that cannot be 'real' tokens. This is
+  // defined as any whitespace, line break or paragraph break.
+  static bool IsBreakChar(const string &word) {
+    if (word == "\n" || word == "\t") return true;
+    UnicodeText text;
+    text.PointToUTF8(word.c_str(), word.length());
+    CHECK_EQ(text.size(), 1);  // Anything but a single character is fatal.
+    return kBreakChars.find(*text.begin()) != kBreakChars.end();
+  }
+
+  // Returns the break level for the next token based on the current character.
+  static Token::BreakLevel BreakLevel(const string &word) {
+    UnicodeText text;
+    text.PointToUTF8(word.c_str(), word.length());
+    auto point = *text.begin();  // Only the first character is inspected.
+    if (word == "\n" || point == kLineSeparator) {
+      return Token::LINE_BREAK;
+    } else if (point == kParagraphSeparator) {
+      return Token::SENTENCE_BREAK;  // No PARAGRAPH_BREAK in sentence proto.
+    } else if (word == "\t" || kBreakChars.find(point) != kBreakChars.end()) {
+      return Token::SPACE_BREAK;
+    }
+    return Token::NO_BREAK;
+  }
+
+  // Convenience function for computing start/end byte offsets of a character
+  // StringPiece relative to original text.
+  static void GetCharStartEndBytes(const string &text,
+                                   tensorflow::StringPiece c,
+                                   int *start,
+                                   int *end) {
+    *start = c.data() - text.data();  // Assumes c points into text.
+    *end = *start + c.size() - 1;  // Inclusive offset of the last byte of c.
+  }
+
+  // Returns true if this segment is a valid segment. Currently checks:
+  // 1) It is non-empty
+  // 2) Its byte range is inside the text and on UTF8 character boundaries
+  static bool IsValidSegment(const Sentence &sentence, const Token &token);
+
+  // Set for utf8 break characters.
+  static const std::unordered_set<int> kBreakChars;
+  static const int kLineSeparator = 0x2028;
+  static const int kParagraphSeparator = 0x2029;
+};
+
+}  // namespace syntaxnet
+
+#endif  // SYNTAXNET_SEGMENTER_UTILS_H_

+ 149 - 0
syntaxnet/syntaxnet/segmenter_utils_test.cc

@@ -0,0 +1,149 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/segmenter_utils.h"
+
+#include <string>
+#include <vector>
+
+#include "syntaxnet/char_properties.h"
+#include "syntaxnet/sentence.pb.h"
+#include <gmock/gmock.h>
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace syntaxnet {
+
+// Creates a Korean sentence with both its text and its token field set.
+// Token start/end values are inclusive byte offsets into the text.
+static Sentence GetKoSentence() {
+  Sentence sentence;
+  sentence.set_text("서울시는 2012년부터");
+
+  // Appends one token that spans bytes [start, end] of the text.
+  auto add_token = [&sentence](const char *word, int start, int end) {
+    Token *token = sentence.add_token();
+    token->set_word(word);
+    token->set_start(start);
+    token->set_end(end);
+  };
+
+  add_token("서울시", 0, 8);
+  add_token("는", 9, 11);
+  add_token("2012", 13, 16);
+  add_token("년", 17, 19);
+  add_token("부터", 20, 25);
+
+  return sentence;
+}
+
+// For every character in `chars`, appends its inclusive start and end byte
+// offsets within `text` to `starts` and `ends` respectively.
+static void GetStartEndBytes(const string &text,
+                             const vector<tensorflow::StringPiece> &chars,
+                             vector<int> *starts,
+                             vector<int> *ends) {
+  for (const tensorflow::StringPiece &piece : chars) {
+    int first_byte;
+    int last_byte;
+    // GetCharStartEndBytes is static, so no SegmenterUtils instance is needed.
+    SegmenterUtils::GetCharStartEndBytes(text, piece, &first_byte, &last_byte);
+    starts->push_back(first_byte);
+    ends->push_back(last_byte);
+  }
+}
+
+// Tests GetUTF8Chars by checking the inclusive byte span of every character
+// it extracts from the Korean test sentence.
+TEST(SegmenterUtilsTest, GetCharsTest) {
+  // Create test sentence.
+  const Sentence sentence = GetKoSentence();
+  vector<tensorflow::StringPiece> chars;
+  SegmenterUtils::GetUTF8Chars(sentence.text(), &chars);
+
+  // Check the number of characters is correct.
+  CHECK_EQ(chars.size(), 12);
+
+  vector<int> starts;
+  vector<int> ends;
+  GetStartEndBytes(sentence.text(), chars, &starts, &ends);
+
+  // Expected spans: the Hangul syllables take three bytes each, while the
+  // space and the digits take one byte each.
+  const int kNumChars = 12;
+  const int kExpectedStarts[] = {0, 3, 6, 9, 12, 13, 14, 15, 16, 17, 20, 23};
+  const int kExpectedEnds[] = {2, 5, 8, 11, 12, 13, 14, 15, 16, 19, 22, 25};
+
+  // Check start positions.
+  for (int i = 0; i < kNumChars; ++i) {
+    CHECK_EQ(starts[i], kExpectedStarts[i]);
+  }
+
+  // Check end positions.
+  for (int i = 0; i < kNumChars; ++i) {
+    CHECK_EQ(ends[i], kExpectedEnds[i]);
+  }
+}
+
+// Test the SetCharsAsTokens function: every character of the text must become
+// one token carrying that character's word and inclusive byte span, and a
+// second call must replace (not append to) the previously set tokens.
+TEST(SegmenterUtilsTest, SetCharsAsTokensTest) {
+  // Create test sentence.
+  const Sentence sentence = GetKoSentence();
+  vector<tensorflow::StringPiece> chars;
+  SegmenterUtils segment_utils;
+  segment_utils.GetUTF8Chars(sentence.text(), &chars);
+
+  vector<int> starts;
+  vector<int> ends;
+  GetStartEndBytes(sentence.text(), chars, &starts, &ends);
+
+  // Checks that `actual` holds exactly one token per character with the
+  // expected word, start and end. The loop is bounded by the character-level
+  // sentence itself; bounding it by the gold sentence (which has fewer,
+  // multi-character tokens) would leave the trailing tokens unverified.
+  auto check_char_tokens = [&chars, &starts, &ends](const Sentence &actual) {
+    CHECK_EQ(actual.token_size(), chars.size());
+    for (int t = 0; t < actual.token_size(); ++t) {
+      CHECK_EQ(actual.token(t).word(), chars[t]);
+      CHECK_EQ(actual.token(t).start(), starts[t]);
+      CHECK_EQ(actual.token(t).end(), ends[t]);
+    }
+  };
+
+  // Check that the new docs word, start and end positions are properly set.
+  Sentence new_sentence;
+  segment_utils.SetCharsAsTokens(sentence.text(), chars, &new_sentence);
+  check_char_tokens(new_sentence);
+
+  // Re-running should remove the old tokens.
+  segment_utils.SetCharsAsTokens(sentence.text(), chars, &new_sentence);
+  check_char_tokens(new_sentence);
+}
+
+}  // namespace syntaxnet

+ 15 - 0
syntaxnet/syntaxnet/sentence.proto

@@ -59,3 +59,18 @@ message Token {
 
   extensions 1000 to max;
 }
+
+// Stores information about the morphology of a token. It is attached to a
+// Token through the `morphology` extension declared inside this message.
+message TokenMorphology {
+  extend Token {
+    optional TokenMorphology morphology = 63949837;
+  }
+
+  // Morphology is represented by a set of attribute values, each of which is
+  // a (name, value) pair of strings.
+  message Attribute {
+    required string name = 1;
+    required string value = 2;
+  }
+  // This attribute field is designated to hold a single disambiguated analysis.
+  repeated Attribute attribute = 3;
+};

+ 1 - 1
syntaxnet/syntaxnet/sentence_batch.cc

@@ -24,7 +24,7 @@ limitations under the License.
 namespace syntaxnet {
 
 void SentenceBatch::Init(TaskContext *context) {
-  reader_.reset(new TextReader(*context->GetInput(input_name_)));
+  reader_.reset(new TextReader(*context->GetInput(input_name_), context));
   size_ = 0;
 }
 

+ 233 - 3
syntaxnet/syntaxnet/sentence_features.cc

@@ -14,9 +14,11 @@ limitations under the License.
 ==============================================================================*/
 
 #include "syntaxnet/sentence_features.h"
-
+#include "syntaxnet/char_properties.h"
 #include "syntaxnet/registry.h"
 #include "util/utf8/unicodetext.h"
+#include "util/utf8/unilib.h"
+#include "util/utf8/unilib_utf8_utils.h"
 
 namespace syntaxnet {
 
@@ -55,6 +57,83 @@ string TermFrequencyMapFeature::WorkspaceName() const {
                                              min_freq_, max_num_terms_);
 }
 
+// Hands the term map back to the SharedStore, if one was ever acquired.
+TermFrequencyMapSetFeature::~TermFrequencyMapSetFeature() {
+  if (term_map_ == nullptr) return;
+  SharedStore::Release(term_map_);
+  term_map_ = nullptr;
+}
+
+// Requests the term-map input from the task context, with file format "text"
+// and an empty record format.
+void TermFrequencyMapSetFeature::Setup(TaskContext *context) {
+  static const char kFileFormat[] = "text";
+  static const char kRecordFormat[] = "";
+  context->GetInput(input_name_, kFileFormat, kRecordFormat);
+}
+
+// Reads the "min-freq" and "max-num-terms" parameters, resolves the term-map
+// file and obtains the map through SharedStoreUtils. The base-class Init runs
+// last because it calls NumValues(), which needs term_map_ to be set.
+void TermFrequencyMapSetFeature::Init(TaskContext *context) {
+  min_freq_ = GetIntParameter("min-freq", 0);
+  max_num_terms_ = GetIntParameter("max-num-terms", 0);
+
+  TaskInput *term_map_input = context->GetInput(input_name_);
+  file_name_ = context->InputFile(*term_map_input);
+
+  term_map_ = SharedStoreUtils::GetWithDefaultName<TermFrequencyMap>(
+      file_name_, min_freq_, max_num_terms_);
+
+  TokenLookupSetFeature::Init(context);
+}
+
+// Builds the workspace name from a fixed prefix, the input name and the two
+// frequency options.
+string TermFrequencyMapSetFeature::WorkspaceName() const {
+  static const char kPrefix[] = "term-frequency-map-set";
+  return SharedStoreUtils::CreateDefaultName(kPrefix, input_name_, min_freq_,
+                                             max_num_terms_);
+}
+
+namespace {
+
+// Appends one StringPiece per UTF-8 character of `word` to `chars`. Each piece
+// is built from the iterator's utf8_data()/utf8_length(), so it refers to the
+// underlying bytes rather than owning a copy.
+void GetUTF8Chars(const string &word, vector<tensorflow::StringPiece> *chars) {
+  UnicodeText unicode;
+  unicode.PointToUTF8(word.c_str(), word.size());
+  UnicodeText::const_iterator it = unicode.begin();
+  while (it != unicode.end()) {
+    chars->push_back(tensorflow::StringPiece(it.utf8_data(), it.utf8_length()));
+    ++it;
+  }
+}
+
+// Returns the byte length of the first UTF-8 character of `utf8_str`, or 0 if
+// the string starts with the NUL terminator.
+int UTF8FirstLetterNumBytes(const char *utf8_str) {
+  return (*utf8_str == '\0') ? 0 : UniLib::OneCharLen(utf8_str);
+}
+
+}  // namespace
+
+// Fills `values` with the term-map ids of every character ngram of the token's
+// word, for lengths 1..max_char_ngram_length_ at each starting position.
+// Ngrams missing from the map are skipped, and no ngram contains a space.
+void CharNgram::GetTokenIndices(const Token &token, vector<int> *values) const {
+  values->clear();
+
+  // Split the word into UTF-8 characters, optionally wrapped in ^ and $.
+  vector<tensorflow::StringPiece> pieces;
+  if (use_terminators_) pieces.push_back("^");
+  GetUTF8Chars(token.word(), &pieces);
+  if (use_terminators_) pieces.push_back("$");
+
+  for (int begin = 0; begin < pieces.size(); ++begin) {
+    // Grow the ngram one character at a time, looking up every prefix.
+    string ngram;
+    int length = 0;
+    while (length < max_char_ngram_length_ && begin + length < pieces.size()) {
+      const tensorflow::StringPiece piece = pieces[begin + length];
+      if (piece == " ") break;  // Never add char ngrams containing spaces.
+      tensorflow::strings::StrAppend(&ngram, piece);
+      const int id = LookupIndex(ngram);
+      if (id != -1) values->push_back(id);  // Skip unknown values.
+      ++length;
+    }
+  }
+}
+
+// Fills `values` with the term-map id of each "name=value" morphology
+// attribute stored in the token's TokenMorphology extension. Attributes that
+// are not in the map are skipped.
+void MorphologySet::GetTokenIndices(const Token &token,
+                                    vector<int> *values) const {
+  values->clear();
+  const TokenMorphology &morphology =
+      token.GetExtension(TokenMorphology::morphology);
+  for (const TokenMorphology::Attribute &attribute : morphology.attribute()) {
+    const string term = tensorflow::strings::StrCat(attribute.name(), "=",
+                                                    attribute.value());
+    const int id = LookupIndex(term);
+    if (id == -1) continue;  // Skip unknown values.
+    values->push_back(id);
+  }
+}
+
 string Hyphen::GetFeatureValueName(FeatureValue value) const {
   switch (value) {
     case NO_HYPHEN:
@@ -70,6 +149,152 @@ FeatureValue Hyphen::ComputeValue(const Token &token) const {
   return (word.find('-') < word.length() ? HAS_HYPHEN : NO_HYPHEN);
 }
 
+// Reads the "utf8" parameter; UTF-8 mode is enabled only when it is exactly
+// the string "true".
+void Capitalization::Setup(TaskContext *context) {
+  const string utf8_option = GetParameter("utf8");
+  utf8_ = (utf8_option == "true");
+}
+
+// Computes the capitalization category of every token (passing the token's
+// position so that sentence-initial words can be told apart) and stores the
+// results in this feature's VectorIntWorkspace. Does nothing if the workspace
+// is already populated.
+void Capitalization::Preprocess(WorkspaceSet *workspaces,
+                                Sentence *sentence) const {
+  if (workspaces->Has<VectorIntWorkspace>(Workspace())) return;
+
+  const int num_tokens = sentence->token_size();
+  VectorIntWorkspace *categories = new VectorIntWorkspace(num_tokens);
+  for (int focus = 0; focus < num_tokens; ++focus) {
+    const int category = ComputeValueWithFocus(sentence->token(focus), focus);
+    categories->set_element(focus, category);
+  }
+  workspaces->Set<VectorIntWorkspace>(Workspace(), categories);
+}
+
+// Returns the name of the Category enumerator equal to `value`, or
+// "<INVALID>" when `value` matches none of them.
+string Capitalization::GetFeatureValueName(FeatureValue value) const {
+  if (value == LOWERCASE) return "LOWERCASE";
+  if (value == UPPERCASE) return "UPPERCASE";
+  if (value == CAPITALIZED) return "CAPITALIZED";
+  if (value == CAPITALIZED_SENTENCE_INITIAL) {
+    return "CAPITALIZED_SENTENCE_INITIAL";
+  }
+  if (value == NON_ALPHABETIC) return "NON_ALPHABETIC";
+  return "<INVALID>";
+}
+
+// Categorizes the capitalization of `token`'s word using ASCII letter ranges.
+// `focus` is the token's index in the sentence and is only used to split
+// mixed-case words into sentence-initial (focus == 0) and other ones.
+// Aborts with LOG(FATAL) when UTF-8 mode is enabled, as it is unimplemented.
+FeatureValue Capitalization::ComputeValueWithFocus(const Token &token,
+                                                   int focus) const {
+  const string &word = token.word();
+
+  // Record whether any ASCII uppercase / lowercase letter occurs in the word.
+  bool has_upper = false;
+  bool has_lower = false;
+  if (utf8_) {
+    LOG(FATAL) << "Not implemented.";
+  } else {
+    for (const char c : word) {
+      if (c >= 'A' && c <= 'Z') has_upper = true;
+      if (c >= 'a' && c <= 'z') has_lower = true;
+    }
+  }
+
+  // Mixed case: a normal capitalized word. It counts as sentence-initial when
+  // it is the first token of the sentence.
+  if (has_upper && has_lower) {
+    return (focus == 0) ? CAPITALIZED_SENTENCE_INITIAL : CAPITALIZED;
+  }
+
+  // Single-case or letter-free words.
+  if (has_upper) return UPPERCASE;
+  if (has_lower) return LOWERCASE;
+  return NON_ALPHABETIC;
+}
+
+// Returns the name of the Category enumerator equal to `value`, or
+// "<INVALID>" when `value` matches none of them.
+string PunctuationAmount::GetFeatureValueName(FeatureValue value) const {
+  if (value == NO_PUNCTUATION) return "NO_PUNCTUATION";
+  if (value == SOME_PUNCTUATION) return "SOME_PUNCTUATION";
+  if (value == ALL_PUNCTUATION) return "ALL_PUNCTUATION";
+  return "<INVALID>";
+}
+
+// Classifies how much of the token's word is punctuation or symbols:
+//   NO_PUNCTUATION   - no character is punctuation/symbol
+//   SOME_PUNCTUATION - the word mixes both kinds of characters
+//   ALL_PUNCTUATION  - every character is punctuation/symbol; this is also
+//                      the result for an empty word, as the loop never runs.
+// The word is scanned one UTF-8 character at a time and the scan stops as
+// soon as a mix has been detected.
+FeatureValue PunctuationAmount::ComputeValue(const Token &token) const {
+  const string &word = token.word();
+  bool has_punctuation = false;
+  bool all_punctuation = true;
+
+  const char *start = word.c_str();
+  const char *end = word.c_str() + word.size();
+  while (start < end) {
+    int char_length = UTF8FirstLetterNumBytes(start);
+    // An embedded NUL byte is reported with length 0; treat it as a one-byte
+    // character so that the scan always advances and cannot loop forever.
+    if (char_length < 1) char_length = 1;
+    // A truncated multi-byte sequence at the end of the word must not make
+    // the classifier look at bytes beyond the word.
+    const int remaining = static_cast<int>(end - start);
+    if (char_length > remaining) char_length = remaining;
+
+    bool char_is_punct = is_punctuation_or_symbol(start, char_length);
+    all_punctuation &= char_is_punct;
+    has_punctuation |= char_is_punct;
+    if (!all_punctuation && has_punctuation) return SOME_PUNCTUATION;
+    start += char_length;
+  }
+  if (!all_punctuation) return NO_PUNCTUATION;
+  return ALL_PUNCTUATION;
+}
+
+// Returns the name of the Category enumerator equal to `value`, or
+// "<INVALID>" when `value` matches none of them.
+string Quote::GetFeatureValueName(FeatureValue value) const {
+  if (value == NO_QUOTE) return "NO_QUOTE";
+  if (value == OPEN_QUOTE) return "OPEN_QUOTE";
+  if (value == CLOSE_QUOTE) return "CLOSE_QUOTE";
+  if (value == UNKNOWN_QUOTE) return "UNKNOWN_QUOTE";
+  return "<INVALID>";
+}
+
+// Classifies a single token, without sentence context, as an open quote, a
+// close quote, an ambiguous quote (UNKNOWN_QUOTE) or no quote at all.
+FeatureValue Quote::ComputeValue(const Token &token) const {
+  const string &word = token.word();
+
+  // Penn Treebank open and close quotes are multi-character.
+  if (word == "``") return OPEN_QUOTE;
+  if (word == "''") return CLOSE_QUOTE;
+
+  // NOTE(review): length() counts bytes, so only one-byte words reach the
+  // checks below; multi-byte quote characters are reported as NO_QUOTE.
+  // Confirm whether that is intended.
+  if (word.length() != 1) return NO_QUOTE;
+
+  const char *bytes = word.c_str();
+  const int char_len = UTF8FirstLetterNumBytes(bytes);
+  const bool opens = is_open_quote(bytes, char_len);
+  const bool closes = is_close_quote(bytes, char_len);
+  if (opens && closes) return UNKNOWN_QUOTE;
+  if (opens) return OPEN_QUOTE;
+  if (closes) return CLOSE_QUOTE;
+  return NO_QUOTE;
+}
+
+// Computes the quote category of every token and stores the results in this
+// feature's VectorIntWorkspace. Does nothing if the workspace is already
+// populated.
+void Quote::Preprocess(WorkspaceSet *workspaces, Sentence *sentence) const {
+  if (workspaces->Has<VectorIntWorkspace>(Workspace())) return;
+
+  const int num_tokens = sentence->token_size();
+  VectorIntWorkspace *quotes = new VectorIntWorkspace(num_tokens);
+
+  // A double quote " cannot be classified as open or closed without looking
+  // at the prior tokens. inside_quotes is true iff an odd number of such
+  // ambiguous marks have been seen so far in the sentence (similar to the
+  // behavior of some tokenizers).
+  bool inside_quotes = false;
+  for (int i = 0; i < num_tokens; ++i) {
+    int category = ComputeValue(sentence->token(i));
+    if (category == UNKNOWN_QUOTE) {
+      // Resolve from the current state, then flip the state.
+      if (inside_quotes) {
+        category = CLOSE_QUOTE;
+      } else {
+        category = OPEN_QUOTE;
+      }
+      inside_quotes = !inside_quotes;
+    }
+    quotes->set_element(i, category);
+  }
+  workspaces->Set<VectorIntWorkspace>(Workspace(), quotes);
+}
+
 string Digit::GetFeatureValueName(FeatureValue value) const {
   switch (value) {
     case NO_DIGIT:
@@ -130,8 +355,7 @@ static AffixTable *CreateAffixTable(const string &filename,
 void AffixTableFeature::Setup(TaskContext *context) {
   context->GetInput(input_name_, "recordio", "affix-table");
   affix_length_ = GetIntParameter("length", 0);
-  CHECK_GE(affix_length_, 0)
-      << "Length must be specified for affix preprocessor.";
+  CHECK_GE(affix_length_, 0) << "Length must be specified for affix feature.";
   TokenLookupFeature::Setup(context);
 }
 
@@ -181,6 +405,7 @@ REGISTER_CLASS_REGISTRY("sentence+index feature function", SentenceFeature);
 
 // Register the features defined in the header.
 REGISTER_SENTENCE_IDX_FEATURE("word", Word);
+REGISTER_SENTENCE_IDX_FEATURE("char", Char);
 REGISTER_SENTENCE_IDX_FEATURE("lcword", LowercaseWord);
 REGISTER_SENTENCE_IDX_FEATURE("tag", Tag);
 REGISTER_SENTENCE_IDX_FEATURE("offset", Offset);
@@ -188,5 +413,10 @@ REGISTER_SENTENCE_IDX_FEATURE("hyphen", Hyphen);
 REGISTER_SENTENCE_IDX_FEATURE("digit", Digit);
 REGISTER_SENTENCE_IDX_FEATURE("prefix", PrefixFeature);
 REGISTER_SENTENCE_IDX_FEATURE("suffix", SuffixFeature);
+REGISTER_SENTENCE_IDX_FEATURE("char-ngram", CharNgram);
+REGISTER_SENTENCE_IDX_FEATURE("morphology-set", MorphologySet);
+REGISTER_SENTENCE_IDX_FEATURE("capitalization", Capitalization);
+REGISTER_SENTENCE_IDX_FEATURE("punctuation-amount", PunctuationAmount);
+REGISTER_SENTENCE_IDX_FEATURE("quote", Quote);
 
 }  // namespace syntaxnet

+ 329 - 5
syntaxnet/syntaxnet/sentence_features.h

@@ -23,6 +23,7 @@ limitations under the License.
 #include "syntaxnet/affix.h"
 #include "syntaxnet/feature_extractor.h"
 #include "syntaxnet/feature_types.h"
+#include "syntaxnet/segmenter_utils.h"
 #include "syntaxnet/shared_store.h"
 #include "syntaxnet/task_context.h"
 #include "syntaxnet/workspace.h"
@@ -85,6 +86,88 @@ class TokenLookupFeature : public SentenceFeature {
     return workspaces.Get<VectorIntWorkspace>(workspace_).element(focus);
   }
 
+  // Returns the workspace id stored in workspace_.
+  int Workspace() const { return workspace_; }
+
+ private:
+  int workspace_;
+};
+
+// A multi purpose specialization of the feature for tokens that map to a set
+// of values rather than a single one. Preprocessing looks up a value set for
+// each token of a Sentence and caches it in a VectorVectorIntWorkspace;
+// evaluation emits every cached value of the focus token. On top of the
+// NumValues() base values, the feature type registers one extra value named
+// "<OUTSIDE>".
+class TokenLookupSetFeature : public SentenceFeature {
+ public:
+  // Creates the feature type; NumValues() must already be usable.
+  void Init(TaskContext *context) override {
+    set_feature_type(new ResourceBasedFeatureType<TokenLookupSetFeature>(
+        name(), this, {{NumValues(), "<OUTSIDE>"}}));
+  }
+
+  // Number of unique values.
+  virtual int64 NumValues() const = 0;
+
+  // Given a position in a sentence and workspaces, looks up the corresponding
+  // feature value set. The index is relative to the start of the sentence.
+  virtual void LookupToken(const WorkspaceSet &workspaces,
+                           const Sentence &sentence, int index,
+                           vector<int> *values) const = 0;
+
+  // Given a feature value, returns a string representation.
+  virtual string GetFeatureValueName(int value) const = 0;
+
+  // Name of the shared workspace.
+  virtual string WorkspaceName() const = 0;
+
+  // TokenLookupSetFeatures use VectorVectorIntWorkspaces by default.
+  void RequestWorkspaces(WorkspaceRegistry *registry) override {
+    workspace_ = registry->Request<VectorVectorIntWorkspace>(WorkspaceName());
+  }
+
+  // Default preprocessing: looks up a value set for each token in the
+  // Sentence, unless the workspace has already been populated.
+  void Preprocess(WorkspaceSet *workspaces, Sentence *sentence) const override {
+    if (workspaces->Has<VectorVectorIntWorkspace>(workspace_)) return;
+    const int num_tokens = sentence->token_size();
+    VectorVectorIntWorkspace *value_sets =
+        new VectorVectorIntWorkspace(num_tokens);
+    for (int index = 0; index < num_tokens; ++index) {
+      vector<int> *token_values = value_sets->mutable_elements(index);
+      LookupToken(*workspaces, *sentence, index, token_values);
+    }
+    workspaces->Set<VectorVectorIntWorkspace>(workspace_, value_sets);
+  }
+
+  // Returns the pre-computed value set of the token at `focus` from the
+  // cache. This assumes the cache is populated; `focus` must be in range.
+  const vector<int> &GetCachedValueSet(const WorkspaceSet &workspaces,
+                                       const Sentence &sentence,
+                                       int focus) const {
+    // Do bounds checking on focus.
+    CHECK_GE(focus, 0);
+    CHECK_LT(focus, sentence.token_size());
+
+    const VectorVectorIntWorkspace &value_sets =
+        workspaces.Get<VectorVectorIntWorkspace>(workspace_);
+    return value_sets.elements(focus);
+  }
+
+  // Adds any precomputed features at the given focus, if present. A focus
+  // outside the sentence adds nothing.
+  void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
+                int focus, FeatureVector *result) const override {
+    const bool inside = focus >= 0 && focus < sentence.token_size();
+    if (!inside) return;
+    for (const int &value : GetCachedValueSet(workspaces, sentence, focus)) {
+      result->add(this->feature_type(), value);
+    }
+  }
+
+  // Returns NumValues() for a focus outside the sentence.
+  // NOTE(review): for a focus inside the sentence this reads workspace_ as a
+  // VectorIntWorkspace although RequestWorkspaces() registered it as a
+  // VectorVectorIntWorkspace. Evaluate() above does not go through this
+  // method; confirm whether any caller does before relying on it.
+  FeatureValue Compute(const WorkspaceSet &workspaces, const Sentence &sentence,
+                       int focus, const FeatureVector *result) const override {
+    if (focus < 0 || focus >= sentence.token_size()) return NumValues();
+    return workspaces.Get<VectorIntWorkspace>(workspace_).element(focus);
+  }
+
  private:
   int workspace_;
 };
@@ -134,6 +217,83 @@ class TermFrequencyMapFeature : public TokenLookupFeature {
   int max_num_terms_;
 };
 
+// Specialization of the TokenLookupSetFeature class that uses a
+// TermFrequencyMap to perform the mapping. Init() reads two parameters:
+// "min-freq" and "max-num-terms", which are passed on when the term map is
+// obtained from the shared store.
+class TermFrequencyMapSetFeature : public TokenLookupSetFeature {
+ public:
+  // Only records the input name. The workspace name cannot be computed yet
+  // because it depends on options that are read in Init().
+  explicit TermFrequencyMapSetFeature(const string &input_name)
+      : input_name_(input_name), min_freq_(0), max_num_terms_(0) {}
+
+  // Releases shared resources.
+  ~TermFrequencyMapSetFeature() override;
+
+  // Fills `values` with the term indices computed for `token`.
+  virtual void GetTokenIndices(const Token &token,
+                               vector<int> *values) const = 0;
+
+  // Requests the resource inputs.
+  void Setup(TaskContext *context) override;
+
+  // Obtains resources using the shared store. At this point options are known
+  // so the full name can be computed.
+  void Init(TaskContext *context) override;
+
+  // Number of unique values.
+  int64 NumValues() const override { return term_map_->Size(); }
+
+  // Special value for strings not in the map.
+  FeatureValue UnknownValue() const { return term_map_->Size(); }
+
+  // Gets pointer to the underlying map.
+  const TermFrequencyMap *term_map() const { return term_map_; }
+
+  // Returns the index of `term` in the map, or -1 if the term is not in it.
+  // Used inside GetTokenIndices() specializations for convenience.
+  int LookupIndex(const string &term) const {
+    const int kNotFound = -1;
+    return term_map_->LookupIndex(term, kNotFound);
+  }
+
+  // Given a position in a sentence and workspaces, looks up the corresponding
+  // feature value set. The index is relative to the start of the sentence.
+  void LookupToken(const WorkspaceSet &workspaces, const Sentence &sentence,
+                   int index, vector<int> *values) const override {
+    const Token &token = sentence.token(index);
+    GetTokenIndices(token, values);
+  }
+
+  // Uses the TermFrequencyMap to lookup the string associated with a value.
+  // Logs an error and returns "<INVALID>" for values that are neither the
+  // unknown value nor inside [0, NumValues()).
+  string GetFeatureValueName(int value) const override {
+    if (value == UnknownValue()) return "<UNKNOWN>";
+    const bool in_range = value >= 0 && value < NumValues();
+    if (!in_range) {
+      LOG(ERROR) << "Invalid feature value: " << value;
+      return "<INVALID>";
+    }
+    return term_map_->GetTerm(value);
+  }
+
+  // Name of the shared workspace.
+  string WorkspaceName() const override;
+
+ private:
+  // Pointer to the map obtained from the shared store in Init(); handed back
+  // to the store by the destructor.
+  const TermFrequencyMap *term_map_ = nullptr;
+
+  // Name of the input for the term map.
+  string input_name_;
+
+  // Filename of the underlying resource.
+  string file_name_;
+
+  // Minimum frequency for term map.
+  int min_freq_;
+
+  // Maximum number of terms for term map.
+  int max_num_terms_;
+};
+
 class Word : public TermFrequencyMapFeature {
  public:
   Word() : TermFrequencyMapFeature("word-map") {}
@@ -144,6 +304,36 @@ class Word : public TermFrequencyMapFeature {
   }
 };
 
+// Feature over the "char-map" term map, applied to a token's word. Two ids
+// beyond the map are reserved: one shared by all break characters and one
+// for non-break strings that are missing from the map.
+class Char : public TermFrequencyMapFeature {
+ public:
+  Char() : TermFrequencyMapFeature("char-map") {}
+
+  // Returns the break id for break characters, otherwise the map index of the
+  // word (or the unknown id when the word is not in the map).
+  FeatureValue ComputeValue(const Token &token) const override {
+    const string &character = token.word();
+    if (!SegmenterUtils::IsBreakChar(character)) {
+      return term_map().LookupIndex(character, UnknownValue());
+    }
+    return BreakCharValue();
+  }
+
+  // Special value for breaks.
+  FeatureValue BreakCharValue() const { return term_map().Size(); }
+
+  // Special value for non-break strings not in the map.
+  FeatureValue UnknownValue() const { return term_map().Size() + 1; }
+
+  // Number of unique values.
+  int64 NumValues() const override { return term_map().Size() + 2; }
+
+  // Returns the term for a map index, a placeholder for the two reserved
+  // ids, or "<INVALID>" (after logging an error) for anything else.
+  string GetFeatureValueName(FeatureValue value) const override {
+    if (value == BreakCharValue()) return "<BREAK_CHAR>";
+    if (value == UnknownValue()) return "<UNKNOWN>";
+    const bool in_map = value >= 0 && value < term_map().Size();
+    if (!in_map) {
+      LOG(ERROR) << "Invalid feature value: " << value;
+      return "<INVALID>";
+    }
+    return term_map().GetTerm(value);
+  }
+};
+
 class LowercaseWord : public TermFrequencyMapFeature {
  public:
   LowercaseWord() : TermFrequencyMapFeature("lc-word-map") {}
@@ -172,6 +362,47 @@ class Label : public TermFrequencyMapFeature {
   }
 };
 
+// Set-valued feature that maps a token to the ids of its character ngrams,
+// looked up in the "char-ngram-map" term map.
+class CharNgram : public TermFrequencyMapSetFeature {
+ public:
+  CharNgram() : TermFrequencyMapSetFeature("char-ngram-map") {}
+  ~CharNgram() override {}
+
+  // Requests the term-map input and reads the ngram options from the task
+  // context, falling back to a maximum length of 3 and no terminators.
+  void Setup(TaskContext *context) override {
+    TermFrequencyMapSetFeature::Setup(context);
+    const int max_length = context->Get("lexicon_max_char_ngram_length", 3);
+    const bool terminators =
+        context->Get("lexicon_char_ngram_include_terminators", false);
+    max_char_ngram_length_ = max_length;
+    use_terminators_ = terminators;
+  }
+
+  // Fills `values` with the ids of the character ngrams of the token's word.
+  void GetTokenIndices(const Token &token, vector<int> *values) const override;
+
+ private:
+  // Maximum length (n) of the extracted ngrams.
+  int max_char_ngram_length_ = 3;
+
+  // Whether to pad the word with ^ and $ before extracting ngrams.
+  bool use_terminators_ = false;
+};
+
+// Set-valued feature over the "morphology-map" term map: maps a token to the
+// ids of its morphology attributes.
+class MorphologySet : public TermFrequencyMapSetFeature {
+ public:
+  MorphologySet() : TermFrequencyMapSetFeature("morphology-map") {}
+  ~MorphologySet() override {}
+
+  // Requests the term-map input; no extra options are read.
+  void Setup(TaskContext *context) override {
+    TermFrequencyMapSetFeature::Setup(context);
+  }
+
+
+  // NOTE(review): reports one value fewer than the size of the term map,
+  // unlike the base class, which reports the full size. The reason is not
+  // visible here; confirm against the code that builds the morphology map.
+  int64 NumValues() const override {
+    return term_map()->Size() - 1;
+  }
+
+  // Fills `values` with the ids of the token's morphology attributes.
+  void GetTokenIndices(const Token &token, vector<int> *values) const override;
+};
+
 class LexicalCategoryFeature : public TokenLookupFeature {
  public:
   LexicalCategoryFeature(const string &name, int cardinality)
@@ -180,7 +411,7 @@ class LexicalCategoryFeature : public TokenLookupFeature {
 
   FeatureValue NumValues() const override { return cardinality_; }
 
-  // Returns the identifier for the workspace for this preprocessor.
+  // Returns the identifier for the workspace for this feature.
   string WorkspaceName() const override {
     return tensorflow::strings::StrCat(name_, ":", cardinality_);
   }
@@ -193,7 +424,7 @@ class LexicalCategoryFeature : public TokenLookupFeature {
   const int cardinality_;
 };
 
-// Preprocessor that computes whether a word has a hyphen or not.
+// Feature that computes whether a word has a hyphen or not.
 class Hyphen : public LexicalCategoryFeature {
  public:
   // Enumeration of values.
@@ -213,7 +444,100 @@ class Hyphen : public LexicalCategoryFeature {
   FeatureValue ComputeValue(const Token &token) const override;
 };
 
-// Preprocessor that computes whether a word has a hyphen or not.
+// Feature that categorizes the capitalization of the word. The option "utf8"
+// is read in Setup(); it is meant to switch the lowercase and uppercase checks
+// to UTF8 compliant functions, but ComputeValueWithFocus() currently aborts
+// with "Not implemented." when it is set, so only ASCII letters are handled.
+class Capitalization : public LexicalCategoryFeature {
+ public:
+  // Enumeration of values.
+  enum Category {
+    LOWERCASE = 0,                     // has lowercase letters only
+    UPPERCASE = 1,                     // all-caps
+    CAPITALIZED = 2,                   // has both upper and lowercase letters
+    CAPITALIZED_SENTENCE_INITIAL = 3,  // same as above but sentence-initial
+    NON_ALPHABETIC = 4,                // contains no alphabetic characters
+    CARDINALITY = 5,
+  };
+
+  // Default constructor.
+  Capitalization() : LexicalCategoryFeature("capitalization", CARDINALITY) {}
+
+  // Reads the "utf8" option for the capitalization checks.
+  void Setup(TaskContext *context) override;
+
+  // Capitalization needs special preprocessing because token category can
+  // depend on whether the token is at the start of the sentence.
+  void Preprocess(WorkspaceSet *workspaces, Sentence *sentence) const override;
+
+  // Returns a string representation of the enum value.
+  string GetFeatureValueName(FeatureValue value) const override;
+
+  // Not supported for this feature: the category depends on the token's
+  // position, so this aborts. Use ComputeValueWithFocus() instead.
+  FeatureValue ComputeValue(const Token &token) const override {
+    LOG(FATAL) << "Capitalization should use ComputeValueWithFocus.";
+    return 0;
+  }
+
+  // Returns the category value for the token, where `focus` is the index of
+  // the token in its sentence.
+  FeatureValue ComputeValueWithFocus(const Token &token, int focus) const;
+
+ private:
+  // Whether to use UTF8 compliant functions to check capitalization.
+  bool utf8_ = false;
+};
+
+// A ternary feature describing how much of the focus token is punctuation:
+// none of its characters, some of them, or all of them.
+class PunctuationAmount : public LexicalCategoryFeature {
+ public:
+  // Enumeration of values.
+  enum Category {
+    NO_PUNCTUATION = 0,
+    SOME_PUNCTUATION = 1,
+    ALL_PUNCTUATION = 2,
+    CARDINALITY = 3,
+  };
+
+  // Default constructor.
+  PunctuationAmount()
+      : LexicalCategoryFeature("punctuation-amount", CARDINALITY) {}
+
+  // Returns a string representation of the enum value.
+  string GetFeatureValueName(FeatureValue value) const override;
+
+  // Returns the category value for the token.
+  FeatureValue ComputeValue(const Token &token) const override;
+};
+
+// A feature that returns whether the word is an open or close quotation
+// mark. Ambiguous marks are resolved in Preprocess() based on their position
+// relative to the other ambiguous quotation marks in the sentence.
+class Quote : public LexicalCategoryFeature {
+ public:
+  // Enumeration of values.
+  enum Category {
+    NO_QUOTE = 0,
+    OPEN_QUOTE = 1,
+    CLOSE_QUOTE = 2,
+    UNKNOWN_QUOTE = 3,
+    CARDINALITY = 4,
+  };
+
+  // Default constructor.
+  Quote() : LexicalCategoryFeature("quote", CARDINALITY) {}
+
+  // Returns a string representation of the enum value.
+  string GetFeatureValueName(FeatureValue value) const override;
+
+  // Returns the category value for the token on its own, without sentence
+  // context; ambiguous marks are reported as UNKNOWN_QUOTE.
+  FeatureValue ComputeValue(const Token &token) const override;
+
+  // Override preprocess to compute open and close quotes from prior context of
+  // the sentence.
+  void Preprocess(WorkspaceSet *workspaces, Sentence *instance) const override;
+};
+
+// Feature that computes whether a word has digits or not.
 class Digit : public LexicalCategoryFeature {
  public:
   // Enumeration of values.
@@ -234,9 +558,9 @@ class Digit : public LexicalCategoryFeature {
   FeatureValue ComputeValue(const Token &token) const override;
 };
 
-// TokenLookupPreprocessor object to compute prefixes and suffixes of words. The
+// TokenLookupFeature object to compute prefixes and suffixes of words. The
 // AffixTable is stored in the SharedStore. This is very similar to the
-// implementation of TermFrequencyMapPreprocessor, but using an AffixTable to
+// implementation of TermFrequencyMapFeature, but using an AffixTable to
 // perform the lookups. There are only two specializations, for prefixes and
 // suffixes.
 class AffixTableFeature : public TokenLookupFeature {

+ 123 - 4
syntaxnet/syntaxnet/sentence_features_test.cc

@@ -26,6 +26,7 @@ limitations under the License.
 #include "syntaxnet/utils.h"
 #include "syntaxnet/workspace.h"
 #include <gmock/gmock.h>
+#include "tensorflow/core/platform/test.h"
 
 using testing::UnorderedElementsAreArray;
 
@@ -83,6 +84,27 @@ class SentenceFeaturesTest : public ::testing::Test {
     return values;
   }
 
+  // Appends a new part, with the given file pattern and formats, to the task
+  // context input called `name`.
+  void AddInputToContext(const string &name, const string &file_pattern,
+                         const string &file_format,
+                         const string &record_format) {
+    TaskInput::Part *new_part = context_.GetInput(name)->add_part();
+    new_part->set_file_pattern(file_pattern);
+    new_part->set_file_format(file_format);
+    new_part->set_record_format(record_format);
+  }
+
+  // Expects the elements of `workspace`, taken in order, to equal `target`.
+  void CheckVectorWorkspace(const VectorIntWorkspace &workspace,
+                            vector<int> target) {
+    // Copy the workspace out so that both sides are plain vectors.
+    vector<int> actual;
+    int index = 0;
+    while (index < workspace.size()) {
+      actual.push_back(workspace.element(index));
+      ++index;
+    }
+    EXPECT_THAT(actual, testing::ContainerEq(target));
+  }
+
   Sentence sentence_;
   WorkspaceSet workspaces_;
 
@@ -99,13 +121,18 @@ class CommonSentenceFeaturesTest : public SentenceFeaturesTest {
       : SentenceFeaturesTest(
             "text: 'I saw a man with a telescope.' "
             "token { word: 'I' start: 0 end: 0 tag: 'PRP' category: 'PRON'"
-            " head: 1 label: 'nsubj' break_level: NO_BREAK } "
+            "  head: 1 label: 'nsubj' break_level: NO_BREAK } "
             "token { word: 'saw' start: 2 end: 4 tag: 'VBD' category: 'VERB'"
-            " label: 'ROOT' break_level: SPACE_BREAK } "
+            "  label: 'ROOT' break_level: SPACE_BREAK } "
             "token { word: 'a' start: 6 end: 6 tag: 'DT' category: 'DET'"
-            " head: 3 label: 'det' break_level: SPACE_BREAK } "
+            "  head: 3 label: 'det' break_level: SPACE_BREAK } "
             "token { word: 'man' start: 8 end: 10 tag: 'NN' category: 'NOUN'"
-            " head: 1 label: 'dobj' break_level: SPACE_BREAK } "
+            "  head: 1 label: 'dobj' break_level: SPACE_BREAK"
+            "  [syntaxnet.TokenMorphology.morphology] { "
+            "    attribute { name:'morph' value:'Sg' } "
+            "    attribute { name:'morph' value:'Masc' } "
+            "  } "
+            "} "
             "token { word: 'with' start: 12 end: 15 tag: 'IN' category: 'ADP'"
             " head: 1 label: 'prep' break_level: SPACE_BREAK } "
             "token { word: 'a' start: 17 end: 17 tag: 'DT' category: 'DET'"
@@ -152,4 +179,96 @@ TEST_F(CommonSentenceFeaturesTest, OffsetPlusTag) {
   EXPECT_EQ("<OUTSIDE>", ExtractFeature(9));
 }
 
+// Exercises the "char-ngram" feature with a map that only knows the n-grams
+// "a", "aw" and "sa", and checks which of them are extracted at several
+// token indices of the fixture sentence.
+TEST_F(CommonSentenceFeaturesTest, CharNgramFeature) {
+  TermFrequencyMap char_ngram_map;
+  char_ngram_map.Increment("a");
+  char_ngram_map.Increment("aw");
+  char_ngram_map.Increment("sa");
+  // Saves the map to whatever path is handed to the "char-ngram-map" creator.
+  creators_.Add(
+      "char-ngram-map", "text", "",
+      [&char_ngram_map](const string &path) { char_ngram_map.Save(path); });
+
+  // Test that CharNgram works as expected.
+  PrepareFeature("char-ngram");
+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(-1), ","));
+  // Token 0 is 'I', which contains none of the n-grams in the map.
+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(0), ","));
+  // Token 1 is 'saw'.
+  EXPECT_EQ("sa,a,aw", utils::Join(ExtractMultiFeature(1), ","));
+  // Tokens 2 and 3 are 'a' and 'man'.
+  EXPECT_EQ("a", utils::Join(ExtractMultiFeature(2), ","));
+  EXPECT_EQ("a", utils::Join(ExtractMultiFeature(3), ","));
+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(8), ","));
+}
+
+// Exercises the "morphology-set" feature with a map containing "morph=Sg",
+// "morph=Masc" and "morph=Pl", and checks the values extracted for tokens of
+// the fixture sentence.
+TEST_F(CommonSentenceFeaturesTest, MorphologySetFeature) {
+  TermFrequencyMap morphology_map;
+  morphology_map.Increment("morph=Sg");
+  morphology_map.Increment("morph=Sg");
+  morphology_map.Increment("morph=Masc");
+  morphology_map.Increment("morph=Masc");
+  morphology_map.Increment("morph=Pl");
+  // Saves the map to whatever path is handed to the "morphology-map" creator.
+  creators_.Add(
+      "morphology-map", "text", "",
+      [&morphology_map](const string &path) { morphology_map.Save(path); });
+
+  // Test that MorphologySet works as expected.
+  PrepareFeature("morphology-set");
+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(-1), ","));
+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(0), ","));
+  // Token 3 ('man') is the fixture token that carries the Sg and Masc
+  // morphology attributes.
+  EXPECT_EQ("morph=Sg,morph=Masc", utils::Join(ExtractMultiFeature(3), ","));
+}
+
+// Runs the Capitalization feature's preprocessing over the fixture sentence
+// and checks the per-token values it stores in its VectorIntWorkspace.
+TEST_F(CommonSentenceFeaturesTest, CapitalizationProcessesCorrectly) {
+  Capitalization feature;
+  feature.RequestWorkspaces(&registry_);
+  workspaces_.Reset(registry_);
+  feature.Preprocess(&workspaces_, &sentence_);
+
+  // Check the workspace contains what we expect.
+  EXPECT_TRUE(workspaces_.Has<VectorIntWorkspace>(feature.Workspace()));
+  const VectorIntWorkspace &workspace =
+      workspaces_.Get<VectorIntWorkspace>(feature.Workspace());
+  // Local int copies of the enum values so they can be listed in the
+  // vector<int> target below.
+  constexpr int UPPERCASE = Capitalization::UPPERCASE;
+  constexpr int LOWERCASE = Capitalization::LOWERCASE;
+  constexpr int NON_ALPHABETIC = Capitalization::NON_ALPHABETIC;
+  CheckVectorWorkspace(workspace,
+                       {UPPERCASE, LOWERCASE, LOWERCASE, LOWERCASE, LOWERCASE,
+                        LOWERCASE, LOWERCASE, NON_ALPHABETIC});
+}
+
+class CharFeatureTest : public SentenceFeaturesTest {
+ protected:
+  CharFeatureTest()
+      : SentenceFeaturesTest(
+          "text: '一 个 测 试 员  ' "
+          "token { word: '一' start: 0 end: 2 } "
+          "token { word: '个' start: 3 end: 5 } "
+          "token { word: '测' start: 6 end: 8 } "
+          "token { word: '试' start: 9 end: 11 } "
+          "token { word: '员' start: 12 end: 14 } "
+          "token { word: ' ' start: 15 end: 15 } "
+          "token { word: '\t' start: 16 end: 16 } ") {}
+};
+
+// Exercises the "char" feature with a character map that omits "测", and
+// checks the value extracted at every token index of the CharFeatureTest
+// sentence as well as one index on either side of it.
+TEST_F(CharFeatureTest, CharFeature) {
+  TermFrequencyMap char_map;
+  char_map.Increment("一");
+  char_map.Increment("个");
+  char_map.Increment("试");
+  char_map.Increment("员");
+  // Saves the map to whatever path is handed to the "char-map" creator.
+  creators_.Add(
+      "char-map", "text", "",
+      [&char_map](const string &path) { char_map.Save(path); });
+
+  // Test that Char works as expected.
+  PrepareFeature("char");
+  EXPECT_EQ("<OUTSIDE>", ExtractFeature(-1));
+  EXPECT_EQ("一", ExtractFeature(0));
+  EXPECT_EQ("个", ExtractFeature(1));
+  EXPECT_EQ("<UNKNOWN>", ExtractFeature(2));  // "测" is not in the char map.
+  EXPECT_EQ("试", ExtractFeature(3));
+  EXPECT_EQ("员", ExtractFeature(4));
+  // Tokens 5 and 6 are the ' ' and '\t' tokens of the fixture.
+  EXPECT_EQ("<BREAK_CHAR>", ExtractFeature(5));
+  EXPECT_EQ("<BREAK_CHAR>", ExtractFeature(6));
+  EXPECT_EQ("<OUTSIDE>", ExtractFeature(7));
+}
+
 }  // namespace syntaxnet

+ 40 - 5
syntaxnet/syntaxnet/tagger_transitions.cc

@@ -25,8 +25,10 @@ limitations under the License.
 
 #include <string>
 
+#include "syntaxnet/parser_features.h"
 #include "syntaxnet/parser_state.h"
 #include "syntaxnet/parser_transitions.h"
+#include "syntaxnet/sentence_features.h"
 #include "syntaxnet/shared_store.h"
 #include "syntaxnet/task_context.h"
 #include "syntaxnet/term_frequency_map.h"
@@ -98,7 +100,9 @@ class TaggerTransitionState : public ParserTransitionState {
     for (size_t i = 0; i < tag_.size(); ++i) {
       Token *token = sentence->mutable_token(i);
       token->set_tag(TagAsString(Tag(i)));
-      token->set_category(tag_to_category_->GetCategory(token->tag()));
+      if (tag_to_category_) {
+        token->set_category(tag_to_category_->GetCategory(token->tag()));
+      }
     }
   }
 
@@ -146,6 +150,7 @@ class TaggerTransitionSystem : public ParserTransitionSystem {
   // Determines tag map location.
   void Setup(TaskContext *context) override {
     input_tag_map_ = context->GetInput("tag-map", "text", "");
+    join_category_to_pos_ = context->GetBoolParameter("join_category_to_pos");
     input_tag_to_category_ = context->GetInput("tag-to-category", "text", "");
   }
 
@@ -154,15 +159,21 @@ class TaggerTransitionSystem : public ParserTransitionSystem {
     const string tag_map_path = TaskContext::InputFile(*input_tag_map_);
     tag_map_ = SharedStoreUtils::GetWithDefaultName<TermFrequencyMap>(
         tag_map_path, 0, 0);
-    const string tag_to_category_path =
-        TaskContext::InputFile(*input_tag_to_category_);
-    tag_to_category_ = SharedStoreUtils::GetWithDefaultName<TagToCategoryMap>(
-        tag_to_category_path);
+    if (!join_category_to_pos_) {
+      const string tag_to_category_path =
+          TaskContext::InputFile(*input_tag_to_category_);
+      tag_to_category_ = SharedStoreUtils::GetWithDefaultName<TagToCategoryMap>(
+          tag_to_category_path);
+    }
   }
 
   // The SHIFT action uses the same value as the corresponding action type.
   static ParserAction ShiftAction(int tag) { return tag; }
 
+  // The tagger transition system doesn't look at the dependency tree, so it
+  // allows non-projective trees.
+  bool AllowsNonProjective() const override { return true; }
+
   // Returns the number of action types.
   int NumActionTypes() const override { return 1; }
 
@@ -251,8 +262,32 @@ class TaggerTransitionSystem : public ParserTransitionSystem {
 
   // Tag to category map. Owned through SharedStore.
   const TagToCategoryMap *tag_to_category_ = nullptr;
+
+  bool join_category_to_pos_ = false;
 };
 
 REGISTER_TRANSITION_SYSTEM("tagger", TaggerTransitionSystem);
 
+// Parser feature exposing the tag that the tagger transition system has
+// assigned to a token.
+class PredictedTagFeatureFunction
+    : public BasicParserSentenceFeatureFunction<Tag> {
+ public:
+  PredictedTagFeatureFunction() {}
+
+  // Reads the tag recorded for token |focus| in the TaggerTransitionState held
+  // by |state|. Yields -1 when |focus| does not index a token of the sentence.
+  FeatureValue Compute(const WorkspaceSet &workspaces, const ParserState &state,
+                       int focus, const FeatureVector *result) const override {
+    const bool inside_sentence =
+        focus >= 0 && focus < state.sentence().token_size();
+    if (!inside_sentence) return -1;
+    const TaggerTransitionState *tagger_state =
+        static_cast<const TaggerTransitionState *>(state.transition_state());
+    return tagger_state->Tag(focus);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(PredictedTagFeatureFunction);
+};
+
+REGISTER_PARSER_IDX_FEATURE_FUNCTION("pred-tag", PredictedTagFeatureFunction);
+
 }  // namespace syntaxnet

+ 6 - 0
syntaxnet/syntaxnet/testdata/context.pbtxt

@@ -61,6 +61,12 @@ input {
   }
 }
 input {
+  name: 'char-map'
+  Part {
+    file_pattern: 'OUTPATH/char-map'
+  }
+}
+input {
   name: 'prefix-table'
   Part {
     file_pattern: 'OUTPATH/prefix-table'

+ 107 - 7
syntaxnet/syntaxnet/text_formats.cc

@@ -63,6 +63,11 @@ class CoNLLSyntaxFormat : public DocumentFormat {
  public:
   CoNLLSyntaxFormat() {}
 
+  // Reads the boolean task parameters "join_category_to_pos" and
+  // "add_pos_as_attribute" from |context|. They decide whether the category
+  // is folded into the tag and whether the tag is mirrored as a morphology
+  // attribute when tokens are read, and whether that is undone when written.
+  void Setup(TaskContext *context) override {
+    join_category_to_pos_ = context->GetBoolParameter("join_category_to_pos");
+    add_pos_as_attribute_ = context->GetBoolParameter("add_pos_as_attribute");
+  }
+
   // Reads up to the first empty line; returns false at end of file.
   bool ReadRecord(tensorflow::io::InputBuffer *buffer,
                   string *record) override {
@@ -121,6 +126,7 @@ class CoNLLSyntaxFormat : public DocumentFormat {
       const string &word = fields[1];
       const string &cpostag = fields[3];
       const string &tag = fields[4];
+      const string &attributes = fields[5];
       const int head = utils::ParseUsing<int>(fields[6], 0, utils::ParseInt32);
       const string &label = fields[7];
 
@@ -139,6 +145,9 @@ class CoNLLSyntaxFormat : public DocumentFormat {
       if (!tag.empty()) token->set_tag(tag);
       if (!cpostag.empty()) token->set_category(cpostag);
       if (!label.empty()) token->set_label(label);
+      if (!attributes.empty()) AddMorphAttributes(attributes, token);
+      if (join_category_to_pos_) JoinCategoryToPos(token);
+      if (add_pos_as_attribute_) AddPosAsAttribute(token);
     }
 
     if (sentence->token_size() > 0) {
@@ -158,16 +167,18 @@ class CoNLLSyntaxFormat : public DocumentFormat {
     *key = sentence.docid();
     vector<string> lines;
     for (int i = 0; i < sentence.token_size(); ++i) {
+      Token token = sentence.token(i);
+      if (join_category_to_pos_) SplitCategoryFromPos(&token);
+      if (add_pos_as_attribute_) RemovePosFromAttributes(&token);
       vector<string> fields(10);
       fields[0] = tensorflow::strings::Printf("%d", i + 1);
-      fields[1] = sentence.token(i).word();
+      fields[1] = token.word();
       fields[2] = "_";
-      fields[3] = sentence.token(i).category();
-      fields[4] = sentence.token(i).tag();
-      fields[5] = "_";
-      fields[6] =
-          tensorflow::strings::Printf("%d", sentence.token(i).head() + 1);
-      fields[7] = sentence.token(i).label();
+      fields[3] = token.category();
+      fields[4] = token.tag();
+      fields[5] = GetMorphAttributes(token);
+      fields[6] = tensorflow::strings::Printf("%d", token.head() + 1);
+      fields[7] = token.label();
       fields[8] = "_";
       fields[9] = "_";
       lines.push_back(utils::Join(fields, "\t"));
@@ -176,6 +187,95 @@ class CoNLLSyntaxFormat : public DocumentFormat {
   }
 
  private:
+  // Parses |attributes|, a '|'-separated list of morphological features, and
+  // appends every well-formed entry to the TokenMorphology extension of
+  // |token|. Two layouts are accepted:
+  //   1) a1=v1|a2=v2|..., e.g., Czech CoNLL data, or,
+  //   2) v1|v2|..., e.g., German CoNLL data, where each bare entry is stored
+  //      as an attribute name with the value "on".
+  // An entry containing more than one '=' is a fatal error. Entries with an
+  // empty value are skipped with a warning; entries with an empty name are
+  // skipped silently.
+  void AddMorphAttributes(const string &attributes, Token *token) {
+    TokenMorphology *morph =
+        token->MutableExtension(TokenMorphology::morphology);
+    for (const string &entry : utils::Split(attributes, '|')) {
+      const vector<string> att_val = utils::Split(entry, '=');
+      CHECK_LE(att_val.size(), 2)
+          << "Error parsing morphology features "
+          << "column, must be of format "
+          << "a1=v1|a2=v2|... or v1|v2|... <field>: " << attributes;
+
+      // Depending on how utils::Split treats an empty string, an empty entry
+      // (e.g. from "a||b" or a trailing '|') can yield no pieces at all;
+      // indexing att_val[0] would then read out of bounds.
+      if (att_val.empty()) continue;
+
+      const string &name = att_val[0];
+      const string value = att_val.size() == 2 ? att_val[1] : "on";
+
+      // We currently don't expect an empty attribute value, but might have an
+      // empty attribute name due to data input errors.
+      if (value.empty()) {
+        LOG(WARNING) << "Invalid attributes string: " << attributes
+                     << " for token: " << token->ShortDebugString();
+        continue;
+      }
+      if (name.empty()) continue;
+
+      TokenMorphology::Attribute *attribute = morph->add_attribute();
+      attribute->set_name(name);
+      attribute->set_value(value);
+    }
+  }
+
+  // Serializes the TokenMorphology extension of |token| as a '|'-separated
+  // list. Attributes whose value is "on" are written as the bare name, all
+  // others as name=value. Returns "_" when the token has no attributes.
+  string GetMorphAttributes(const Token &token) {
+    const TokenMorphology &morph =
+        token.GetExtension(TokenMorphology::morphology);
+    const int num_attributes = morph.attribute_size();
+    if (num_attributes == 0) return "_";
+    string joined;
+    for (int i = 0; i < num_attributes; ++i) {
+      const TokenMorphology::Attribute &attribute = morph.attribute(i);
+      // The separator is only emitted once something has been written.
+      if (!joined.empty()) joined += "|";
+      joined += attribute.name();
+      if (attribute.value() != "on") {
+        joined += "=";
+        joined += attribute.value();
+      }
+    }
+    return joined;
+  }
+
+  // Replaces the tag of |token| with "<category>++<tag>" and then clears the
+  // token's category field.
+  void JoinCategoryToPos(Token *token) {
+    const string joined_tag =
+        tensorflow::strings::StrCat(token->category(), "++", token->tag());
+    token->set_tag(joined_tag);
+    token->clear_category();
+  }
+
+  // Splits a tag of the form "<category>++<tag>" at the first "++": the part
+  // before it becomes the category of |token| and the part after it becomes
+  // the tag. Tokens whose tag contains no "++" are left unchanged.
+  void SplitCategoryFromPos(Token *token) {
+    const string joined_tag = token->tag();
+    const size_t separator = joined_tag.find("++");
+    if (separator == string::npos) return;
+    token->set_category(joined_tag.substr(0, separator));
+    token->set_tag(joined_tag.substr(separator + 2));
+  }
+
+  // Appends an attribute named "fPOS" whose value is the tag of |token| to
+  // the token's morphology extension. Does nothing when the tag is empty.
+  void AddPosAsAttribute(Token *token) {
+    if (token->tag().empty()) return;
+    TokenMorphology::Attribute *fpos =
+        token->MutableExtension(TokenMorphology::morphology)->add_attribute();
+    fpos->set_name("fPOS");
+    fpos->set_value(token->tag());
+  }
+
+  // Drops the trailing "fPOS" attribute from the morphology extension of
+  // |token|. Assumes the "fPOS" attribute, if present, is the last one.
+  // Tokens without any morphology attributes (for instance tokens whose tag
+  // was empty, so no "fPOS" attribute was ever added) are left untouched
+  // instead of inspecting the last element of an empty attribute list.
+  void RemovePosFromAttributes(Token *token) {
+    TokenMorphology *morph =
+        token->MutableExtension(TokenMorphology::morphology);
+    const int num_attributes = morph->attribute_size();
+    if (num_attributes == 0) return;
+    if (morph->attribute(num_attributes - 1).name() == "fPOS") {
+      morph->mutable_attribute()->RemoveLast();
+    }
+  }
+
+  bool join_category_to_pos_ = false;
+  bool add_pos_as_attribute_ = false;
+
   TF_DISALLOW_COPY_AND_ASSIGN(CoNLLSyntaxFormat);
 };
 

+ 59 - 1
syntaxnet/syntaxnet/utils.h

@@ -62,7 +62,7 @@ string Join(const std::vector<T> &s, const char *sep) {
   return result;
 }
 
-string JoinPath(std::initializer_list<StringPiece> paths);
+string JoinPath(std::initializer_list<tensorflow::StringPiece> paths);
 
 size_t RemoveLeadingWhitespace(tensorflow::StringPiece *text);
 
@@ -165,6 +165,64 @@ class PunctuationUtil {
 
 void NormalizeDigits(string *form);
 
+// Helper type to mark missing c-tor argument types
+// for Type's c-tor in LazyStaticPtr<Type, ...>.
+struct NoArg {};
+
+// Pointer-like holder that creates its Type object with `new` the first time
+// it is dereferenced, passing the stored arguments (up to three) to Type's
+// constructor. Nothing in this class deletes the created object, and get()
+// performs no locking.
+// NOTE(review): get() relies on ptr_ being null before first use; presumably
+// instances are meant to have static storage duration so that ptr_ is
+// zero-initialized -- confirm at the declaration sites.
+template <typename Type, typename Arg1 = NoArg, typename Arg2 = NoArg,
+          typename Arg3 = NoArg>
+class LazyStaticPtr {
+ public:
+  typedef Type element_type;  // per smart pointer convention
+
+  // Pretend to be a pointer to Type (never NULL due to on-demand creation):
+  Type &operator*() const { return *get(); }
+  Type *operator->() const { return get(); }
+
+  // Named accessor/initializer:
+  Type *get() const {
+    if (!ptr_) Initialize(this);
+    return ptr_;
+  }
+
+ public:
+  // All the data is public and LazyStaticPtr has no constructors so that we can
+  // initialize LazyStaticPtr objects with the "= { arg_value, ... }" syntax.
+  // Clients of LazyStaticPtr must not access the data members directly.
+
+  // Arguments for Type's c-tor
+  // (unused NoArg-typed arguments consume either no space, or 1 byte to
+  //  ensure address uniqueness):
+  Arg1 arg1_;
+  Arg2 arg2_;
+  Arg3 arg3_;
+
+  // The object we create and show.
+  mutable Type *ptr_;
+
+ private:
+  // Factory overloads: the number of trailing NoArg parameters selects how
+  // many of the stored arguments are forwarded to Type's constructor.
+  template <typename A1, typename A2, typename A3>
+  static Type *Factory(const A1 &a1, const A2 &a2, const A3 &a3) {
+    return new Type(a1, a2, a3);
+  }
+
+  template <typename A1, typename A2>
+  static Type *Factory(const A1 &a1, const A2 &a2, NoArg a3) {
+    return new Type(a1, a2);
+  }
+
+  template <typename A1>
+  static Type *Factory(const A1 &a1, NoArg a2, NoArg a3) {
+    return new Type(a1);
+  }
+
+  static Type *Factory(NoArg a1, NoArg a2, NoArg a3) { return new Type(); }
+
+  // Creates the object and stores it in |lsp|; ptr_ is mutable so this works
+  // through a const pointer.
+  static void Initialize(const LazyStaticPtr *lsp) {
+    lsp->ptr_ = Factory(lsp->arg1_, lsp->arg2_, lsp->arg3_);
+  }
+};
+
 }  // namespace utils
 }  // namespace syntaxnet
 

+ 2 - 0
syntaxnet/syntaxnet/workspace.h

@@ -185,6 +185,8 @@ class VectorIntWorkspace : public Workspace {
   // Sets the i'th element.
   void set_element(int i, int value) { elements_[i] = value; }
 
+  int size() const { return elements_.size(); }
+
  private:
   // The enclosed vector.
   vector<int> elements_;

+ 6 - 0
syntaxnet/util/utf8/unicodetext.h

@@ -462,6 +462,12 @@ inline string UnicodeTextToUTF8(const UnicodeText& t) {
   return string(t.utf8_data(), t.utf8_length());
 }
 
+// This template function declaration is used in defining arraysize.
+// Note that the function doesn't need an implementation, as we only
+// use its type.
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+#define arraysize(array) (sizeof(ArraySizeHelper(array)))
 
 // For debugging.  Return a string of integers, written in uppercase
 // hex (%X), corresponding to the codepoints within the text. Each

+ 0 - 4
syntaxnet/util/utf8/unicodetext_unittest.cc

@@ -25,10 +25,6 @@
 
 namespace {
 
-template <typename T, size_t N>
-char (&ArraySizeHelper(T (&array)[N]))[N];
-#define arraysize(array) (sizeof(ArraySizeHelper(array)))
-
 class UnicodeTextTest : public testing::Test {
  protected:
   UnicodeTextTest() : empty_text_() {

+ 14 - 0
syntaxnet/util/utf8/unilib_utf8_utils.h

@@ -21,6 +21,7 @@
 // They are also exported from unilib.h for legacy reasons.
 
 #include "syntaxnet/base.h"
+#include "third_party/utf/utf.h"
 
 namespace UniLib {
 
@@ -32,6 +33,19 @@ inline bool IsValidCodepoint(char32 c) {
     || (c >= 0xE000 && c <= 0x10FFFF);
 }
 
+// Returns true if 'str' is the start of a structurally valid UTF-8
+// sequence and is not a surrogate codepoint. Returns false if str.empty()
+// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function
+// will access n bytes (1-4) of str, where n is UniLib::OneCharLen(str[0]).
+inline bool IsUTF8ValidCodepoint(StringPiece str) {
+  // Both are outputs of isvalidcharntorune; only the decoded codepoint |c| is
+  // used afterwards.
+  char32 c;
+  int consumed;
+  // It's OK if str.length() > consumed.
+  return !str.empty()
+      && isvalidcharntorune(str.data(), str.size(), &c, &consumed)
+      && IsValidCodepoint(c);
+}
+
 // Returns the length (number of bytes) of the Unicode code point
 // starting at src, based on inspecting just that one byte. This
 // requires that src point to a well-formed UTF-8 string; the result