
New transition systems and features for syntaxnet (#301)

* Morpher and segmenter transition systems and new features (quotes, punctuation, capitalization, character ngrams, morphology attributes).
calberti 9 years ago
parent
commit
64675fc72f
37 changed files with 4257 additions and 62 deletions
  1. + 3 - 2      syntaxnet/README.md
  2. + 105 - 23   syntaxnet/syntaxnet/BUILD
  3. + 102 - 0    syntaxnet/syntaxnet/binary_segment_state.cc
  4. + 99 - 0     syntaxnet/syntaxnet/binary_segment_state.h
  5. + 218 - 0    syntaxnet/syntaxnet/binary_segment_state_test.cc
  6. + 121 - 0    syntaxnet/syntaxnet/binary_segment_transitions.cc
  7. + 111 - 0    syntaxnet/syntaxnet/binary_segment_transitions_test.cc
  8. + 845 - 0    syntaxnet/syntaxnet/char_properties.cc
  9. + 362 - 0    syntaxnet/syntaxnet/char_properties.h
  10. + 364 - 0   syntaxnet/syntaxnet/char_properties_test.cc
  11. + 4 - 2     syntaxnet/syntaxnet/document_filters.cc
  12. + 2 - 0     syntaxnet/syntaxnet/document_format.h
  13. + 12 - 1    syntaxnet/syntaxnet/lexicon_builder.cc
  14. + 25 - 1    syntaxnet/syntaxnet/lexicon_builder_test.py
  15. + 298 - 0   syntaxnet/syntaxnet/morpher_transitions.cc
  16. + 91 - 0    syntaxnet/syntaxnet/morphology_label_set.cc
  17. + 110 - 0   syntaxnet/syntaxnet/morphology_label_set.h
  18. + 101 - 0   syntaxnet/syntaxnet/morphology_label_set_test.cc
  19. + 0 - 1     syntaxnet/syntaxnet/parser_eval.py
  20. + 18 - 0    syntaxnet/syntaxnet/parser_features.cc
  21. + 4 - 2     syntaxnet/syntaxnet/proto_io.h
  22. + 85 - 0    syntaxnet/syntaxnet/segmenter_utils.cc
  23. + 93 - 0    syntaxnet/syntaxnet/segmenter_utils.h
  24. + 149 - 0   syntaxnet/syntaxnet/segmenter_utils_test.cc
  25. + 15 - 0    syntaxnet/syntaxnet/sentence.proto
  26. + 1 - 1     syntaxnet/syntaxnet/sentence_batch.cc
  27. + 233 - 3   syntaxnet/syntaxnet/sentence_features.cc
  28. + 329 - 5   syntaxnet/syntaxnet/sentence_features.h
  29. + 123 - 4   syntaxnet/syntaxnet/sentence_features_test.cc
  30. + 40 - 5    syntaxnet/syntaxnet/tagger_transitions.cc
  31. + 6 - 0     syntaxnet/syntaxnet/testdata/context.pbtxt
  32. + 107 - 7   syntaxnet/syntaxnet/text_formats.cc
  33. + 59 - 1    syntaxnet/syntaxnet/utils.h
  34. + 2 - 0     syntaxnet/syntaxnet/workspace.h
  35. + 6 - 0     syntaxnet/util/utf8/unicodetext.h
  36. + 0 - 4     syntaxnet/util/utf8/unicodetext_unittest.cc
  37. + 14 - 0    syntaxnet/util/utf8/unilib_utf8_utils.h

+ 3 - 2
syntaxnet/README.md

@@ -107,8 +107,8 @@ Bazel should complete reporting all tests passed.
 You can also compile SyntaxNet in a [Docker](https://www.docker.com/what-docker)
 container using this [Dockerfile](Dockerfile).
 
-**Note:** If you are running Docker on OSX, make sure that you have enough memory allocated
-for your Docker VM.
+**Note:** If you are running Docker on OSX, make sure that you have enough
+memory allocated for your Docker VM.
 
 ## Getting Started
 
@@ -612,6 +612,7 @@ Original authors of the code in this package include (in alphabetical order):
 *   David Weiss
 *   Emily Pitler
 *   Greg Coppola
+*   Ji Ma
 *   Keith Hall
 *   Kuzman Ganchev
 *   Michael Collins

+ 105 - 23
syntaxnet/syntaxnet/BUILD

@@ -159,6 +159,31 @@ cc_library(
 )
 
 cc_library(
+    name = "char_properties",
+    srcs = ["char_properties.cc"],
+    hdrs = ["char_properties.h"],
+    deps = [
+        ":registry",
+        ":utils",
+        "//util/utf8:unicodetext",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "segmenter_utils",
+    srcs = ["segmenter_utils.cc"],
+    hdrs = ["segmenter_utils.h"],
+    deps = [
+        ":base",
+        ":char_properties",
+        ":sentence_proto",
+        "//util/utf8:unicodetext",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
     name = "feature_extractor",
     srcs = ["feature_extractor.cc"],
     hdrs = [
@@ -199,6 +224,7 @@ cc_library(
         ":affix",
         ":feature_extractor",
         ":registry",
+        ":segmenter_utils",
     ],
 )
 
@@ -251,24 +277,50 @@ cc_library(
 )
 
 cc_library(
+    name = "morphology_label_set",
+    srcs = ["morphology_label_set.cc"],
+    hdrs = ["morphology_label_set.h"],
+    deps = [
+        ":document_format",
+        ":feature_extractor",
+        ":proto_io",
+        ":registry",
+        ":sentence_proto",
+        ":utils",
+    ],
+)
+
+cc_library(
     name = "parser_transitions",
     srcs = [
         "arc_standard_transitions.cc",
+        "binary_segment_state.cc",
+        "binary_segment_transitions.cc",
+        "morpher_transitions.cc",
+        "parser_features.cc",
         "parser_state.cc",
         "parser_transitions.cc",
         "tagger_transitions.cc",
     ],
     hdrs = [
+        "binary_segment_state.h",
+        "parser_features.h",
         "parser_state.h",
         "parser_transitions.h",
     ],
     deps = [
+        ":affix",
+        ":feature_extractor",
         ":kbest_syntax_proto",
+        ":morphology_label_set",
         ":registry",
+        ":segmenter_utils",
+        ":sentence_features",
         ":sentence_proto",
         ":shared_store",
         ":task_context",
         ":term_frequency_map",
+        ":workspace",
     ],
     alwayslink = 1,
 )
@@ -289,29 +341,11 @@ cc_library(
 )
 
 cc_library(
-    name = "parser_features",
-    srcs = ["parser_features.cc"],
-    hdrs = ["parser_features.h"],
-    deps = [
-        ":affix",
-        ":feature_extractor",
-        ":parser_transitions",
-        ":registry",
-        ":sentence_features",
-        ":task_context",
-        ":term_frequency_map",
-        ":workspace",
-    ],
-    alwayslink = 1,
-)
-
-cc_library(
     name = "embedding_feature_extractor",
     srcs = ["embedding_feature_extractor.cc"],
     hdrs = ["embedding_feature_extractor.h"],
     deps = [
         ":feature_extractor",
-        ":parser_features",
         ":parser_transitions",
         ":sparse_proto",
         ":task_context",
@@ -326,7 +360,6 @@ cc_library(
     deps = [
         ":embedding_feature_extractor",
         ":feature_extractor",
-        ":parser_features",
         ":parser_transitions",
         ":sentence_proto",
         ":sparse_proto",
@@ -344,7 +377,6 @@ cc_library(
         "reader_ops.cc",
     ],
     deps = [
-        ":parser_features",
         ":parser_transitions",
         ":sentence_batch",
         ":sentence_proto",
@@ -360,7 +392,6 @@ cc_library(
     srcs = ["document_filters.cc"],
     deps = [
         ":document_format",
-        ":parser_features",
         ":parser_transitions",
         ":sentence_batch",
         ":sentence_proto",
@@ -376,8 +407,8 @@ cc_library(
     deps = [
         ":dictionary_proto",
         ":document_format",
-        ":parser_features",
         ":parser_transitions",
+        ":segmenter_utils",
         ":sentence_batch",
         ":sentence_proto",
         ":task_context",
@@ -439,6 +470,18 @@ filegroup(
 )
 
 cc_test(
+    name = "binary_segment_state_test",
+    size = "small",
+    srcs = ["binary_segment_state_test.cc"],
+    deps = [
+        ":base",
+        ":parser_transitions",
+        ":term_frequency_map",
+        ":test_main",
+    ],
+)
+
+cc_test(
     name = "shared_store_test",
     size = "small",
     srcs = ["shared_store_test.cc"],
@@ -449,6 +492,26 @@ cc_test(
 )
 
 cc_test(
+    name = "char_properties_test",
+    srcs = ["char_properties_test.cc"],
+    deps = [
+        ":char_properties",
+        ":test_main",
+    ],
+)
+
+cc_test(
+    name = "segmenter_utils_test",
+    srcs = ["segmenter_utils_test.cc"],
+    deps = [
+        ":base",
+        ":segmenter_utils",
+        ":sentence_proto",
+        ":test_main",
+    ],
+)
+
+cc_test(
     name = "sentence_features_test",
     size = "medium",
     srcs = ["sentence_features_test.cc"],
@@ -466,6 +529,15 @@ cc_test(
 )
 
 cc_test(
+    name = "morphology_label_set_test",
+    srcs = ["morphology_label_set_test.cc"],
+    deps = [
+        ":morphology_label_set",
+        ":test_main",
+    ],
+)
+
+cc_test(
     name = "arc_standard_transitions_test",
     size = "small",
     srcs = ["arc_standard_transitions_test.cc"],
@@ -480,6 +552,17 @@ cc_test(
 )
 
 cc_test(
+    name = "binary_segment_transitions_test",
+    size = "small",
+    srcs = ["binary_segment_transitions_test.cc"],
+    deps = [
+        ":parser_transitions",
+        ":sentence_proto",
+        ":test_main",
+    ],
+)
+
+cc_test(
     name = "tagger_transitions_test",
     size = "small",
     srcs = ["tagger_transitions_test.cc"],
@@ -499,7 +582,6 @@ cc_test(
     srcs = ["parser_features_test.cc"],
     deps = [
         ":feature_extractor",
-        ":parser_features",
         ":parser_transitions",
         ":populate_test_inputs",
         ":sentence_proto",

+ 102 - 0
syntaxnet/syntaxnet/binary_segment_state.cc

@@ -0,0 +1,102 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/binary_segment_state.h"
+
+#include <string>
+#include "syntaxnet/segmenter_utils.h"
+#include "syntaxnet/sentence.pb.h"
+
+namespace syntaxnet {
+
+ParserTransitionState *BinarySegmentState::Clone() const {
+  return new BinarySegmentState();
+}
+
+string BinarySegmentState::ToString(const ParserState &state) const {
+  string str("[");
+  for (int i = NumStarts(state) - 1; i >= 0; --i) {
+    int start = LastStart(i, state);
+    int end = 0;
+    if (i - 1 >= 0) {
+      end = LastStart(i - 1, state) - 1;
+    } else if (state.EndOfInput()) {
+      end = state.sentence().token_size() - 1;
+    } else {
+      end = state.Next() - 1;
+    }
+    for (int k = start; k <= end; ++k) {
+      str.append(state.GetToken(k).word());
+    }
+    if (i >= 1) str.append(" ");
+  }
+
+  str.append("] ");
+  for (int i = state.Next(); i < state.NumTokens(); ++i) {
+    str.append(state.GetToken(i).word());
+  }
+  return str;
+}
+
+void BinarySegmentState::AddParseToDocument(const ParserState &state,
+                                            bool rewrite_root_labels,
+                                            Sentence *sentence) const {
+  if (sentence->token_size() == 0) return;
+  vector<bool> is_starts(sentence->token_size(), false);
+  for (int i = 0; i < NumStarts(state); ++i) {
+    is_starts[LastStart(i, state)] = true;
+  }
+
+  // The break level of the current token is determined by its previous token.
+  Token::BreakLevel break_level = Token::NO_BREAK;
+  bool is_first_token = true;
+  Sentence new_sentence;
+  for (int i = 0; i < sentence->token_size(); ++i) {
+    const Token &token = sentence->token(i);
+    const string &word = token.word();
+    bool is_break = SegmenterUtils::IsBreakChar(word);
+    if (is_starts[i] || is_first_token) {
+      if (!is_break) {
+        // The current character is the first char of a new token/word.
+        Token *new_token = new_sentence.add_token();
+        new_token->set_start(token.start());
+        new_token->set_end(token.end());
+        new_token->set_word(word);
+
+        // For the first token, keep the old break level to make sure that the
+        // number of sentences stays unchanged.
+        new_token->set_break_level(break_level);
+        is_first_token = false;
+      }
+    } else {
+      // Append the character to the previous token.
+      if (!is_break) {
+        int index = new_sentence.token_size() - 1;
+        auto *last_token = new_sentence.mutable_token(index);
+        last_token->mutable_word()->append(word);
+        last_token->set_end(token.end());
+      }
+    }
+
+    // Update the break level. Note that we do not introduce new sentences in
+    // the transition system, so anything beyond a line break is reduced to a
+    // line break.
+    break_level = is_break ? SegmenterUtils::BreakLevel(word) : Token::NO_BREAK;
+    if (break_level >= Token::LINE_BREAK) break_level = Token::LINE_BREAK;
+  }
+  sentence->mutable_token()->Swap(new_sentence.mutable_token());
+}
+
+}  // namespace syntaxnet

+ 99 - 0
syntaxnet/syntaxnet/binary_segment_state.h

@@ -0,0 +1,99 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef SYNTAXNET_BINARY_SEGMENT_STATE_H_
+#define SYNTAXNET_BINARY_SEGMENT_STATE_H_
+
+#include "syntaxnet/parser_state.h"
+#include "syntaxnet/parser_transitions.h"
+
+namespace syntaxnet {
+
+class Sentence;
+
+// Parser state for binary segmentation transition system. The input of the
+// system is a sequence of UTF-8 characters that are to be segmented into
+// tokens. The system contains two types of transitions/actions:
+//  -START: the token at input is the first character of a new word.
+//  -MERGE: the token at input is to be merged with its previous token.
+//
+// A BinarySegmentState stores segmentation histories that can be used as
+// features. In addition, it provides the functionality to add segmentation
+// results to the document, assuming that sentences in a document are
+// processed in left-to-right order. See also the comments on the
+// FinishDocument function for an explanation.
+//
+// Note on spaces:
+// Spaces, or more generally break characters, should never be part of a word,
+// and START/MERGE actions on spaces are ignored. In addition, if a space
+// starts a new word, then the actual first char of that word is the first
+// non-space token following the space.
+// Some examples:
+//  -chars:  ' ' A B
+//  -tags:    S  M M
+//  -result: 'AB'
+//
+//  -chars:  A ' ' B
+//  -tags:   S  M  M
+//  -result: 'AB'
+//
+//  -chars:  A ' ' B
+//  -tags:   S  S  M
+//  -result: 'AB'
+//
+//  -chars:  A  B  ' '
+//  -tags:   S  S  M
+//  -result: 'A', 'B'
+class BinarySegmentState : public ParserTransitionState {
+ public:
+  ParserTransitionState *Clone() const override;
+  void Init(ParserState *state) override {}
+
+  // Returns the number of start tokens that have already been identified: in
+  // other words, the number of start tokens between the first token of the
+  // sentence and state.Input(), excluding state.Input().
+  static int NumStarts(const ParserState &state) {
+    return state.StackSize();
+  }
+
+  // Returns the index of the k-th most recent start token.
+  static int LastStart(int k, const ParserState &state) {
+    DCHECK_GE(k, 0);
+    DCHECK_LT(k, NumStarts(state));
+    return state.Stack(k);
+  }
+
+  // Adds the token at given index as a new start token.
+  static void AddStart(int index, ParserState *state) {
+    state->Push(index);
+  }
+
+  // Adds segmentation results to the given sentence.
+  void AddParseToDocument(const ParserState &state,
+                          bool rewrite_root_labels,
+                          Sentence *sentence) const override;
+
+  // Whether a parsed token should be considered correct for evaluation.
+  bool IsTokenCorrect(const ParserState &state, int index) const override {
+    return true;
+  }
+
+  // Returns a human readable string representation of this state.
+  string ToString(const ParserState &state) const override;
+};
+
+}  // namespace syntaxnet
+
+#endif  // SYNTAXNET_BINARY_SEGMENT_STATE_H_
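
To make the space-handling rules documented above concrete, here is a minimal standalone sketch (hypothetical, not part of this commit) of how a START/MERGE tag sequence maps single-character tokens to words; the Segment helper is invented for illustration and hard-codes ' ' as the only break character:

#include <string>
#include <vector>

// Break characters are never part of a word (their START/MERGE tags are
// ignored), and the first non-space character always starts a word.
std::vector<std::string> Segment(const std::vector<std::string> &chars,
                                 const std::vector<bool> &is_start) {
  std::vector<std::string> words;
  bool first = true;  // True until the first non-space character is seen.
  for (size_t i = 0; i < chars.size(); ++i) {
    if (chars[i] == " ") continue;  // Skip breaks regardless of their tag.
    if (is_start[i] || first) {
      words.push_back(chars[i]);    // START: begin a new word.
    } else {
      words.back() += chars[i];     // MERGE: extend the previous word.
    }
    first = false;
  }
  return words;
}

For example, Segment({"A", " ", "B"}, {true, true, false}) yields {"AB"}, matching the third example in the header comment above.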

+ 218 - 0
syntaxnet/syntaxnet/binary_segment_state_test.cc

@@ -0,0 +1,218 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/binary_segment_state.h"
+
+#include <memory>
+
+#include "syntaxnet/base.h"
+#include "syntaxnet/sentence.pb.h"
+#include "syntaxnet/term_frequency_map.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace syntaxnet {
+
+class BinarySegmentStateTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Prepare a sentence.
+    const char *str_sentence = "text: '测试 的 句子' "
+        "token { word: '测' start: 0 end: 2 } "
+        "token { word: '试' start: 3 end: 5 } "
+        "token { word: ' ' start: 6 end: 6 } "
+        "token { word: '的' start: 7 end: 9 } "
+        "token { word: ' ' start: 10 end: 10 } "
+        "token { word: '句' start: 11 end: 13 } "
+        "token { word: '子' start: 14 end: 16 } ";
+    sentence_ = std::unique_ptr<Sentence>(new Sentence());
+    TextFormat::ParseFromString(str_sentence, sentence_.get());
+  }
+
+  // The test sentence and label map.
+  std::unique_ptr<Sentence> sentence_;
+  TermFrequencyMap label_map_;
+};
+
+TEST_F(BinarySegmentStateTest, AddStartLastStartNumStartsTest) {
+  BinarySegmentState *segment_state = new BinarySegmentState();
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+
+  // Test segment_state initialized with zero starts.
+  EXPECT_EQ(0, segment_state->NumStarts(state));
+
+  // Adding the first token as a start token.
+  segment_state->AddStart(0, &state);
+  ASSERT_EQ(1, segment_state->NumStarts(state));
+  EXPECT_EQ(0, segment_state->LastStart(0, state));
+
+  // Adding more starts.
+  segment_state->AddStart(2, &state);
+  segment_state->AddStart(3, &state);
+  segment_state->AddStart(4, &state);
+  segment_state->AddStart(5, &state);
+  ASSERT_EQ(5, segment_state->NumStarts(state));
+  EXPECT_EQ(5, segment_state->LastStart(0, state));
+  EXPECT_EQ(4, segment_state->LastStart(1, state));
+  EXPECT_EQ(3, segment_state->LastStart(2, state));
+  EXPECT_EQ(2, segment_state->LastStart(3, state));
+  EXPECT_EQ(0, segment_state->LastStart(4, state));
+}
+
+TEST_F(BinarySegmentStateTest, AddParseToDocumentTest) {
+  BinarySegmentState *segment_state = new BinarySegmentState();
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+
+  // Test gold segmentation.
+  // 0   1   2    3   4   5   6
+  // 测  试  ' '  的  ' '  句  子
+  // S   M   S    S   S   S   M
+  segment_state->AddStart(0, &state);
+  segment_state->AddStart(2, &state);
+  segment_state->AddStart(3, &state);
+  segment_state->AddStart(4, &state);
+  segment_state->AddStart(5, &state);
+  Sentence sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+
+  // Test the number of tokens as well as the start/end byte-offsets of each
+  // token.
+  ASSERT_EQ(3, sentence_with_annotation.token_size());
+
+  // The first token is 测试.
+  EXPECT_EQ(0, sentence_with_annotation.token(0).start());
+  EXPECT_EQ(5, sentence_with_annotation.token(0).end());
+
+  // The second token is 的.
+  EXPECT_EQ(7, sentence_with_annotation.token(1).start());
+  EXPECT_EQ(9, sentence_with_annotation.token(1).end());
+
+  // The third token is 句子.
+  EXPECT_EQ(11, sentence_with_annotation.token(2).start());
+  EXPECT_EQ(16, sentence_with_annotation.token(2).end());
+
+  // Test merging spaces into other tokens. Since spaces, or more generally
+  // break characters, should never be part of any word, they are skipped no
+  // matter how they are tagged.
+  // 0   1   2    3   4   5   6
+  // 测  试  ' '  的  ' '  句  子
+  // S   M   M    S   M   M   M
+  while (!state.StackEmpty()) state.Pop();
+  segment_state->AddStart(0, &state);
+  segment_state->AddStart(3, &state);
+  sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+
+  ASSERT_EQ(2, sentence_with_annotation.token_size());
+
+  // The first token is 测试. Note that even though a space is tagged as
+  // "merge", it is not attached to its previous word.
+  EXPECT_EQ(0, sentence_with_annotation.token(0).start());
+  EXPECT_EQ(5, sentence_with_annotation.token(0).end());
+
+  // The second token is 的句子.
+  EXPECT_EQ(7, sentence_with_annotation.token(1).start());
+  EXPECT_EQ(16, sentence_with_annotation.token(1).end());
+
+  // Test merging a token into space tokens. In this case, the current token is
+  // merged into the first non-space token on its left side.
+  // 0   1   2    3   4   5   6
+  // 测  试  ' '  的  ' '  句  子
+  // S   M   S    M   S   M   M
+  while (!state.StackEmpty()) state.Pop();
+  segment_state->AddStart(0, &state);
+  segment_state->AddStart(2, &state);
+  segment_state->AddStart(4, &state);
+  sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+  ASSERT_EQ(1, sentence_with_annotation.token_size());
+  EXPECT_EQ(0, sentence_with_annotation.token(0).start());
+  EXPECT_EQ(16, sentence_with_annotation.token(0).end());
+}
+
+TEST_F(BinarySegmentStateTest, SpaceDocumentTest) {
+  const char *str_sentence = "text: ' \t\t' "
+      "token { word: ' ' start: 0 end: 0 } "
+      "token { word: '\t' start: 1 end: 1 } "
+      "token { word: '\t' start: 2 end: 2 } ";
+  TextFormat::ParseFromString(str_sentence, sentence_.get());
+  BinarySegmentState *segment_state = new BinarySegmentState();
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+
+  // Break-chars should always be skipped, no matter how they are tagged.
+  // 0    1     2
+  //' '   '\t'  '\t'
+  // M    M     M
+  Sentence sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+  ASSERT_EQ(0, sentence_with_annotation.token_size());
+
+  // 0    1     2
+  //' '   '\t'  '\t'
+  // S    S     S
+  segment_state->AddStart(0, &state);
+  segment_state->AddStart(1, &state);
+  segment_state->AddStart(2, &state);
+  sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+  ASSERT_EQ(0, sentence_with_annotation.token_size());
+}
+
+TEST_F(BinarySegmentStateTest, DocumentBeginWithSpaceTest) {
+  const char *str_sentence = "text: ' 空格' "
+      "token { word: ' ' start: 0 end: 0 } "
+      "token { word: '空' start: 1 end: 3 } "
+      "token { word: '格' start: 4 end: 6 } ";
+  TextFormat::ParseFromString(str_sentence, sentence_.get());
+  BinarySegmentState *segment_state = new BinarySegmentState();
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+
+  // 0    1    2
+  //' '   空   格
+  // M    M    M
+  Sentence sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+
+  ASSERT_EQ(1, sentence_with_annotation.token_size());
+
+  // The first token is 空格.
+  EXPECT_EQ(1, sentence_with_annotation.token(0).start());
+  EXPECT_EQ(6, sentence_with_annotation.token(0).end());
+
+  // 0    1    2
+  //' '   空   格
+  // S    M    M
+  while (!state.StackEmpty()) state.Pop();
+  segment_state->AddStart(0, &state);
+  sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+
+  ASSERT_EQ(1, sentence_with_annotation.token_size());
+
+  // The first token is 空格.
+  EXPECT_EQ(1, sentence_with_annotation.token(0).start());
+  EXPECT_EQ(6, sentence_with_annotation.token(0).end());
+}
+
+TEST_F(BinarySegmentStateTest, EmptyDocumentTest) {
+  const char *str_sentence = "text: '' ";
+  TextFormat::ParseFromString(str_sentence, sentence_.get());
+  BinarySegmentState *segment_state = new BinarySegmentState();
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+  Sentence sentence_with_annotation = *sentence_;
+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
+  ASSERT_EQ(0, sentence_with_annotation.token_size());
+}
+
+}  // namespace syntaxnet

+ 121 - 0
syntaxnet/syntaxnet/binary_segment_transitions.cc

@@ -0,0 +1,121 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/binary_segment_state.h"
+#include "syntaxnet/parser_state.h"
+#include "syntaxnet/parser_transitions.h"
+
+namespace syntaxnet {
+
+// Given an input of UTF-8 characters, the BinarySegmentTransitionSystem
+// conducts word segmentation by performing one of the following two actions:
+//  -START: starts a new word with the token at state.input, and also advances
+//          state.input.
+//  -MERGE: adds the token at state.input to its previous word, and also
+//          advances state.input.
+//
+// Also see nlp/saft/components/segmentation/transition/binary-segment-state.h
+// for examples of handling spaces.
+class BinarySegmentTransitionSystem : public ParserTransitionSystem {
+ public:
+  BinarySegmentTransitionSystem() {}
+  ParserTransitionState *NewTransitionState(bool train_mode) const override {
+    return new BinarySegmentState();
+  }
+
+  // Action types for the segmentation-transition system.
+  enum ParserActionType {
+    START = 0,
+    MERGE = 1,
+    CARDINAL = 2
+  };
+
+  static int StartAction() { return 0; }
+  static int MergeAction() { return 1; }
+
+  // The system always starts a new word by default.
+  ParserAction GetDefaultAction(const ParserState &state) const override {
+    return START;
+  }
+
+  // Returns the number of action types.
+  int NumActionTypes() const override {
+    return CARDINAL;
+  }
+
+  // Returns the number of possible actions.
+  int NumActions(int num_labels) const override {
+    return CARDINAL;
+  }
+
+  // Returns the next gold action for a given state according to the underlying
+  // annotated sentence. The training data for the transition system is created
+  // by the binary-segmenter-data task. If a token's break_level is NO_BREAK,
+  // then the action is MERGE, and START otherwise. The only exception is that
+  // the first token in a sentence for the transition system is always a START.
+  ParserAction GetNextGoldAction(const ParserState &state) const override {
+    if (state.Next() == 0) return StartAction();
+    const Token &token = state.GetToken(state.Next());
+    return (token.break_level() != Token::NO_BREAK ?
+        StartAction() : MergeAction());
+  }
+
+  // Both START and MERGE can be applied to any tokens in the sentence.
+  bool IsAllowedAction(
+      ParserAction action, const ParserState &state) const override {
+    return true;
+  }
+
+  // Performs the specified action on a given parser state, without adding the
+  // action to the state's history.
+  void PerformActionWithoutHistory(
+      ParserAction action, ParserState *state) const override {
+    // Note that when the action is less than 0, it is treated as a START.
+    if (action < 0 || action == StartAction()) {
+      MutableTransitionState(state)->AddStart(state->Next(), state);
+    }
+    state->Advance();
+  }
+
+  // Allows backoff to best allowable transition.
+  bool BackOffToBestAllowableTransition() const override { return true; }
+
+  // A state is a deterministic state iff no tokens have been consumed.
+  bool IsDeterministicState(const ParserState &state) const override {
+    return state.Next() == 0;
+  }
+
+  // For binary segmentation, a state is a final state iff all tokens have been
+  // consumed.
+  bool IsFinalState(const ParserState &state) const override {
+    return state.EndOfInput();
+  }
+
+  // Returns a string representation of a parser action.
+  string ActionAsString(
+      ParserAction action, const ParserState &state) const override {
+    return action == StartAction() ? "START" : "MERGE";
+  }
+
+  // Downcasts the TransitionState in ParserState to a BinarySegmentState.
+  static BinarySegmentState *MutableTransitionState(ParserState *state) {
+    return static_cast<BinarySegmentState *>(state->mutable_transition_state());
+  }
+};
+
+REGISTER_TRANSITION_SYSTEM("binary-segment-transitions",
+                           BinarySegmentTransitionSystem);
+
+}  // namespace syntaxnet
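
Once registered, the system is driven through the generic ParserTransitionSystem interface. The tests below follow this pattern, condensed here as a hypothetical sketch (not part of this commit; the SegmentWithGoldActions helper is invented for illustration):

#include <memory>

#include "syntaxnet/binary_segment_state.h"
#include "syntaxnet/parser_state.h"
#include "syntaxnet/parser_transitions.h"
#include "syntaxnet/sentence.pb.h"
#include "syntaxnet/term_frequency_map.h"

namespace syntaxnet {

// Segments `sentence` in place by replaying the gold START/MERGE actions.
void SegmentWithGoldActions(Sentence *sentence,
                            const TermFrequencyMap *label_map) {
  std::unique_ptr<ParserTransitionSystem> system(
      ParserTransitionSystem::Create("binary-segment-transitions"));
  BinarySegmentState *segment_state = static_cast<BinarySegmentState *>(
      system->NewTransitionState(/*train_mode=*/true));
  // As in the tests below, the ParserState is handed the transition state.
  ParserState state(sentence, segment_state, label_map);
  while (!system->IsFinalState(state)) {
    system->PerformActionWithoutHistory(system->GetNextGoldAction(state),
                                        &state);
  }
  // Rewrite the per-character tokens as segmented words.
  segment_state->AddParseToDocument(state, /*rewrite_root_labels=*/false,
                                    sentence);
}

}  // namespace syntaxnet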

+ 111 - 0
syntaxnet/syntaxnet/binary_segment_transitions_test.cc

@@ -0,0 +1,111 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/binary_segment_state.h"
+#include "syntaxnet/parser_state.h"
+#include "syntaxnet/parser_transitions.h"
+#include "syntaxnet/term_frequency_map.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace syntaxnet {
+
+class SegmentationTransitionTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    transition_system_ = std::unique_ptr<ParserTransitionSystem>(
+        ParserTransitionSystem::Create("binary-segment-transitions"));
+
+    // Prepare a sentence.
+    const char *str_sentence = "text: '因为 有 这样' "
+        "token { word: '因' start: 0 end: 2 break_level: SPACE_BREAK } "
+        "token { word: '为' start: 3 end: 5 break_level: NO_BREAK } "
+        "token { word: ' ' start: 6 end: 6 break_level: SPACE_BREAK } "
+        "token { word: '有' start: 7 end: 9 break_level: SPACE_BREAK } "
+        "token { word: ' ' start: 10 end: 10 break_level: SPACE_BREAK } "
+        "token { word: '这' start: 11 end: 13 break_level: SPACE_BREAK } "
+        "token { word: '样' start: 14 end: 16 break_level: NO_BREAK } ";
+    sentence_ = std::unique_ptr<Sentence>(new Sentence());
+    TextFormat::ParseFromString(str_sentence, sentence_.get());
+  }
+
+  void CheckStarts(const ParserState &state, const vector<int> &target) {
+    ASSERT_EQ(state.StackSize(), target.size());
+    for (int i = 0; i < state.StackSize(); ++i) {
+      EXPECT_EQ(state.Stack(i), target[i]);
+    }
+  }
+
+  // The test sentence, transition system, and label map.
+  std::unique_ptr<Sentence> sentence_;
+  std::unique_ptr<ParserTransitionSystem> transition_system_;
+  TermFrequencyMap label_map_;
+};
+
+TEST_F(SegmentationTransitionTest, GoldNextActionTest) {
+  BinarySegmentState *segment_state = static_cast<BinarySegmentState *>(
+      transition_system_->NewTransitionState(true));
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+
+  // Do segmentation by following the gold actions.
+  while (transition_system_->IsFinalState(state) == false) {
+    ParserAction action = transition_system_->GetNextGoldAction(state);
+    transition_system_->PerformActionWithoutHistory(action, &state);
+  }
+
+  // Test STARTs.
+  CheckStarts(state, {5, 4, 3, 2, 0});
+
+  // Test the annotated tokens.
+  segment_state->AddParseToDocument(state, false, sentence_.get());
+  ASSERT_EQ(sentence_->token_size(), 3);
+  EXPECT_EQ(sentence_->token(0).word(), "因为");
+  EXPECT_EQ(sentence_->token(1).word(), "有");
+  EXPECT_EQ(sentence_->token(2).word(), "这样");
+
+  // Test start/end annotation of each token.
+  EXPECT_EQ(sentence_->token(0).start(), 0);
+  EXPECT_EQ(sentence_->token(0).end(), 5);
+  EXPECT_EQ(sentence_->token(1).start(), 7);
+  EXPECT_EQ(sentence_->token(1).end(), 9);
+  EXPECT_EQ(sentence_->token(2).start(), 11);
+  EXPECT_EQ(sentence_->token(2).end(), 16);
+}
+
+TEST_F(SegmentationTransitionTest, DefaultActionTest) {
+  BinarySegmentState *segment_state = static_cast<BinarySegmentState *>(
+      transition_system_->NewTransitionState(true));
+  ParserState state(sentence_.get(), segment_state, &label_map_);
+
+  // Do segmentation by following the default action.
+  while (transition_system_->IsFinalState(state) == false) {
+    ParserAction action = transition_system_->GetDefaultAction(state);
+    transition_system_->PerformActionWithoutHistory(action, &state);
+  }
+
+  // Every character should be a START.
+  CheckStarts(state, {6, 5, 4, 3, 2, 1, 0});
+
+  // Every non-space character should be a word.
+  segment_state->AddParseToDocument(state, false, sentence_.get());
+  ASSERT_EQ(sentence_->token_size(), 5);
+  EXPECT_EQ(sentence_->token(0).word(), "因");
+  EXPECT_EQ(sentence_->token(1).word(), "为");
+  EXPECT_EQ(sentence_->token(2).word(), "有");
+  EXPECT_EQ(sentence_->token(3).word(), "这");
+  EXPECT_EQ(sentence_->token(4).word(), "样");
+}
+
+}  // namespace syntaxnet

+ 845 - 0
syntaxnet/syntaxnet/char_properties.cc

@@ -0,0 +1,845 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// char_properties.cc - define is_X() tests for various character properties
+//
+// See char_properties.h for how to write a character property.
+//
+// References for the char sets below:
+//
+// . http://www.unicode.org/Public/UNIDATA/PropList.txt
+//
+//   Large (but not exhaustive) list of Unicode chars and their "properties"
+//   (e.g., the property "Pi" = an initial quote punctuation char).
+//
+// . http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
+//
+//   Defines the list of properties, such as "Pi", used in the above list.
+//
+// . http://www.unipad.org/unimap/index.php?param_char=XXXX&page=detail
+//
+//   Gives detail about a particular character code.
+//   XXXX is a 4-hex-digit Unicode character code.
+//
+// . http://www.unicode.org/Public/UNIDATA/UCD.html
+//
+//   General reference for Unicode characters.
+//
+
+#include "syntaxnet/char_properties.h"
+
+#include <ctype.h>  // for ispunct, isspace
+#include <memory>
+#include <utility>
+#include <vector>  // for vector
+
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "third_party/utf/utf.h"      // for runetochar, ::UTFmax, Rune
+#include "util/utf8/unilib.h"  // for IsValidCodepoint, etc
+#include "util/utf8/unilib_utf8_utils.h"
+
+//============================================================
+// CharPropertyImplementation
+//
+
+// A CharPropertyImplementation stores a set of Unicode characters,
+// encoded in UTF-8, as a trie.  The trie is represented as a vector
+// of nodes.  Each node is a 256-element array that specifies what to
+// do with one byte of the UTF-8 sequence.  Each element n of a node
+// is one of:
+//  n = 0,  indicating that the Property is not true of any
+//          character whose UTF-8 encoding includes this byte at
+//          this position
+//  n = -1, indicating that the Property is true for the UTF-8 sequence
+//          that ends with this byte.
+//  n > 0,  indicating the index of the row that describes the
+//          remaining bytes in the UTF-8 sequence.
+//
+// The only operation that needs to be fast is HoldsFor, which tests
+// whether a character has a given property. We use each byte of the
+// character's UTF-8 encoding to index into a row. If the value is 0,
+// then the property is not true for the character. (We might discover
+// this even before getting to the end of the sequence.) If the value
+// is -1, then the property is true for this character. Otherwise,
+// the value is the index of another row, which we index using the next
+// byte in the sequence, and so on. The design of UTF-8 prevents
+// ambiguities here; no prefix of a UTF-8 sequence is a valid UTF-8
+// sequence.
+//
+// While it is possible to implement an iterator for this representation,
+// it is much easier to use set<char32> for this purpose. In fact, we
+// would use that as the entire representation, were it not for concerns
+// that HoldsFor might be slower.
+
+namespace syntaxnet {
+
+struct CharPropertyImplementation {
+  unordered_set<char32> chars;
+  vector<vector<int> > rows;
+  CharPropertyImplementation() {
+    rows.reserve(10);
+    rows.resize(1);
+    rows[0].resize(256, 0);
+  }
+  void AddChar(char *buf, int len) {
+    int n = 0;  // row index
+    for (int i = 0; i < len; ++i) {
+      int ch = reinterpret_cast<unsigned char *>(buf)[i];
+      int m = rows[n][ch];
+      if (m > 0) {
+        CHECK_LT(i, len - 1)
+            << " : " << (i + 1) << "-byte UTF-8 sequence "
+            << "(" << tensorflow::str_util::CEscape(string(buf, i + 1)) << ")"
+            << " is prefix of previously-seen UTF-8 sequence(s)";
+        n = m;
+      } else if (i == len - 1) {
+        rows[n][ch] = -1;
+      } else {
+        CHECK_EQ(m, 0) << " : UTF-8 sequence is extension of previously-seen "
+                       << (i + 1) << "-byte UTF-8 sequence "
+                       << "("
+                       << tensorflow::str_util::CEscape(string(buf, i + 1))
+                       << ")";
+        int a = rows.size();
+        rows.resize(a + 1);
+        rows[a].resize(256, 0);
+        rows[n][ch] = a;
+        n = a;
+      }
+    }
+  }
+
+  bool HoldsFor(const char *buf) const {
+    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(buf);
+
+    // Lookup each byte of the UTF-8 sequence, starting in row 0.
+    int n = rows[0][*bytes];
+    if (n == 0) return false;
+    if (n == -1) return true;
+
+    // If the value is not 0 or -1, then it is the index of the row for the
+    // second byte in the sequence.
+    n = rows[n][*++bytes];
+    if (n == 0) return false;
+    if (n == -1) return true;
+    n = rows[n][*++bytes];  // Likewise for the third byte.
+    if (n == 0) return false;
+    if (n == -1) return true;
+    n = rows[n][*++bytes];  // Likewise for the fourth byte.
+    if (n == 0) return false;
+
+    // Since there can be at most 4 bytes in the sequence, n must be -1.
+    return true;
+
+    // Implementation note: it is possible (and perhaps clearer) to write this
+    // code as a loop, "for (int i = 0; i < 4; ++i) ...", but the TestHoldsFor
+    // benchmark results indicate that doing so produces slower code for
+    // anything other than short 7-bit ASCII strings (< 512 bytes). This is
+    // mysterious, since the compiler unrolls the loop, producing code that
+    // is almost the same as what we have here, except for the shortcut on
+    // the 4th byte.
+  }
+};
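
To make the row encoding above concrete, a hypothetical standalone sketch (not part of this commit; the TrieSketchHoldsForPound helper is invented for illustration) builds the trie for a single 2-byte character, U+00A3 (UTF-8: 0xC2 0xA3), and mirrors the HoldsFor lookup:

#include <vector>

// Returns true, demonstrating the trie lookup for U+00A3 (0xC2 0xA3).
bool TrieSketchHoldsForPound() {
  std::vector<std::vector<int> > rows(2, std::vector<int>(256, 0));
  rows[0][0xC2] = 1;   // First byte: continue into row 1.
  rows[1][0xA3] = -1;  // Second byte: the property holds for this sequence.
  int n = rows[0][0xC2];                  // Selects row 1.
  return n != 0 && rows[n][0xA3] == -1;   // Any other byte hits 0 and fails.
}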
+
+//============================================================
+// CharProperty - a property that holds for selected Unicode chars
+//
+
+CharProperty::CharProperty(const char *name,
+                           const int *unicodes,
+                           int num_unicodes)
+    : name_(name),
+      impl_(new CharPropertyImplementation) {
+  // Initialize CharProperty to its char set.
+  AddCharSpec(unicodes, num_unicodes);
+}
+
+CharProperty::CharProperty(const char *name, CharPropertyInitializer *init_fn)
+    : name_(name),
+      impl_(new CharPropertyImplementation) {
+  (*init_fn)(this);
+}
+
+CharProperty::~CharProperty() {
+  delete impl_;
+}
+
+void CharProperty::AddChar(int c) {
+  CheckUnicodeVal(c);
+  impl_->chars.insert(c);
+
+  char buf[UTFmax];
+  Rune r = c;
+  int len = runetochar(buf, &r);
+  impl_->AddChar(buf, len);
+}
+
+void CharProperty::AddCharRange(int c1, int c2) {
+  for (int c = c1; c <= c2; ++c) {
+    AddChar(c);
+  }
+}
+
+void CharProperty::AddAsciiPredicate(AsciiPredicate *pred) {
+  for (int c = 0; c < 256; ++c) {
+    if ((*pred)(c)) {
+      AddChar(c);
+    }
+  }
+}
+
+void CharProperty::AddCharProperty(const char *propname) {
+  const CharProperty *prop = CharProperty::Lookup(propname);
+  CHECK(prop != NULL) << ": unknown char property \"" << propname
+                      << "\" in " << name_;
+  int c = -1;
+  while ((c = prop->NextElementAfter(c)) >= 0) {
+    AddChar(c);
+  }
+}
+
+void CharProperty::AddCharSpec(const int *unicodes, int num_unicodes) {
+  for (int i = 0; i < num_unicodes; ++i) {
+    if (i + 3 < num_unicodes && unicodes[i] == kPreUnicodeRange &&
+        unicodes[i + 3] == kPostUnicodeRange) {
+      // Range of unicode values
+      int lower = unicodes[i + 1];
+      int upper = unicodes[i + 2];
+      i += 3;  // i will be incremented once more at top of loop
+      CHECK(lower <= upper) << ": invalid char range in " << name_
+                            << ": [" << UnicodeToString(lower) << ", "
+                            << UnicodeToString(upper) << "]";
+      AddCharRange(lower, upper);
+    } else {
+      AddChar(unicodes[i]);
+    }
+  }
+}
+
+bool CharProperty::HoldsFor(int c) const {
+  if (!UniLib::IsValidCodepoint(c)) return false;
+  char buf[UTFmax];
+  Rune r = c;
+  runetochar(buf, &r);
+  return impl_->HoldsFor(buf);
+}
+
+bool CharProperty::HoldsFor(const char *str, int len) const {
+  // UniLib::IsUTF8ValidCodepoint also checks for structural validity.
+  return len > 0 && UniLib::IsUTF8ValidCodepoint(StringPiece(str, len)) &&
+         impl_->HoldsFor(str);
+}
+
+// Return -1 or the smallest Unicode char greater than c for which
+// the CharProperty holds.  Expects c == -1 or HoldsFor(c).
+int CharProperty::NextElementAfter(int c) const {
+  DCHECK(c == -1 || HoldsFor(c));
+  unordered_set<char32>::const_iterator end = impl_->chars.end();
+  if (c < 0) {
+    unordered_set<char32>::const_iterator it = impl_->chars.begin();
+    if (it == end) return -1;
+    return *it;
+  }
+  char32 r = c;
+  unordered_set<char32>::const_iterator it = impl_->chars.find(r);
+  if (it == end) return -1;
+  it++;
+  if (it == end) return -1;
+  return *it;
+}
+
+REGISTER_CLASS_REGISTRY("char property wrapper", CharPropertyWrapper);
+
+const CharProperty *CharProperty::Lookup(const char *subclass) {
+  // Create a CharPropertyWrapper object and delete it.  We only care about
+  // the CharProperty it provides.
+  std::unique_ptr<CharPropertyWrapper> wrapper(
+      CharPropertyWrapper::Create(subclass));
+  if (wrapper.get() == NULL) {
+    LOG(ERROR) << "CharPropertyWrapper not found for subclass: "
+               << "\"" << subclass << "\"";
+    return NULL;
+  }
+  return wrapper->GetCharProperty();
+}
+
+// Check that a given Unicode value is in range.
+void CharProperty::CheckUnicodeVal(int c) const {
+  CHECK(UniLib::IsValidCodepoint(c))
+      << "Unicode in " << name_ << " out of range: " << UnicodeToString(c);
+}
+
+// Converts a Unicode value to a string (for error messages).
+string CharProperty::UnicodeToString(int c) {
+  const char *fmt;
+
+  if (c < 0) {
+    fmt = "%d";      // out-of-range
+  } else if (c <= 0x7f) {
+    fmt = "'%c'";    // ascii
+  } else if (c <= 0xffff) {
+    fmt = "0x%04X";  // 4 hex digits
+  } else {
+    fmt = "0x%X";    // also out-of-range
+  }
+
+  return tensorflow::strings::Printf(fmt, c);
+}
+
+//======================================================================
+// Expression-level punctuation
+//
+
+// Punctuation that starts a sentence.
+DEFINE_CHAR_PROPERTY_AS_SET(start_sentence_punc,
+  0x00A1,  // Spanish inverted exclamation mark
+  0x00BF,  // Spanish inverted question mark
+)
+
+// Punctuation that ends a sentence.
+// Based on: http://www.unicode.org/unicode/reports/tr29/#Sentence_Boundaries
+DEFINE_CHAR_PROPERTY_AS_SET(end_sentence_punc,
+  '.',
+  '!',
+  '?',
+  0x055C,  // Armenian exclamation mark
+  0x055E,  // Armenian question mark
+  0x0589,  // Armenian full stop
+  0x061F,  // Arabic question mark
+  0x06D4,  // Arabic full stop
+  0x0700,  // Syriac end of paragraph
+  0x0701,  // Syriac supralinear full stop
+  0x0702,  // Syriac sublinear full stop
+  RANGE(0x0964, 0x0965),  // Devanagari danda..Devanagari double danda
+  0x1362,  // Ethiopic full stop
+  0x1367,  // Ethiopic question mark
+  0x1368,  // Ethiopic paragraph separator
+  0x104A,  // Myanmar sign little section
+  0x104B,  // Myanmar sign section
+  0x166E,  // Canadian syllabics full stop
+  0x17d4,  // Khmer sign khan
+  0x1803,  // Mongolian full stop
+  0x1809,  // Mongolian Manchu full stop
+  0x1944,  // Limbu exclamation mark
+  0x1945,  // Limbu question mark
+  0x203C,  // double exclamation mark
+  0x203D,  // interrobang
+  0x2047,  // double question mark
+  0x2048,  // question exclamation mark
+  0x2049,  // exclamation question mark
+  0x3002,  // ideographic full stop
+  0x037E,  // Greek question mark
+  0xFE52,  // small full stop
+  0xFE56,  // small question mark
+  0xFE57,  // small exclamation mark
+  0xFF01,  // fullwidth exclamation mark
+  0xFF0E,  // fullwidth full stop
+  0xFF1F,  // fullwidth question mark
+  0xFF61,  // halfwidth ideographic full stop
+  0x2026,  // ellipsis
+)
+
+// Punctuation, such as parens, that opens a "nested expression" of text.
+DEFINE_CHAR_PROPERTY_AS_SET(open_expr_punc,
+  '(',
+  '[',
+  '<',
+  '{',
+  0x207D,  // superscript left parenthesis
+  0x208D,  // subscript left parenthesis
+  0x27E6,  // mathematical left white square bracket
+  0x27E8,  // mathematical left angle bracket
+  0x27EA,  // mathematical left double angle bracket
+  0x2983,  // left white curly bracket
+  0x2985,  // left white parenthesis
+  0x2987,  // Z notation left image bracket
+  0x2989,  // Z notation left binding bracket
+  0x298B,  // left square bracket with underbar
+  0x298D,  // left square bracket with tick in top corner
+  0x298F,  // left square bracket with tick in bottom corner
+  0x2991,  // left angle bracket with dot
+  0x2993,  // left arc less-than bracket
+  0x2995,  // double left arc greater-than bracket
+  0x2997,  // left black tortoise shell bracket
+  0x29D8,  // left wiggly fence
+  0x29DA,  // left double wiggly fence
+  0x29FC,  // left-pointing curved angle bracket
+  0x3008,  // CJK left angle bracket
+  0x300A,  // CJK left double angle bracket
+  0x3010,  // CJK left black lenticular bracket
+  0x3014,  // CJK left tortoise shell bracket
+  0x3016,  // CJK left white lenticular bracket
+  0x3018,  // CJK left white tortoise shell bracket
+  0x301A,  // CJK left white square bracket
+  0xFD3E,  // Ornate left parenthesis
+  0xFE59,  // small left parenthesis
+  0xFE5B,  // small left curly bracket
+  0xFF08,  // fullwidth left parenthesis
+  0xFF3B,  // fullwidth left square bracket
+  0xFF5B,  // fullwidth left curly bracket
+)
+
+// Punctuation, such as parens, that closes a "nested expression" of text.
+DEFINE_CHAR_PROPERTY_AS_SET(close_expr_punc,
+  ')',
+  ']',
+  '>',
+  '}',
+  0x207E,  // superscript right parenthesis
+  0x208E,  // subscript right parenthesis
+  0x27E7,  // mathematical right white square bracket
+  0x27E9,  // mathematical right angle bracket
+  0x27EB,  // mathematical right double angle bracket
+  0x2984,  // right white curly bracket
+  0x2986,  // right white parenthesis
+  0x2988,  // Z notation right image bracket
+  0x298A,  // Z notation right binding bracket
+  0x298C,  // right square bracket with underbar
+  0x298E,  // right square bracket with tick in top corner
+  0x2990,  // right square bracket with tick in bottom corner
+  0x2992,  // right angle bracket with dot
+  0x2994,  // right arc greater-than bracket
+  0x2996,  // double right arc less-than bracket
+  0x2998,  // right black tortoise shell bracket
+  0x29D9,  // right wiggly fence
+  0x29DB,  // right double wiggly fence
+  0x29FD,  // right-pointing curved angle bracket
+  0x3009,  // CJK right angle bracket
+  0x300B,  // CJK right double angle bracket
+  0x3011,  // CJK right black lenticular bracket
+  0x3015,  // CJK right tortoise shell bracket
+  0x3017,  // CJK right white lenticular bracket
+  0x3019,  // CJK right white tortoise shell bracket
+  0x301B,  // CJK right white square bracket
+  0xFD3F,  // Ornate right parenthesis
+  0xFE5A,  // small right parenthesis
+  0xFE5C,  // small right curly bracket
+  0xFF09,  // fullwidth right parenthesis
+  0xFF3D,  // fullwidth right square bracket
+  0xFF5D,  // fullwidth right curly bracket
+)
+
+// Chars that open a quotation.
+// Based on: http://www.unicode.org/uni2book/ch06.pdf
+DEFINE_CHAR_PROPERTY_AS_SET(open_quote,
+  '"',
+  '\'',
+  '`',
+  0xFF07,  // fullwidth apostrophe
+  0xFF02,  // fullwidth quotation mark
+  0x2018,  // left single quotation mark (English, others)
+  0x201C,  // left double quotation mark (English, others)
+  0x201B,  // single high-reversed-9 quotation mark (PropList.txt)
+  0x201A,  // single low-9 quotation mark (Czech, German, Slovak)
+  0x201E,  // double low-9 quotation mark (Czech, German, Slovak)
+  0x201F,  // double high-reversed-9 quotation mark (PropList.txt)
+  0x2019,  // right single quotation mark (Danish, Finnish, Swedish, Norw.)
+  0x201D,  // right double quotation mark (Danish, Finnish, Swedish, Norw.)
+  0x2039,  // single left-pointing angle quotation mark (French, others)
+  0x00AB,  // left-pointing double angle quotation mark (French, others)
+  0x203A,  // single right-pointing angle quotation mark (Slovenian, others)
+  0x00BB,  // right-pointing double angle quotation mark (Slovenian, others)
+  0x300C,  // left corner bracket (East Asian languages)
+  0xFE41,  // presentation form for vertical left corner bracket
+  0xFF62,  // halfwidth left corner bracket (East Asian languages)
+  0x300E,  // left white corner bracket (East Asian languages)
+  0xFE43,  // presentation form for vertical left white corner bracket
+  0x301D,  // reversed double prime quotation mark (East Asian langs, horiz.)
+)
+
+// Chars that close a quotation.
+// Based on: http://www.unicode.org/uni2book/ch06.pdf
+DEFINE_CHAR_PROPERTY_AS_SET(close_quote,
+  '\'',
+  '"',
+  '`',
+  0xFF07,  // fullwidth apostrophe
+  0xFF02,  // fullwidth quotation mark
+  0x2019,  // right single quotation mark (English, others)
+  0x201D,  // right double quotation mark (English, others)
+  0x2018,  // left single quotation mark (Czech, German, Slovak)
+  0x201C,  // left double quotation mark (Czech, German, Slovak)
+  0x203A,  // single right-pointing angle quotation mark (French, others)
+  0x00BB,  // right-pointing double angle quotation mark (French, others)
+  0x2039,  // single left-pointing angle quotation mark (Slovenian, others)
+  0x00AB,  // left-pointing double angle quotation mark (Slovenian, others)
+  0x300D,  // right corner bracket (East Asian languages)
+  0xfe42,  // presentation form for vertical right corner bracket
+  0xFF63,  // halfwidth right corner bracket (East Asian languages)
+  0x300F,  // right white corner bracket (East Asian languages)
+  0xfe44,  // presentation form for vertical right white corner bracket
+  0x301F,  // low double prime quotation mark (East Asian languages)
+  0x301E,  // close double prime (East Asian languages written horizontally)
+)
+
+// Punctuation chars that open an expression or a quotation.
+DEFINE_CHAR_PROPERTY(open_punc, prop) {
+  prop->AddCharProperty("open_expr_punc");
+  prop->AddCharProperty("open_quote");
+}
+
+// Punctuation chars that close an expression or a quotation.
+DEFINE_CHAR_PROPERTY(close_punc, prop) {
+  prop->AddCharProperty("close_expr_punc");
+  prop->AddCharProperty("close_quote");
+}
+
+// Punctuation chars that can come at the beginning of a sentence.
+DEFINE_CHAR_PROPERTY(leading_sentence_punc, prop) {
+  prop->AddCharProperty("open_punc");
+  prop->AddCharProperty("start_sentence_punc");
+}
+
+// Punctuation chars that can come at the end of a sentence.
+DEFINE_CHAR_PROPERTY(trailing_sentence_punc, prop) {
+  prop->AddCharProperty("close_punc");
+  prop->AddCharProperty("end_sentence_punc");
+}
+
+//======================================================================
+// Special symbols
+//
+
+// Currency symbols.
+// From: http://www.unicode.org/charts/PDF/U20A0.pdf
+DEFINE_CHAR_PROPERTY_AS_SET(currency_symbol,
+  '$',
+  // 0x00A2,  // cents (NB: typically FOLLOWS the amount)
+  0x00A3,  // pounds and liras
+  0x00A4,  // general currency sign
+  0x00A5,  // yen or yuan
+  0x0192,  // Dutch florin (latin small letter "f" with hook)
+  0x09F2,  // Bengali rupee mark
+  0x09F3,  // Bengali rupee sign
+  0x0AF1,  // Gujarati rupee sign
+  0x0BF9,  // Tamil rupee sign
+  0x0E3F,  // Thai baht
+  0x17DB,  // Khmer riel
+  0x20A0,  // alternative euro sign
+  0x20A1,  // Costa Rica, El Salvador (colon sign)
+  0x20A2,  // Brazilian cruzeiro
+  0x20A3,  // French Franc
+  0x20A4,  // alternative lira sign
+  0x20A5,  // mill sign (USA 1/10 cent)
+  0x20A6,  // Nigerian Naira
+  0x20A7,  // Spanish peseta
+  0x20A8,  // Indian rupee
+  0x20A9,  // Korean won
+  0x20AA,  // Israeli new sheqel
+  0x20AB,  // Vietnam dong
+  0x20AC,  // euro sign
+  0x20AD,  // Laotian kip
+  0x20AE,  // Mongolian tugrik
+  0x20AF,  // Greek drachma
+  0x20B0,  // German penny
+  0x20B1,  // Philippine peso (Mexican peso uses "$")
+  0x2133,  // Old German mark (script capital M)
+  0xFDFC,  // rial sign
+  0xFFE0,  // fullwidth cents
+  0xFFE1,  // fullwidth pounds
+  0xFFE5,  // fullwidth Japanese yen
+  0xFFE6,  // fullwidth Korean won
+)
+
+// Chinese bookquotes.
+// They look like "<<" and ">>" except that they are single UTF-8 chars
+// (U+300A, U+300B). These are used in Chinese as special
+// punctuation, referring to the title of a book, an article, a movie,
+// etc.  For example: "cellphone" means cellphone, but <<cellphone>>
+// means (exclusively) the movie.
+DEFINE_CHAR_PROPERTY_AS_SET(open_bookquote,
+ 0x300A
+)
+
+DEFINE_CHAR_PROPERTY_AS_SET(close_bookquote,
+ 0x300B
+)
+
+//======================================================================
+// Token-level punctuation
+//
+
+// Token-prefix symbols, excluding currency symbols -- glom on
+// to following token (esp. if no space after)
+DEFINE_CHAR_PROPERTY_AS_SET(noncurrency_token_prefix_symbol,
+  '#',
+  0x2116,  // numero sign ("No")
+)
+
+// Token-prefix symbols -- glom on to following token (esp. if no space after)
+DEFINE_CHAR_PROPERTY(token_prefix_symbol, prop) {
+  prop->AddCharProperty("currency_symbol");
+  prop->AddCharProperty("noncurrency_token_prefix_symbol");
+}
+
+// Token-suffix symbols -- glom on to preceding token (esp. if no space before)
+DEFINE_CHAR_PROPERTY_AS_SET(token_suffix_symbol,
+  '%',
+  0x066A,  // Arabic percent sign
+  0x2030,  // per mille
+  0x2031,  // per ten thousand
+  0x00A2,  // cents sign
+  0x2125,  // ounces sign
+  0x00AA,  // feminine ordinal indicator (Spanish)
+  0x00BA,  // masculine ordinal indicator (Spanish)
+  0x00B0,  // degrees
+  0x2109,  // degrees Fahrenheit
+  0x2103,  // degrees Celsius
+  0x2126,  // ohms
+  0x212A,  // Kelvin
+  0x212B,  // Angstroms ("A" with circle on top)
+  0x00A9,  // copyright
+  0x2117,  // sound recording copyright (circled "P")
+  0x2122,  // trade mark
+  0x00AE,  // registered trade mark
+  0x2120,  // service mark
+  0x2106,  // cada una ("c/a" == "each" in Spanish)
+  0x2020,  // dagger (can be used for footnotes)
+  0x2021,  // double dagger (can be used for footnotes)
+)
+
+// Subscripts
+DEFINE_CHAR_PROPERTY_AS_SET(subscript_symbol,
+  0x2080,  // subscript 0
+  0x2081,  // subscript 1
+  0x2082,  // subscript 2
+  0x2083,  // subscript 3
+  0x2084,  // subscript 4
+  0x2085,  // subscript 5
+  0x2086,  // subscript 6
+  0x2087,  // subscript 7
+  0x2088,  // subscript 8
+  0x2089,  // subscript 9
+  0x208A,  // subscript "+"
+  0x208B,  // subscript "-"
+  0x208C,  // subscript "="
+  0x208D,  // subscript "("
+  0x208E,  // subscript ")"
+)
+
+// Superscripts
+DEFINE_CHAR_PROPERTY_AS_SET(superscript_symbol,
+  0x2070,  // superscript 0
+  0x00B9,  // superscript 1
+  0x00B2,  // superscript 2
+  0x00B3,  // superscript 3
+  0x2074,  // superscript 4
+  0x2075,  // superscript 5
+  0x2076,  // superscript 6
+  0x2077,  // superscript 7
+  0x2078,  // superscript 8
+  0x2079,  // superscript 9
+  0x2071,  // superscript Latin small "i"
+  0x207A,  // superscript "+"
+  0x207B,  // superscript "-"
+  0x207C,  // superscript "="
+  0x207D,  // superscript "("
+  0x207E,  // superscript ")"
+  0x207F,  // superscript Latin small "n"
+)
+
+//======================================================================
+// General punctuation
+//
+
+// Connector punctuation
+// Code Pc from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+DEFINE_CHAR_PROPERTY_AS_SET(connector_punc,
+  0x30fb,  // Katakana middle dot
+  0xff65,  // halfwidth Katakana middle dot
+  0x2040,  // character tie
+)
+
+// Dashes
+// Code Pd from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+DEFINE_CHAR_PROPERTY_AS_SET(dash_punc,
+  '-',
+  '~',
+  0x058a,  // Armenian hyphen
+  0x1806,  // Mongolian todo soft hyphen
+  RANGE(0x2010, 0x2015),  // hyphen..horizontal bar
+  0x2053,  // swung dash -- from Table 6-3 of Unicode book
+  0x207b,  // superscript minus
+  0x208b,  // subscript minus
+  0x2212,  // minus sign
+  0x301c,  // wave dash
+  0x3030,  // wavy dash
+  RANGE(0xfe31, 0xfe32),  // presentation form for vertical em dash..en dash
+  0xfe58,  // small em dash
+  0xfe63,  // small hyphen-minus
+  0xff0d,  // fullwidth hyphen-minus
+)
+
+// Other punctuation
+// Code Po from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
+// NB: This list is not exhaustive.
+DEFINE_CHAR_PROPERTY_AS_SET(other_punc,
+  ',',
+  ':',
+  ';',
+  0x00b7,  // middle dot
+  0x0387,  // Greek ano teleia
+  0x05c3,  // Hebrew punctuation sof pasuq
+  0x060c,  // Arabic comma
+  0x061b,  // Arabic semicolon
+  0x066b,  // Arabic decimal separator
+  0x066c,  // Arabic thousands separator
+  RANGE(0x0703, 0x070a),  // Syriac contraction and others
+  0x070c,  // Syriac harklean metobelus
+  0x0e5a,  // Thai character angkhankhu
+  0x0e5b,  // Thai character khomut
+  0x0f08,  // Tibetan mark sbrul shad
+  RANGE(0x0f0d, 0x0f12),  // Tibetan mark shad..Tibetan mark rgya gram shad
+  0x1361,  // Ethiopic wordspace
+  RANGE(0x1363, 0x1366),  // other Ethiopic chars
+  0x166d,  // Canadian syllabics chi sign
+  RANGE(0x16eb, 0x16ed),  // Runic single punctuation..Runic cross punctuation
+  RANGE(0x17d5, 0x17d6),  // Khmer sign camnuc pii huuh and other
+  0x17da,  // Khmer sign koomut
+  0x1802,  // Mongolian comma
+  RANGE(0x1804, 0x1805),  // Mongolian four dots and other
+  0x1808,  // Mongolian manchu comma
+  0x3001,  // ideographic comma
+  RANGE(0xfe50, 0xfe51),  // small comma and others
+  RANGE(0xfe54, 0xfe55),  // small semicolon and other
+  0xff0c,  // fullwidth comma
+  RANGE(0xff0e, 0xff0f),  // fullwidth stop..fullwidth solidus
+  RANGE(0xff1a, 0xff1b),  // fullwidth colon..fullwidth semicolon
+  0xff64,  // halfwidth ideographic comma
+  0x2016,  // double vertical line
+  RANGE(0x2032, 0x2034),  // prime..triple prime
+  0xfe61,  // small asterisk
+  0xfe68,  // small reverse solidus
+  0xff3c,  // fullwidth reverse solidus
+)
+
+// All punctuation.
+// Code P from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+DEFINE_CHAR_PROPERTY(punctuation, prop) {
+  prop->AddCharProperty("open_punc");
+  prop->AddCharProperty("close_punc");
+  prop->AddCharProperty("leading_sentence_punc");
+  prop->AddCharProperty("trailing_sentence_punc");
+  prop->AddCharProperty("connector_punc");
+  prop->AddCharProperty("dash_punc");
+  prop->AddCharProperty("other_punc");
+  prop->AddAsciiPredicate(&ispunct);
+}
+
+//======================================================================
+// Separators
+//
+
+// Line separators
+// Code Zl from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+DEFINE_CHAR_PROPERTY_AS_SET(line_separator,
+  0x2028,                           // line separator
+)
+
+// Paragraph separators
+// Code Zp from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+DEFINE_CHAR_PROPERTY_AS_SET(paragraph_separator,
+  0x2029,                           // paragraph separator
+)
+
+// Space separators
+// Code Zs from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+DEFINE_CHAR_PROPERTY_AS_SET(space_separator,
+  0x0020,                           // space
+  0x00a0,                           // no-break space
+  0x1680,                           // Ogham space mark
+  0x180e,                           // Mongolian vowel separator
+  RANGE(0x2000, 0x200a),            // en quad..hair space
+  0x202f,                           // narrow no-break space
+  0x205f,                           // medium mathematical space
+  0x3000,                           // ideographic space
+
+  // Google additions
+  0xe5e5,                           // "private" char used as space in Chinese
+)
+
+// Separators -- all line, paragraph, and space separators.
+// Code Z from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+DEFINE_CHAR_PROPERTY(separator, prop) {
+  prop->AddCharProperty("line_separator");
+  prop->AddCharProperty("paragraph_separator");
+  prop->AddCharProperty("space_separator");
+  prop->AddAsciiPredicate(&isspace);
+}
+
+//======================================================================
+// Alphanumeric Characters
+//
+
+// Digits
+DEFINE_CHAR_PROPERTY_AS_SET(digit,
+  RANGE('0', '9'),
+  RANGE(0x0660, 0x0669),  // Arabic-Indic digits
+  RANGE(0x06F0, 0x06F9),  // Eastern Arabic-Indic digits
+)
+
+//======================================================================
+// Japanese Katakana
+//
+
+DEFINE_CHAR_PROPERTY_AS_SET(katakana,
+  0x3099,  // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
+  0x309A,  // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+  0x309B,  // KATAKANA-HIRAGANA VOICED SOUND MARK
+  0x309C,  // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+  RANGE(0x30A0, 0x30FF),  // Fullwidth Katakana
+  RANGE(0xFF65, 0xFF9F),  // Halfwidth Katakana
+)
+
+//======================================================================
+// BiDi Directional Formatting Codes
+//
+
+// See http://www.unicode.org/reports/tr9/ for a description of Bidi
+// and http://www.unicode.org/charts/PDF/U2000.pdf for the character codes.
+DEFINE_CHAR_PROPERTY_AS_SET(directional_formatting_code,
+  0x200E,  // LRM (Left-to-Right Mark)
+  0x200F,  // RLM (Right-to-Left Mark)
+  0x202A,  // LRE (Left-to-Right Embedding)
+  0x202B,  // RLE (Right-to-Left Embedding)
+  0x202C,  // PDF (Pop Directional Format)
+  0x202D,  // LRO (Left-to-Right Override)
+  0x202E,  // RLO (Right-to-Left Override)
+)
+
+//======================================================================
+// Special collections
+//
+
+// NB: This does not check for all punctuation and symbols in the
+// standard; just those listed in our code. See the definitions in
+// char_properties.cc
+DEFINE_CHAR_PROPERTY(punctuation_or_symbol, prop) {
+  prop->AddCharProperty("punctuation");
+  prop->AddCharProperty("subscript_symbol");
+  prop->AddCharProperty("superscript_symbol");
+  prop->AddCharProperty("token_prefix_symbol");
+  prop->AddCharProperty("token_suffix_symbol");
+}
+
+}  // namespace syntaxnet
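
A minimal sketch (not part of this change) of how the generated is_X() predicates compose, assuming the caller links against char_properties and its registered definitions:

    #include <cassert>

    #include "syntaxnet/char_properties.h"

    int main() {
      // '~' is listed under dash_punc, and "punctuation" aggregates dash_punc.
      assert(syntaxnet::is_dash_punc('~'));
      assert(syntaxnet::is_punctuation('~'));
      // U+20AC (euro sign) is a currency symbol, hence a token-prefix symbol.
      assert(syntaxnet::is_currency_symbol(0x20AC));
      assert(syntaxnet::is_token_prefix_symbol(0x20AC));
      // The two-argument overload takes a single UTF-8 encoded character.
      assert(syntaxnet::is_punctuation(",", 1));
      return 0;
    }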

+ 362 - 0
syntaxnet/syntaxnet/char_properties.h

@@ -0,0 +1,362 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// char_properties.h - define is_X() tests for various character properties
+//
+// Character properties can be defined in two ways:
+//
+// (1) Set-based:
+//
+//     Enumerate the chars that have the property.  Example:
+//
+//       DEFINE_CHAR_PROPERTY_AS_SET(my_fave,
+//         RANGE('0', '9'),
+//         '\'',
+//         0x00BF,   // Spanish inverted question mark
+//       )
+//
+//     Characters are expressed as Unicode code points; note that ASCII codes
+//     are a subset.  RANGE() specifies an inclusive range of code points.
+//
+//     This defines two functions:
+//
+//       bool is_my_fave(const char *str, int len)
+//       bool is_my_fave(int c)
+//
+//     Each returns true for precisely the 12 characters specified above.
+//     Each takes a *single* UTF8 char as its argument -- the first expresses
+//     it as a char * and a length, the second as a Unicode code point.
+//     Please do not pass a string of multiple UTF8 chars to the first one.
+//
+//     To make is_my_fave() externally accessible, put in your .h file:
+//
+//       DECLARE_CHAR_PROPERTY(my_fave)
+//
+// (2) Function-based:
+//
+//     Specify a function that assigns the desired chars to a CharProperty
+//     object.  Example:
+//
+//       DEFINE_CHAR_PROPERTY(my_other_fave, prop) {
+//         for (int i = '0'; i <= '9'; i += 2) {
+//           prop->AddChar(i);
+//         }
+//         prop->AddAsciiPredicate(&ispunct);
+//         prop->AddCharProperty("currency_symbol");
+//       }
+//
+//     This defines a function of one arg: CharProperty *prop.  The function
+//     calls various CharProperty methods to populate the prop.  The last call
+//     above, AddCharProperty(), adds the chars from another char property
+//     ("currency_symbol").
+//
+//     As in the set-based case, put a DECLARE_CHAR_PROPERTY(my_other_fave)
+//     in your .h if you want is_my_other_fave() to be externally accessible.
+//
+
+#ifndef SYNTAXNET_CHAR_PROPERTIES_H_
+#define SYNTAXNET_CHAR_PROPERTIES_H_
+
+#include <string>  // for string
+
+#include "syntaxnet/registry.h"
+#include "syntaxnet/utils.h"
+
+// =====================================================================
+// Registry for accessing CharProperties by name
+//
+// This is for internal use by the CharProperty class and macros; callers
+// should not use it explicitly.
+//
+
+namespace syntaxnet {
+
+class CharProperty;   // forward declaration
+
+// Wrapper around a CharProperty, allowing it to be stored in a registry.
+struct CharPropertyWrapper : RegisterableClass<CharPropertyWrapper> {
+  virtual ~CharPropertyWrapper() { }
+  virtual CharProperty *GetCharProperty() = 0;
+};
+
+#define REGISTER_CHAR_PROPERTY_WRAPPER(type, component) \
+  REGISTER_CLASS_COMPONENT(CharPropertyWrapper, type, component)
+
+#define REGISTER_CHAR_PROPERTY(lsp, name)                         \
+  struct name##CharPropertyWrapper : public CharPropertyWrapper { \
+    CharProperty *GetCharProperty() { return lsp.get(); }         \
+  };                                                              \
+  REGISTER_CHAR_PROPERTY_WRAPPER(#name, name##CharPropertyWrapper)
+
+// =====================================================================
+// Macros for defining character properties
+//
+
+// Define is_X() functions to test whether a single UTF8 character has
+// the 'X' char prop.
+#define DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(lsp, name) \
+  bool is_##name(const char *str, int len) {                                 \
+    return lsp->HoldsFor(str, len);                                          \
+  }                                                                          \
+  bool is_##name(int c) {                                                    \
+    return lsp->HoldsFor(c);                                                 \
+  }
+
+// Define a char property by enumerating the unicode char points,
+// or RANGE()s thereof, for which it holds.  Example:
+//
+//   DEFINE_CHAR_PROPERTY_AS_SET(my_fave,
+//     'q',
+//     RANGE('0', '9'),
+//     0x20AB,
+//   )
+//
+// "..." is a GNU extension.
+#define DEFINE_CHAR_PROPERTY_AS_SET(name, unicodes...)                         \
+  static const int k_##name##_unicodes[] = {unicodes};                         \
+  static utils::LazyStaticPtr<CharProperty, const char *, const int *, size_t> \
+      name##_char_property = {#name, k_##name##_unicodes,                      \
+                              arraysize(k_##name##_unicodes)};                 \
+  REGISTER_CHAR_PROPERTY(name##_char_property, name);                          \
+  DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(name##_char_property, name)
+
+// Specify a range (inclusive) of Unicode character values.
+// Example: RANGE('0', '9') specifies the 10 digits.
+// For use as an element in a DEFINE_CHAR_PROPERTY_AS_SET() list.
+static const int kPreUnicodeRange = -1;
+static const int kPostUnicodeRange = -2;
+#define RANGE(lower, upper) \
+  kPreUnicodeRange, lower, upper, kPostUnicodeRange
+
+// A function to initialize a CharProperty.
+typedef void CharPropertyInitializer(CharProperty *prop);
+
+// Define a char property by specifying a block of code that initializes it.
+// Example:
+//
+//   DEFINE_CHAR_PROPERTY(my_other_fave, prop) {
+//     for (int i = '0'; i <= '9'; i += 2) {
+//       prop->AddChar(i);
+//     }
+//     prop->AddAsciiPredicate(&ispunct);
+//     prop->AddCharProperty("currency_symbol");
+//   }
+//
+#define DEFINE_CHAR_PROPERTY(name, charpropvar)                       \
+  static void init_##name##_char_property(CharProperty *charpropvar); \
+  static utils::LazyStaticPtr<CharProperty, const char *,             \
+                              CharPropertyInitializer *>              \
+      name##_char_property = {#name, &init_##name##_char_property};   \
+  REGISTER_CHAR_PROPERTY(name##_char_property, name);                 \
+  DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(name##_char_property, name)     \
+  static void init_##name##_char_property(CharProperty *charpropvar)
+
+// =====================================================================
+// Macro for declaring character properties
+//
+
+#define DECLARE_CHAR_PROPERTY(name) \
+  extern bool is_##name(const char *str, int len);                           \
+  extern bool is_##name(int c);
+
+// ===========================================================
+// CharProperty - a property that holds for selected Unicode chars
+//
+// A CharProperty is semantically equivalent to set<char32>.
+//
+// The characters for which a CharProperty holds are represented as a trie,
+// i.e., a tree that is indexed by successive bytes of the UTF-8 encoding
+// of the characters.  This permits fast lookup (HoldsFor).
+//
+
+// A function that defines a subset of [0..255], e.g., isspace.
+typedef int AsciiPredicate(int c);
+
+class CharProperty {
+ public:
+  // Constructor for set-based char properties.
+  CharProperty(const char *name, const int *unicodes, int num_unicodes);
+
+  // Constructor for function-based char properties.
+  CharProperty(const char *name, CharPropertyInitializer *init_fn);
+
+  virtual ~CharProperty();
+
+  // Various ways of adding chars to a CharProperty; for use only in
+  // CharPropertyInitializer functions.
+  void AddChar(int c);
+  void AddCharRange(int c1, int c2);
+  void AddAsciiPredicate(AsciiPredicate *pred);
+  void AddCharProperty(const char *name);
+  void AddCharSpec(const int *unicodes, int num_unicodes);
+
+  // Return true iff the CharProperty holds for a single given UTF8 char.
+  bool HoldsFor(const char *str, int len) const;
+
+  // Return true iff the CharProperty holds for a single given Unicode char.
+  bool HoldsFor(int c) const;
+
+  // You can use this to enumerate the set elements (it was easier
+  // than defining a real iterator).  Returns -1 if there are no more.
+  // Call with -1 to get the first element.  Expects c == -1 or HoldsFor(c).
+  int NextElementAfter(int c) const;
+
+  // Return NULL or the CharProperty with the given name.  Looks up the name
+  // in a CharProperty registry.
+  static const CharProperty *Lookup(const char *name);
+
+ private:
+  void CheckUnicodeVal(int c) const;
+  static string UnicodeToString(int c);
+
+  const char *name_;
+  struct CharPropertyImplementation *impl_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CharProperty);
+};
+
+//======================================================================
+// Expression-level punctuation
+//
+
+// Punctuation that starts a sentence.
+DECLARE_CHAR_PROPERTY(start_sentence_punc);
+
+// Punctuation that ends a sentence.
+DECLARE_CHAR_PROPERTY(end_sentence_punc);
+
+// Punctuation, such as parens, that opens a "nested expression" of text.
+DECLARE_CHAR_PROPERTY(open_expr_punc);
+
+// Punctuation, such as parens, that closes a "nested expression" of text.
+DECLARE_CHAR_PROPERTY(close_expr_punc);
+
+// Chars that open a quotation.
+DECLARE_CHAR_PROPERTY(open_quote);
+
+// Chars that close a quotation.
+DECLARE_CHAR_PROPERTY(close_quote);
+
+// Punctuation chars that open an expression or a quotation.
+DECLARE_CHAR_PROPERTY(open_punc);
+
+// Punctuation chars that close an expression or a quotation.
+DECLARE_CHAR_PROPERTY(close_punc);
+
+// Punctuation chars that can come at the beginning of a sentence.
+DECLARE_CHAR_PROPERTY(leading_sentence_punc);
+
+// Punctuation chars that can come at the end of a sentence.
+DECLARE_CHAR_PROPERTY(trailing_sentence_punc);
+
+//======================================================================
+// Token-level punctuation
+//
+
+// Token-prefix symbols -- glom on to following token
+// (esp. if no space after) -- except for currency symbols.
+DECLARE_CHAR_PROPERTY(noncurrency_token_prefix_symbol);
+
+// Token-prefix symbols -- glom on to following token (esp. if no space after).
+DECLARE_CHAR_PROPERTY(token_prefix_symbol);
+
+// Token-suffix symbols -- glom on to preceding token (esp. if no space
+// before).
+DECLARE_CHAR_PROPERTY(token_suffix_symbol);
+
+// Subscripts.
+DECLARE_CHAR_PROPERTY(subscript_symbol);
+
+// Superscripts.
+DECLARE_CHAR_PROPERTY(superscript_symbol);
+
+//======================================================================
+// General punctuation
+//
+
+// Connector punctuation.
+DECLARE_CHAR_PROPERTY(connector_punc);
+
+// Dashes.
+DECLARE_CHAR_PROPERTY(dash_punc);
+
+// Other punctuation.
+DECLARE_CHAR_PROPERTY(other_punc);
+
+// All punctuation.
+DECLARE_CHAR_PROPERTY(punctuation);
+
+//======================================================================
+// Special symbols
+//
+
+// Currency symbols.
+DECLARE_CHAR_PROPERTY(currency_symbol);
+
+// Chinese bookquotes.
+DECLARE_CHAR_PROPERTY(open_bookquote);
+DECLARE_CHAR_PROPERTY(close_bookquote);
+
+//======================================================================
+// Separators
+//
+
+// Line separators.
+DECLARE_CHAR_PROPERTY(line_separator);
+
+// Paragraph separators.
+DECLARE_CHAR_PROPERTY(paragraph_separator);
+
+// Space separators.
+DECLARE_CHAR_PROPERTY(space_separator);
+
+// Separators -- all line, paragraph, and space separators.
+DECLARE_CHAR_PROPERTY(separator);
+
+//======================================================================
+// Alphanumeric Characters
+//
+
+// Digits.
+DECLARE_CHAR_PROPERTY(digit);
+
+// Japanese Katakana.
+DECLARE_CHAR_PROPERTY(katakana);
+
+//======================================================================
+// BiDi Directional Formatting Codes
+//
+
+// Explicit directional formatting codes (LRM, RLM, LRE, RLE, PDF, LRO, RLO)
+// used by the bidirectional algorithm.
+//
+// Note: Use this only to classify characters. To actually determine
+// directionality of BiDi text, look under i18n/bidi.
+//
+// See http://www.unicode.org/reports/tr9/ for a description of the algorithm
+// and http://www.unicode.org/charts/PDF/U2000.pdf for the character codes.
+DECLARE_CHAR_PROPERTY(directional_formatting_code);
+
+//======================================================================
+// Special collections
+//
+
+// NB: This does not check for all punctuation and symbols in the standard;
+// just those listed in our code. See the definitions in char_properties.cc.
+DECLARE_CHAR_PROPERTY(punctuation_or_symbol);
+
+}  // namespace syntaxnet
+
+#endif  // SYNTAXNET_CHAR_PROPERTIES_H_
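
Properties can also be fetched by name at runtime through the registry. A short sketch (hypothetical helper, not from this change) that enumerates every code point of a named property via the Lookup() and NextElementAfter() methods declared above:

    #include <cstdio>

    #include "syntaxnet/char_properties.h"

    // Prints each code point of the named property, e.g. PrintProperty("digit").
    void PrintProperty(const char *name) {
      const syntaxnet::CharProperty *prop = syntaxnet::CharProperty::Lookup(name);
      if (prop == nullptr) return;  // no property registered under this name
      // NextElementAfter(-1) returns the first element; -1 signals the end.
      for (int c = prop->NextElementAfter(-1); c >= 0;
           c = prop->NextElementAfter(c)) {
        std::printf("U+%04X\n", c);
      }
    }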

+ 364 - 0
syntaxnet/syntaxnet/char_properties_test.cc

@@ -0,0 +1,364 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Tests for char_properties.cc:
+//
+// (1) Test the DEFINE_CHAR_PROPERTY_AS_SET and DEFINE_CHAR_PROPERTY macros
+//     by defining a few fake char properties and verifying their contents.
+//
+// (2) Test the char properties defined in char_properties.cc by spot-checking
+//     a few chars.
+//
+
+#include "syntaxnet/char_properties.h"
+
+#include <ctype.h>  // for ispunct, isspace
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include <gmock/gmock.h>  // for ContainerEq, EXPECT_THAT
+#include "tensorflow/core/platform/test.h"
+#include "third_party/utf/utf.h"
+#include "util/utf8/unilib.h"  // for IsValidCodepoint, etc
+#include "util/utf8/unilib_utf8_utils.h"
+
+using ::testing::ContainerEq;
+
+namespace syntaxnet {
+
+// Invalid UTF-8 bytes are decoded as the Replacement Character, U+FFFD
+// (which is also Runeerror). Invalid code points are encoded in UTF-8
+// with the UTF-8 representation of the Replacement Character.
+static const char ReplacementCharacterUTF8[3] = {'\xEF', '\xBF', '\xBD'};
+
+// ====================================================================
+// CharPropertiesTest
+//
+
+class CharPropertiesTest : public testing::Test {
+ protected:
+  // Collect a set of chars.
+  void CollectChars(const std::set<char32> &chars) {
+    collected_set_.insert(chars.begin(), chars.end());
+  }
+
+  // Collect an array of chars.
+  void CollectArray(const char32 arr[], int len) {
+    collected_set_.insert(arr, arr + len);
+  }
+
+  // Collect the chars for which the named CharProperty holds.
+  void CollectCharProperty(const char *name) {
+    const CharProperty *prop = CharProperty::Lookup(name);
+    ASSERT_TRUE(prop != nullptr) << "for " << name;
+
+    for (char32 c = 0; c <= 0x10FFFF; ++c) {
+      if (UniLib::IsValidCodepoint(c) && prop->HoldsFor(c)) {
+        collected_set_.insert(c);
+      }
+    }
+  }
+
+  // Collect the chars for which an ascii predicate holds.
+  void CollectAsciiPredicate(AsciiPredicate *pred) {
+    for (char32 c = 0; c < 256; ++c) {
+      if ((*pred)(c)) {
+        collected_set_.insert(c);
+      }
+    }
+  }
+
+  // Expect the named char property to be true for precisely the chars in
+  // the collected set.
+  void ExpectCharPropertyEqualsCollectedSet(const char *name) {
+    const CharProperty *prop = CharProperty::Lookup(name);
+    ASSERT_TRUE(prop != nullptr) << "for " << name;
+
+    // Test that char property holds for all collected chars.  Exercises both
+    // signatures of CharProperty::HoldsFor().
+    for (std::set<char32>::const_iterator it = collected_set_.begin();
+         it != collected_set_.end(); ++it) {
+      // Test utf8 version of is_X().
+      const char32 c = *it;
+      string utf8_char = EncodeAsUTF8(&c, 1);
+      EXPECT_TRUE(prop->HoldsFor(utf8_char.c_str(), utf8_char.size()));
+
+      // Test ucs-2 version of is_X().
+      EXPECT_TRUE(prop->HoldsFor(static_cast<int>(c)));
+    }
+
+    // Test that the char property holds for precisely the collected chars.
+    // Somewhat redundant with previous test, but exercises
+    // CharProperty::NextElementAfter().
+    std::set<char32> actual_chars;
+    int c = -1;
+    while ((c = prop->NextElementAfter(c)) >= 0) {
+      actual_chars.insert(static_cast<char32>(c));
+    }
+    EXPECT_THAT(actual_chars, ContainerEq(collected_set_))
+        << " for " << name;
+  }
+
+  // Expect the named char property to be true for at least the chars in
+  // the collected set.
+  void ExpectCharPropertyContainsCollectedSet(const char *name) {
+    const CharProperty *prop = CharProperty::Lookup(name);
+    ASSERT_TRUE(prop != nullptr) << "for " << name;
+
+    for (std::set<char32>::const_iterator it = collected_set_.begin();
+         it != collected_set_.end(); ++it) {
+      EXPECT_TRUE(prop->HoldsFor(static_cast<int>(*it)));
+    }
+  }
+
+  string EncodeAsUTF8(const char32 *in, int size) {
+    string out;
+    out.reserve(size);
+    for (int i = 0; i < size; ++i) {
+      char buf[UTFmax];
+      int len = EncodeAsUTF8Char(*in++, buf);
+      out.append(buf, len);
+    }
+    return out;
+  }
+
+  int EncodeAsUTF8Char(char32 in, char *out) {
+    if (UniLib::IsValidCodepoint(in)) {
+      return runetochar(out, &in);
+    } else {
+      memcpy(out, ReplacementCharacterUTF8, 3);
+      return 3;
+    }
+  }
+
+ private:
+  std::set<char32> collected_set_;
+};
+
+//======================================================================
+// Declarations of the sample character sets below
+// (to test the DECLARE_CHAR_PROPERTY() macro)
+//
+
+DECLARE_CHAR_PROPERTY(test_digit);
+DECLARE_CHAR_PROPERTY(test_wavy_dash);
+DECLARE_CHAR_PROPERTY(test_digit_or_wavy_dash);
+DECLARE_CHAR_PROPERTY(test_punctuation_plus);
+
+//======================================================================
+// Definitions of sample character sets
+//
+
+// Digits.
+DEFINE_CHAR_PROPERTY_AS_SET(test_digit,
+  RANGE('0', '9'),
+)
+
+// Wavy dashes.
+DEFINE_CHAR_PROPERTY_AS_SET(test_wavy_dash,
+  '~',
+  0x301C,  // wave dash
+  0x3030,  // wavy dash
+)
+
+// Digits or wavy dashes.
+DEFINE_CHAR_PROPERTY(test_digit_or_wavy_dash, prop) {
+  prop->AddCharProperty("test_digit");
+  prop->AddCharProperty("test_wavy_dash");
+}
+
+// Punctuation plus a few extraneous chars.
+DEFINE_CHAR_PROPERTY(test_punctuation_plus, prop) {
+  prop->AddChar('a');
+  prop->AddCharRange('b', 'b');
+  prop->AddCharRange('c', 'e');
+  static const int kUnicodes[] = {'f', RANGE('g', 'i'), 'j'};
+  prop->AddCharSpec(kUnicodes, arraysize(kUnicodes));
+  prop->AddCharProperty("punctuation");
+}
+
+//====================================================================
+// Another form of the character sets above -- for verification
+//
+
+const char32 kTestDigit[] = {
+  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
+};
+
+const char32 kTestWavyDash[] = {
+  '~',
+  0x301C,  // wave dash
+  0x3030,  // wavy dash
+};
+
+const char32 kTestPunctuationPlusExtras[] = {
+  'a',
+  'b',
+  'c',
+  'd',
+  'e',
+  'f',
+  'g',
+  'h',
+  'i',
+  'j',
+};
+
+// ====================================================================
+// Tests
+//
+
+TEST_F(CharPropertiesTest, TestDigit) {
+  CollectArray(kTestDigit, arraysize(kTestDigit));
+  ExpectCharPropertyEqualsCollectedSet("test_digit");
+}
+
+TEST_F(CharPropertiesTest, TestWavyDash) {
+  CollectArray(kTestWavyDash, arraysize(kTestWavyDash));
+  ExpectCharPropertyEqualsCollectedSet("test_wavy_dash");
+}
+
+TEST_F(CharPropertiesTest, TestDigitOrWavyDash) {
+  CollectArray(kTestDigit, arraysize(kTestDigit));
+  CollectArray(kTestWavyDash, arraysize(kTestWavyDash));
+  ExpectCharPropertyEqualsCollectedSet("test_digit_or_wavy_dash");
+}
+
+TEST_F(CharPropertiesTest, TestPunctuationPlus) {
+  CollectCharProperty("punctuation");
+  CollectArray(kTestPunctuationPlusExtras,
+               arraysize(kTestPunctuationPlusExtras));
+  ExpectCharPropertyEqualsCollectedSet("test_punctuation_plus");
+}
+
+// ====================================================================
+// Spot-check predicates in char_properties.cc
+//
+
+TEST_F(CharPropertiesTest, StartSentencePunc) {
+  CollectChars({0x00A1, 0x00BF});
+  ExpectCharPropertyContainsCollectedSet("start_sentence_punc");
+}
+
+TEST_F(CharPropertiesTest, EndSentencePunc) {
+  CollectChars({'.', '!', '?'});
+  ExpectCharPropertyContainsCollectedSet("end_sentence_punc");
+}
+
+TEST_F(CharPropertiesTest, OpenExprPunc) {
+  CollectChars({'(', '['});
+  ExpectCharPropertyContainsCollectedSet("open_expr_punc");
+}
+
+TEST_F(CharPropertiesTest, CloseExprPunc) {
+  CollectChars({')', ']'});
+  ExpectCharPropertyContainsCollectedSet("close_expr_punc");
+}
+
+TEST_F(CharPropertiesTest, OpenQuote) {
+  CollectChars({'\'', '"'});
+  ExpectCharPropertyContainsCollectedSet("open_quote");
+}
+
+TEST_F(CharPropertiesTest, CloseQuote) {
+  CollectChars({'\'', '"'});
+  ExpectCharPropertyContainsCollectedSet("close_quote");
+}
+
+TEST_F(CharPropertiesTest, OpenBookquote) {
+  CollectChars({0x300A});
+  ExpectCharPropertyContainsCollectedSet("open_bookquote");
+}
+
+TEST_F(CharPropertiesTest, CloseBookquote) {
+  CollectChars({0x300B});
+  ExpectCharPropertyContainsCollectedSet("close_bookquote");
+}
+
+TEST_F(CharPropertiesTest, OpenPunc) {
+  CollectChars({'(', '['});
+  CollectChars({'\'', '"'});
+  ExpectCharPropertyContainsCollectedSet("open_punc");
+}
+
+TEST_F(CharPropertiesTest, ClosePunc) {
+  CollectChars({')', ']'});
+  CollectChars({'\'', '"'});
+  ExpectCharPropertyContainsCollectedSet("close_punc");
+}
+
+TEST_F(CharPropertiesTest, LeadingSentencePunc) {
+  CollectChars({'(', '['});
+  CollectChars({'\'', '"'});
+  CollectChars({0x00A1, 0x00BF});
+  ExpectCharPropertyContainsCollectedSet("leading_sentence_punc");
+}
+
+TEST_F(CharPropertiesTest, TrailingSentencePunc) {
+  CollectChars({')', ']'});
+  CollectChars({'\'', '"'});
+  CollectChars({'.', '!', '?'});
+  ExpectCharPropertyContainsCollectedSet("trailing_sentence_punc");
+}
+
+TEST_F(CharPropertiesTest, NoncurrencyTokenPrefixSymbol) {
+  CollectChars({'#'});
+  ExpectCharPropertyContainsCollectedSet("noncurrency_token_prefix_symbol");
+}
+
+TEST_F(CharPropertiesTest, TokenSuffixSymbol) {
+  CollectChars({'%', 0x2122, 0x00A9, 0x00B0});
+  ExpectCharPropertyContainsCollectedSet("token_suffix_symbol");
+}
+
+TEST_F(CharPropertiesTest, TokenPrefixSymbol) {
+  CollectChars({'#'});
+  CollectChars({'$', 0x00A5, 0x20AC});
+  ExpectCharPropertyContainsCollectedSet("token_prefix_symbol");
+}
+
+TEST_F(CharPropertiesTest, SubscriptSymbol) {
+  CollectChars({0x2082, 0x2083});
+  ExpectCharPropertyContainsCollectedSet("subscript_symbol");
+}
+
+TEST_F(CharPropertiesTest, SuperscriptSymbol) {
+  CollectChars({0x00B2, 0x00B3});
+  ExpectCharPropertyContainsCollectedSet("superscript_symbol");
+}
+
+TEST_F(CharPropertiesTest, CurrencySymbol) {
+  CollectChars({'$', 0x00A5, 0x20AC});
+  ExpectCharPropertyContainsCollectedSet("currency_symbol");
+}
+
+TEST_F(CharPropertiesTest, DirectionalFormattingCode) {
+  CollectChars({0x200E, 0x200F, 0x202A, 0x202B, 0x202C, 0x202D, 0x202E});
+  ExpectCharPropertyContainsCollectedSet("directional_formatting_code");
+}
+
+TEST_F(CharPropertiesTest, Punctuation) {
+  CollectAsciiPredicate(ispunct);
+  ExpectCharPropertyContainsCollectedSet("punctuation");
+}
+
+TEST_F(CharPropertiesTest, Separator) {
+  CollectAsciiPredicate(isspace);
+  ExpectCharPropertyContainsCollectedSet("separator");
+}
+
+}  // namespace syntaxnet
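
One header detail worth spelling out: RANGE() is not a function call but a four-int sequence bracketed by the sentinels kPreUnicodeRange (-1) and kPostUnicodeRange (-2). Under that expansion, the my_fave example from char_properties.h produces:

    // DEFINE_CHAR_PROPERTY_AS_SET(my_fave, 'q', RANGE('0', '9'), 0x20AB,)
    // expands its initializer list to:
    static const int k_my_fave_unicodes[] = {
        'q',
        -1 /* kPreUnicodeRange */, '0', '9', -2 /* kPostUnicodeRange */,
        0x20AB,
    };
    // A CharProperty built from this spec contains 'q', the inclusive
    // range '0'..'9', and U+20AB.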

+ 4 - 2
syntaxnet/syntaxnet/document_filters.cc

@@ -77,7 +77,8 @@ class DocumentSource : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("batch_size", &batch_size_));
     OP_REQUIRES(context, batch_size_ > 0,
                 InvalidArgument("invalid batch_size provided"));
-    corpus_.reset(new TextReader(*task_context_.GetInput(corpus_name)));
+    corpus_.reset(
+        new TextReader(*task_context_.GetInput(corpus_name), &task_context_));
   }
 
   void Compute(OpKernelContext *context) override {
@@ -124,7 +125,8 @@ class DocumentSink : public OpKernel {
     GetTaskContext(context, &task_context_);
     string corpus_name;
     OP_REQUIRES_OK(context, context->GetAttr("corpus_name", &corpus_name));
-    writer_.reset(new TextWriter(*task_context_.GetInput(corpus_name)));
+    writer_.reset(
+        new TextWriter(*task_context_.GetInput(corpus_name), &task_context_));
   }
 
   void Compute(OpKernelContext *context) override {

+ 2 - 0
syntaxnet/syntaxnet/document_format.h

@@ -38,6 +38,8 @@ class DocumentFormat : public RegisterableClass<DocumentFormat> {
   DocumentFormat() {}
   virtual ~DocumentFormat() {}
 
+  virtual void Setup(TaskContext *context) {}
+
   // Reads a record from the given input buffer with format specific logic.
   // Returns false if no record could be read because we reached end of file.
   virtual bool ReadRecord(tensorflow::io::InputBuffer *buffer,

+ 12 - 1
syntaxnet/syntaxnet/lexicon_builder.cc

@@ -19,6 +19,7 @@ limitations under the License.
 #include "syntaxnet/affix.h"
 #include "syntaxnet/dictionary.pb.h"
 #include "syntaxnet/feature_extractor.h"
+#include "syntaxnet/segmenter_utils.h"
 #include "syntaxnet/sentence.pb.h"
 #include "syntaxnet/sentence_batch.h"
 #include "syntaxnet/term_frequency_map.h"
@@ -75,6 +76,7 @@ class LexiconBuilder : public OpKernel {
     TermFrequencyMap tags;
     TermFrequencyMap categories;
     TermFrequencyMap labels;
+    TermFrequencyMap chars;
 
     // Affix tables to be populated by the corpus.
     AffixTable prefixes(AffixTable::PREFIX, max_prefix_length_);
@@ -87,7 +89,7 @@ class LexiconBuilder : public OpKernel {
     int64 num_tokens = 0;
     int64 num_documents = 0;
     Sentence *document;
-    TextReader corpus(*task_context_.GetInput(corpus_name_));
+    TextReader corpus(*task_context_.GetInput(corpus_name_), &task_context_);
     while ((document = corpus.Read()) != nullptr) {
       // Gather token information.
       for (int t = 0; t < document->token_size(); ++t) {
@@ -114,6 +116,14 @@ class LexiconBuilder : public OpKernel {
         // Add mapping from tag to category.
         tag_to_category.SetCategory(token.tag(), token.category());
 
+        // Add characters.
+        vector<tensorflow::StringPiece> char_sp;
+        SegmenterUtils::GetUTF8Chars(word, &char_sp);
+        for (const auto &c : char_sp) {
+          const string c_str = c.ToString();
+          if (!c_str.empty() && !HasSpaces(c_str)) chars.Increment(c_str);
+        }
+
         // Update the number of processed tokens.
         ++num_tokens;
       }
@@ -131,6 +141,7 @@ class LexiconBuilder : public OpKernel {
     categories.Save(
         TaskContext::InputFile(*task_context_.GetInput("category-map")));
     labels.Save(TaskContext::InputFile(*task_context_.GetInput("label-map")));
+    chars.Save(TaskContext::InputFile(*task_context_.GetInput("char-map")));
 
     // Write affixes to disk.
     WriteAffixTable(prefixes, TaskContext::InputFile(
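
The char-map collection above relies on SegmenterUtils::GetUTF8Chars, which splits a UTF-8 string into per-character StringPieces. A minimal sketch of that pattern in isolation (hypothetical wrapper mirroring the loop in this change; the signature is inferred from the call site):

    #include <string>
    #include <vector>

    #include "syntaxnet/segmenter_utils.h"
    #include "tensorflow/core/lib/core/stringpiece.h"

    // Splits `word` into single UTF-8 characters, as the char-map loop does.
    std::vector<std::string> Utf8Chars(const std::string &word) {
      std::vector<tensorflow::StringPiece> pieces;
      syntaxnet::SegmenterUtils::GetUTF8Chars(word, &pieces);
      std::vector<std::string> chars;
      for (const auto &piece : pieces) chars.push_back(piece.ToString());
      return chars;
    }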

+ 25 - 1
syntaxnet/syntaxnet/lexicon_builder_test.py

@@ -69,6 +69,8 @@ TOKENIZED_DOCS = u'''बात गलत हो तो गुस्सा से
 लेकिन अभिनेत्री के इस कदम से वहां रंग में भंग पड़ गया ।
 '''
 
+CHARS = u'''अ इ आ क ग ज ट त द न प भ ब य म र ल व ह स ि ा ु ी े ै ो ् ड़ । ं'''
+
 COMMENTS = u'# Line with fake comments.'
 
 
@@ -93,7 +95,7 @@ class LexiconBuilderTest(test_util.TensorFlowTestCase):
     self.AddInput('documents', self.corpus_file, corpus_format, context)
     for name in ('word-map', 'lcword-map', 'tag-map',
                  'category-map', 'label-map', 'prefix-table',
-                 'suffix-table', 'tag-to-category'):
+                 'suffix-table', 'tag-to-category', 'char-map'):
       self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
     logging.info('Writing context to: %s', self.context_file)
     with open(self.context_file, 'w') as f:
@@ -133,6 +135,26 @@ class LexiconBuilderTest(test_util.TensorFlowTestCase):
       self.assertIn(tag, TAGS)
       self.assertIn(category, CATEGORIES)
 
+  def LoadMap(self, map_name):
+    loaded_map = {}
+    with file(os.path.join(FLAGS.test_tmpdir, map_name), 'r') as f:
+      for line in f:
+        entries = line.strip().split(' ')
+        if len(entries) == 2:
+          loaded_map[entries[0]] = entries[1]
+    return loaded_map
+
+  def ValidateCharMap(self):
+    char_map = self.LoadMap('char-map')
+    self.assertEqual(len(char_map), len(CHARS.split(' ')))
+    for char in CHARS.split(' '):
+      self.assertIn(char.encode('utf-8'), char_map)
+
+  def ValidateWordMap(self):
+    word_map = self.LoadMap('word-map')
+    for word in filter(None, TOKENIZED_DOCS.replace('\n', ' ').split(' ')):
+      self.assertIn(word.encode('utf-8'), word_map)
+
   def BuildLexicon(self):
     with self.test_session():
       gen_parser_ops.lexicon_builder(task_context=self.context_file).run()
@@ -146,6 +168,8 @@ class LexiconBuilderTest(test_util.TensorFlowTestCase):
     self.ValidateDocuments()
     self.BuildLexicon()
     self.ValidateTagToCategoryMap()
+    self.ValidateCharMap()
+    self.ValidateWordMap()
 
   def testCoNLLFormatExtraNewlinesAndComments(self):
     self.WriteContext('conll-sentence')

+ 298 - 0
syntaxnet/syntaxnet/morpher_transitions.cc

@@ -0,0 +1,298 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Morpher transition system.
+//
+// This transition system has one type of action:
+//  - The SHIFT action pushes the next input token to the stack and
+//    advances to the next input token, assigning a morphological analysis
+//    to the token that was shifted.
+//
+// The transition system operates with parser actions encoded as integers:
+//  - A SHIFT action is encoded as a number starting from 0.
+
+#include <string>
+
+#include "syntaxnet/morphology_label_set.h"
+#include "syntaxnet/parser_features.h"
+#include "syntaxnet/parser_state.h"
+#include "syntaxnet/parser_transitions.h"
+#include "syntaxnet/sentence_features.h"
+#include "syntaxnet/shared_store.h"
+#include "syntaxnet/task_context.h"
+#include "syntaxnet/term_frequency_map.h"
+#include "syntaxnet/utils.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace syntaxnet {
+
+class MorphologyTransitionState : public ParserTransitionState {
+ public:
+  explicit MorphologyTransitionState(const MorphologyLabelSet *label_set)
+      : label_set_(label_set) {}
+
+  explicit MorphologyTransitionState(const MorphologyTransitionState *state)
+      : MorphologyTransitionState(state->label_set_) {
+    tag_ = state->tag_;
+    gold_tag_ = state->gold_tag_;
+  }
+
+  // Clones the transition state by returning a new object.
+  ParserTransitionState *Clone() const override {
+    return new MorphologyTransitionState(this);
+  }
+
+  // Reads gold tags for each token.
+  void Init(ParserState *state) override {
+    tag_.resize(state->sentence().token_size(), -1);
+    gold_tag_.resize(state->sentence().token_size(), -1);
+    for (int pos = 0; pos < state->sentence().token_size(); ++pos) {
+      const Token &token = state->GetToken(pos);
+
+      // NOTE: we allow a token to have no TokenMorphology extension, or a
+      // TokenMorphology that is absent from label_set_, because this can
+      // happen at test time.
+      gold_tag_[pos] = label_set_->LookupExisting(
+          token.GetExtension(TokenMorphology::morphology));
+    }
+  }
+
+  // Returns the tag assigned to a given token.
+  int Tag(int index) const {
+    DCHECK_GE(index, -1);
+    DCHECK_LT(index, tag_.size());
+    return index == -1 ? -1 : tag_[index];
+  }
+
+  // Sets this tag on the token at index.
+  void SetTag(int index, int tag) {
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, tag_.size());
+    tag_[index] = tag;
+  }
+
+  // Returns the gold tag for a given token.
+  int GoldTag(int index) const {
+    DCHECK_GE(index, -1);
+    DCHECK_LT(index, gold_tag_.size());
+    return index == -1 ? -1 : gold_tag_[index];
+  }
+
+  // Returns the proto corresponding to the tag, or an empty proto if the tag is
+  // not found.
+  const TokenMorphology &TagAsProto(int tag) const {
+    if (tag >= 0 && tag < label_set_->Size()) {
+      return label_set_->Lookup(tag);
+    }
+    return TokenMorphology::default_instance();
+  }
+
+  // Adds transition state specific annotations to the document.
+  void AddParseToDocument(const ParserState &state, bool rewrite_root_labels,
+                          Sentence *sentence) const override {
+    for (int i = 0; i < tag_.size(); ++i) {
+      Token *token = sentence->mutable_token(i);
+      *token->MutableExtension(TokenMorphology::morphology) =
+          TagAsProto(Tag(i));
+    }
+  }
+
+  // Whether a parsed token should be considered correct for evaluation.
+  bool IsTokenCorrect(const ParserState &state, int index) const override {
+    return GoldTag(index) == Tag(index);
+  }
+
+  // Returns a human readable string representation of this state.
+  string ToString(const ParserState &state) const override {
+    string str;
+    for (int i = state.StackSize(); i > 0; --i) {
+      const string &word = state.GetToken(state.Stack(i - 1)).word();
+      if (i != state.StackSize() - 1) str.append(" ");
+      tensorflow::strings::StrAppend(
+          &str, word, "[",
+          TagAsProto(Tag(state.StackSize() - i)).ShortDebugString(), "]");
+    }
+    for (int i = state.Next(); i < state.NumTokens(); ++i) {
+      tensorflow::strings::StrAppend(&str, " ", state.GetToken(i).word());
+    }
+    return str;
+  }
+
+ private:
+  // Currently assigned morphological analysis for each token in this sentence.
+  vector<int> tag_;
+
+  // Gold morphological analysis from the input document.
+  vector<int> gold_tag_;
+
+  // Label set used for conversions between integer and proto representations
+  // of morphological analyses. Not owned.
+  const MorphologyLabelSet *label_set_ = nullptr;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(MorphologyTransitionState);
+};
+
+class MorphologyTransitionSystem : public ParserTransitionSystem {
+ public:
+  ~MorphologyTransitionSystem() override { SharedStore::Release(label_set_); }
+
+  // Determines the location of the morphology label set.
+  void Setup(TaskContext *context) override {
+    context->GetInput("morph-label-set");
+  }
+
+  // Reads the morphology label set.
+  void Init(TaskContext *context) override {
+    const string fname =
+        TaskContext::InputFile(*context->GetInput("morph-label-set"));
+    label_set_ =
+        SharedStoreUtils::GetWithDefaultName<MorphologyLabelSet>(fname);
+  }
+
+  // A SHIFT action is encoded as the tag value itself.
+  static ParserAction ShiftAction(int tag) { return tag; }
+
+  // The morpher transition system doesn't look at the dependency tree, so it
+  // allows non-projective trees.
+  bool AllowsNonProjective() const override { return true; }
+
+  // Returns the number of action types.
+  int NumActionTypes() const override { return 1; }
+
+  // Returns the number of possible actions.
+  int NumActions(int num_labels) const override { return label_set_->Size(); }
+
+  // The default action for a given state is assigning the label with index 0.
+  ParserAction GetDefaultAction(const ParserState &state) const override {
+    return ShiftAction(0);
+  }
+
+  // Returns the next gold action for a given state according to the
+  // underlying annotated sentence.
+  ParserAction GetNextGoldAction(const ParserState &state) const override {
+    if (!state.EndOfInput()) {
+      return ShiftAction(TransitionState(state).GoldTag(state.Next()));
+    }
+    return ShiftAction(0);
+  }
+
+  // Checks if the action is allowed in a given parser state.
+  bool IsAllowedAction(ParserAction action,
+                       const ParserState &state) const override {
+    return !state.EndOfInput();
+  }
+
+  // Makes a shift by pushing the next input token on the stack and moving to
+  // the next position.
+  void PerformActionWithoutHistory(ParserAction action,
+                                   ParserState *state) const override {
+    DCHECK(!state->EndOfInput());
+    if (!state->EndOfInput()) {
+      MutableTransitionState(state)->SetTag(state->Next(), action);
+      state->Push(state->Next());
+      state->Advance();
+    }
+  }
+
+  // We are in a final state when we have reached the end of the input.
+  bool IsFinalState(const ParserState &state) const override {
+    return state.EndOfInput();
+  }
+
+  // Returns a string representation of a parser action.
+  string ActionAsString(ParserAction action,
+                        const ParserState &state) const override {
+    return tensorflow::strings::StrCat(
+        "SHIFT(", label_set_->Lookup(action).ShortDebugString(), ")");
+  }
+
+  // No state is deterministic in this transition system.
+  bool IsDeterministicState(const ParserState &state) const override {
+    return false;
+  }
+
+  // Returns a new transition state to be used to enhance the parser state.
+  ParserTransitionState *NewTransitionState(bool training_mode) const override {
+    return new MorphologyTransitionState(label_set_);
+  }
+
+  // Downcasts the const ParserTransitionState in ParserState to a const
+  // MorphologyTransitionState.
+  static const MorphologyTransitionState &TransitionState(
+      const ParserState &state) {
+    return *static_cast<const MorphologyTransitionState *>(
+        state.transition_state());
+  }
+
+  // Downcasts the ParserTransitionState in ParserState to a
+  // MorphologyTransitionState.
+  static MorphologyTransitionState *MutableTransitionState(ParserState *state) {
+    return static_cast<MorphologyTransitionState *>(
+        state->mutable_transition_state());
+  }
+
+  // Input for the morphology label set. Not owned.
+  TaskInput *input_label_set_ = nullptr;
+
+  // Label set used for conversions between integer and string representations
+  // of morphology labels. Owned through SharedStore.
+  const MorphologyLabelSet *label_set_;
+};
+
+REGISTER_TRANSITION_SYSTEM("morpher", MorphologyTransitionSystem);
+
+// Feature function for retrieving the tag assigned to a token by the morpher
+// transition system.
+class PredictedMorphTagFeatureFunction : public ParserIndexFeatureFunction {
+ public:
+  PredictedMorphTagFeatureFunction() {}
+
+  // Determines the location of the morphology label set.
+  void Setup(TaskContext *context) override {
+    context->GetInput("morph-label-set", "recordio", "token-morphology");
+  }
+
+  // Reads the morphology label set.
+  void Init(TaskContext *context) override {
+    const string fname =
+        TaskContext::InputFile(*context->GetInput("morph-label-set"));
+    label_set_ = SharedStore::Get<MorphologyLabelSet>(fname, fname);
+    set_feature_type(new FullLabelFeatureType(name(), label_set_));
+  }
+
+  // Gets the MorphologyTransitionState from the parser state and reads the
+  // assigned tag at the focus index. Returns -1 if the focus is not within
+  // the sentence.
+  FeatureValue Compute(const WorkspaceSet &workspaces, const ParserState &state,
+                       int focus, const FeatureVector *result) const override {
+    if (focus < 0 || focus >= state.sentence().token_size()) return -1;
+    return static_cast<const MorphologyTransitionState *>(
+               state.transition_state())
+        ->Tag(focus);
+  }
+
+ private:
+  // Label set used for conversions between integer and string representations
+  // of morphology labels. Owned through SharedStore.
+  const MorphologyLabelSet *label_set_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(PredictedMorphTagFeatureFunction);
+};
+
+REGISTER_PARSER_IDX_FEATURE_FUNCTION("pred-morph-tag",
+                                     PredictedMorphTagFeatureFunction);
+
+}  // namespace syntaxnet
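
Because a SHIFT action is just the label-set index, decoding an action back into its morphological annotation is a single Lookup(). A sketch (hypothetical helper, not part of this change, assuming a loaded MorphologyLabelSet):

    #include "syntaxnet/morphology_label_set.h"

    // Valid morpher actions are 0 .. labels.Size() - 1; each action is the
    // index of the TokenMorphology it assigns to the shifted token.
    const syntaxnet::TokenMorphology &DecodeMorpherAction(
        const syntaxnet::MorphologyLabelSet &labels, int action) {
      return labels.Lookup(action);
    }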

+ 91 - 0
syntaxnet/syntaxnet/morphology_label_set.cc

@@ -0,0 +1,91 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/morphology_label_set.h"
+
+namespace syntaxnet {
+
+const char MorphologyLabelSet::kSeparator[] = "\t";
+
+int MorphologyLabelSet::Add(const TokenMorphology &morph) {
+  string repr = StringForMatch(morph);
+  auto it = fast_lookup_.find(repr);
+  if (it != fast_lookup_.end()) return it->second;
+  fast_lookup_[repr] = label_set_.size();
+  label_set_.push_back(morph);
+  return label_set_.size() - 1;
+}
+
+// Look up an existing TokenMorphology.  If it is not present, return -1.
+int MorphologyLabelSet::LookupExisting(const TokenMorphology &morph) const {
+  string repr = StringForMatch(morph);
+  auto it = fast_lookup_.find(repr);
+  if (it != fast_lookup_.end()) return it->second;
+  return -1;
+}
+
+// Return the TokenMorphology at position i.  The input i should be in the
+// range 0..Size()-1.
+const TokenMorphology &MorphologyLabelSet::Lookup(int i) const {
+  CHECK_GE(i, 0);
+  CHECK_LT(i, label_set_.size());
+  return label_set_[i];
+}
+
+void MorphologyLabelSet::Read(const string &filename) {
+  ProtoRecordReader reader(filename);
+  Read(&reader);
+}
+
+void MorphologyLabelSet::Read(ProtoRecordReader *reader) {
+  TokenMorphology morph;
+  while (reader->Read(&morph).ok()) {
+    CHECK_EQ(-1, LookupExisting(morph));
+    Add(morph);
+  }
+}
+
+void MorphologyLabelSet::Write(const string &filename) const {
+  ProtoRecordWriter writer(filename);
+  Write(&writer);
+}
+
+void MorphologyLabelSet::Write(ProtoRecordWriter *writer) const {
+  for (const TokenMorphology &morph : label_set_) {
+    writer->Write(morph);
+  }
+}
+
+string MorphologyLabelSet::StringForMatch(const TokenMorphology &morph) const {
+  vector<string> attributes;
+  for (const auto &a : morph.attribute()) {
+    attributes.push_back(
+        tensorflow::strings::StrCat(a.name(), kSeparator, a.value()));
+  }
+  std::sort(attributes.begin(), attributes.end());
+  return utils::Join(attributes, kSeparator);
+}
+
+string FullLabelFeatureType::GetFeatureValueName(FeatureValue value) const {
+  const TokenMorphology &morph = label_set_->Lookup(value);
+  vector<string> attributes;
+  for (const auto &a : morph.attribute()) {
+    attributes.push_back(tensorflow::strings::StrCat(a.name(), ":", a.value()));
+  }
+  std::sort(attributes.begin(), attributes.end());
+  return utils::Join(attributes, ",");
+}
+
+}  // namespace syntaxnet

+ 110 - 0
syntaxnet/syntaxnet/morphology_label_set.h

@@ -0,0 +1,110 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// A class to store the set of possible TokenMorphology objects.  This includes
+// lookup, iteration and serialization.
+
+#ifndef SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
+#define SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
+
+#include <unordered_map>
+#include <string>
+#include <vector>
+
+#include "syntaxnet/proto_io.h"
+#include "syntaxnet/sentence.pb.h"
+
+namespace syntaxnet {
+
+class MorphologyLabelSet {
+ public:
+  // Initializes as an empty label set.
+  MorphologyLabelSet() {}
+
+  // Initializes by reading the given file, which has been saved by Write().
+  // This makes using the shared store easier.
+  explicit MorphologyLabelSet(const string &fname) { Read(fname); }
+
+  // Adds a TokenMorphology to the set if it is not present. In any case, return
+  // its position in the list. Note: This is slow, and should not be called
+  // outside of training or init.
+  int Add(const TokenMorphology &morph);
+
+  // Look up an existing TokenMorphology. If it is not present, return -1.
+  // Note: This is slow, and should not be called outside of training workflow
+  // or init.
+  int LookupExisting(const TokenMorphology &morph) const;
+
+  // Return the TokenMorphology at position i. The input i should be in the
+  // range 0..Size()-1. Note: this will be called at inference time and needs
+  // to be kept fast.
+  const TokenMorphology &Lookup(int i) const;
+
+  // Return the number of elements.
+  int Size() const { return label_set_.size(); }
+
+  // Deserialization and serialization.
+  void Read(const string &filename);
+  void Write(const string &filename) const;
+
+ private:
+  string StringForMatch(const TokenMorphology &morph) const;
+
+  // Deserialization and serialization implementation.
+  void Read(ProtoRecordReader *reader);
+  void Write(ProtoRecordWriter *writer) const;
+
+  // List of all possible annotations.  This is a unique list, where equality is
+  // defined as follows:
+  //
+  //   a == b iff the set of attribute pairs (attribute, value) is identical.
+  vector<TokenMorphology> label_set_;
+
+  // Because protocol buffer equality is complicated, we implement our own
+  // equality operator based on strings. This unordered_map allows us to do the
+  // lookup more quickly.
+  unordered_map<string, int> fast_lookup_;
+
+  // A separator string that should not occur in any of the attribute names.
+  // This should never be serialized, so that it can be changed in the code if
+  // we change attribute names and it occurs in the new names.
+  static const char kSeparator[];
+};
+
+// A feature type with one value for each complete morphological analysis
+// (analogous to the fulltag analyzer).
+class FullLabelFeatureType : public FeatureType {
+ public:
+  FullLabelFeatureType(const string &name, const MorphologyLabelSet *label_set)
+      : FeatureType(name), label_set_(label_set) {}
+
+  ~FullLabelFeatureType() override {}
+
+  // Converts a feature value to a name.  We don't use StringForMatch, since the
+  // goal of these names is to be readable, even if they might occasionally be
+  // non-unique.
+  string GetFeatureValueName(FeatureValue value) const override;
+
+  // Returns the size of the feature values domain.
+  FeatureValue GetDomainSize() const override { return label_set_->Size(); }
+
+ private:
+  // Not owned.
+  const MorphologyLabelSet *label_set_ = nullptr;
+};
+
+}  // namespace syntaxnet
+
+#endif  // SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
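
A compact round-trip sketch of the class (hypothetical helper, not part of this change; mirrors how the morpher transition system loads a serialized set through the deserializing constructor):

    #include <cassert>
    #include <string>

    #include "syntaxnet/morphology_label_set.h"

    // Round-trips a label set through Write() and the deserializing constructor.
    void RoundTrip(const syntaxnet::TokenMorphology &morph,
                   const std::string &fname) {
      syntaxnet::MorphologyLabelSet labels;
      const int id = labels.Add(morph);  // re-adding later returns the same id
      labels.Write(fname);

      syntaxnet::MorphologyLabelSet loaded(fname);
      // The loaded set preserves both content and indices.
      assert(loaded.LookupExisting(morph) == id);
      assert(loaded.Size() == labels.Size());
    }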

+ 101 - 0
syntaxnet/syntaxnet/morphology_label_set_test.cc

@@ -0,0 +1,101 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/morphology_label_set.h"
+#include "syntaxnet/sentence.pb.h"
+#include <gmock/gmock.h>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace syntaxnet {
+
+class MorphologyLabelSetTest : public ::testing::Test {
+ protected:
+  MorphologyLabelSet label_set_;
+};
+
+// Test that Add and LookupExisting work as expected.
+TEST_F(MorphologyLabelSetTest, AddLookupExisting) {
+  TokenMorphology si1, si2;  // singular, imperative
+  TokenMorphology pi;        // plural, imperative
+  TokenMorphology six;       // plural, imperative, with an extra attribute
+  TextFormat::ParseFromString(R"(
+      attribute {name: "Number" value: "Singular"}
+      attribute {name: "POS" value: "IMP"})",
+                                      &si1);
+  TextFormat::ParseFromString(R"(
+      attribute {name: "POS" value: "IMP"}
+      attribute {name: "Number" value: "Singular"})",
+                                      &si2);
+  TextFormat::ParseFromString(R"(
+      attribute {name: "Number" value: "Plural"}
+      attribute {name: "POS" value: "IMP"})",
+                                      &pi);
+  TextFormat::ParseFromString(R"(
+      attribute {name: "Number" value: "Plural"}
+      attribute {name: "POS" value: "IMP"}
+      attribute {name: "x" value: "x"})",
+                                      &six);
+
+  // Check Lookup existing returns -1 for non-existing entries.
+  EXPECT_EQ(-1, label_set_.LookupExisting(si1));
+  EXPECT_EQ(-1, label_set_.LookupExisting(si2));
+  EXPECT_EQ(0, label_set_.Size());
+
+  // Check that adding returns 0 (this is the only possibility given Size()).
+  EXPECT_EQ(0, label_set_.Add(si1));
+  EXPECT_EQ(0, label_set_.Add(si1));  // calling Add twice adds only once
+  EXPECT_EQ(1, label_set_.Size());
+
+  // Check that order of attributes does not matter.
+  EXPECT_EQ(0, label_set_.LookupExisting(si2));
+
+  // Check that un-added entries still are not present.
+  EXPECT_EQ(-1, label_set_.LookupExisting(pi));
+  EXPECT_EQ(-1, label_set_.LookupExisting(six));
+
+  // Check that we can add them.
+  EXPECT_EQ(1, label_set_.Add(pi));
+  EXPECT_EQ(2, label_set_.Add(six));
+  EXPECT_EQ(3, label_set_.Size());
+}
+
+// Test write and deserializing constructor.
+TEST_F(MorphologyLabelSetTest, Serialization) {
+  TokenMorphology si;  // singular, imperative
+  TokenMorphology pi;  // plural, imperative
+  TextFormat::ParseFromString(R"(
+      attribute {name: "Number" value: "Singular"}
+      attribute {name: "POS" value: "IMP"})",
+                                      &si);
+  TextFormat::ParseFromString(R"(
+      attribute {name: "Number" value: "Plural"}
+      attribute {name: "POS" value: "IMP"})",
+                                      &pi);
+  EXPECT_EQ(0, label_set_.Add(si));
+  EXPECT_EQ(1, label_set_.Add(pi));
+
+  // Serialize and deserialize.
+  string fname = utils::JoinPath({tensorflow::testing::TmpDir(), "label-set"});
+  label_set_.Write(fname);
+  MorphologyLabelSet label_set2(fname);
+  EXPECT_EQ(0, label_set2.LookupExisting(si));
+  EXPECT_EQ(1, label_set2.LookupExisting(pi));
+  EXPECT_EQ(2, label_set2.Size());
+}
+
+}  // namespace syntaxnet

+ 0 - 1
syntaxnet/syntaxnet/parser_eval.py

@@ -22,7 +22,6 @@ import time
 
 import tensorflow as tf
 
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from syntaxnet import sentence_pb2
 from syntaxnet import graph_builder

+ 18 - 0
syntaxnet/syntaxnet/parser_features.cc

@@ -166,6 +166,9 @@ REGISTER_PARSER_IDX_FEATURE_FUNCTION("label", LabelFeatureFunction);
 typedef BasicParserSentenceFeatureFunction<Word> WordFeatureFunction;
 REGISTER_PARSER_IDX_FEATURE_FUNCTION("word", WordFeatureFunction);
 
+typedef BasicParserSentenceFeatureFunction<Char> CharFeatureFunction;
+REGISTER_PARSER_IDX_FEATURE_FUNCTION("char", CharFeatureFunction);
+
 typedef BasicParserSentenceFeatureFunction<Tag> TagFeatureFunction;
 REGISTER_PARSER_IDX_FEATURE_FUNCTION("tag", TagFeatureFunction);
 
@@ -175,6 +178,21 @@ REGISTER_PARSER_IDX_FEATURE_FUNCTION("digit", DigitFeatureFunction);
 typedef BasicParserSentenceFeatureFunction<Hyphen> HyphenFeatureFunction;
 REGISTER_PARSER_IDX_FEATURE_FUNCTION("hyphen", HyphenFeatureFunction);
 
+typedef BasicParserSentenceFeatureFunction<Capitalization>
+    CapitalizationFeatureFunction;
+REGISTER_PARSER_IDX_FEATURE_FUNCTION("capitalization",
+                                     CapitalizationFeatureFunction);
+
+typedef BasicParserSentenceFeatureFunction<PunctuationAmount>
+    PunctuationAmountFeatureFunction;
+REGISTER_PARSER_IDX_FEATURE_FUNCTION("punctuation-amount",
+                                     PunctuationAmountFeatureFunction);
+
+typedef BasicParserSentenceFeatureFunction<Quote>
+    QuoteFeatureFunction;
+REGISTER_PARSER_IDX_FEATURE_FUNCTION("quote",
+                                     QuoteFeatureFunction);
+
 typedef BasicParserSentenceFeatureFunction<PrefixFeature> PrefixFeatureFunction;
 REGISTER_PARSER_IDX_FEATURE_FUNCTION("prefix", PrefixFeatureFunction);
 

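The registrations above make the new features addressable by name from parser feature specs. For illustration only, using hypothetical locator syntax modeled on the existing "word" and "tag" features, a spec fragment could read:

    input.capitalization input(1).punctuation-amount stack.quote
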
+ 4 - 2
syntaxnet/syntaxnet/proto_io.h

@@ -144,7 +144,7 @@ class StdIn : public tensorflow::RandomAccessFile {
 // Reads sentence protos from a text file.
 class TextReader {
  public:
-  explicit TextReader(const TaskInput &input) {
+  explicit TextReader(const TaskInput &input, TaskContext *context) {
     CHECK_EQ(input.record_format_size(), 1)
         << "TextReader only supports inputs with one record format: "
         << input.DebugString();
@@ -153,6 +153,7 @@ class TextReader {
         << input.DebugString();
     filename_ = TaskContext::InputFile(input);
     format_.reset(DocumentFormat::Create(input.record_format(0)));
+    format_->Setup(context);
     Reset();
   }
 
@@ -202,7 +203,7 @@ class TextReader {
 // Writes sentence protos to a text conll file.
 class TextWriter {
  public:
-  explicit TextWriter(const TaskInput &input) {
+  explicit TextWriter(const TaskInput &input, TaskContext *context) {
     CHECK_EQ(input.record_format_size(), 1)
         << "TextWriter only supports files with one record format: "
         << input.DebugString();
@@ -211,6 +212,7 @@ class TextWriter {
         << input.DebugString();
     filename_ = TaskContext::InputFile(input);
     format_.reset(DocumentFormat::Create(input.record_format(0)));
+    format_->Setup(context);
     if (filename_ != "-") {
       TF_CHECK_OK(
           tensorflow::Env::Default()->NewWritableFile(filename_, &file_));

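The new TaskContext argument lets the record format read task parameters in Setup() before any records are processed. A minimal sketch of the new call pattern, assuming a configured context; the input name is hypothetical, and Read() (declared in the unchanged part of proto_io.h) is assumed to return nullptr at end of input:

    #include "syntaxnet/proto_io.h"
    #include "syntaxnet/task_context.h"

    syntaxnet::Sentence *ReadFirstSentence(syntaxnet::TaskContext *context) {
      syntaxnet::TextReader reader(*context->GetInput("training-corpus"),
                                   context);
      return reader.Read();  // nullptr once the input is exhausted.
    }
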
+ 85 - 0
syntaxnet/syntaxnet/segmenter_utils.cc

@@ -0,0 +1,85 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/segmenter_utils.h"
+#include "util/utf8/unicodetext.h"
+#include "util/utf8/unilib.h"
+#include "util/utf8/unilib_utf8_utils.h"
+
+namespace syntaxnet {
+
+// Separators: Unicode general category Zs, from http://www.unicode.org/Public/UNIDATA/PropList.txt
+// NB: This list is not necessarily exhaustive.
+const std::unordered_set<int> SegmenterUtils::kBreakChars({
+  0x2028,  // line separator
+  0x2029,  // paragraph separator
+  0x0020,  // space
+  0x00a0,  // no-break space
+  0x1680,  // Ogham space mark
+  0x180e,  // Mongolian vowel separator
+  0x202f,  // narrow no-break space
+  0x205f,  // medium mathematical space
+  0x3000,  // ideographic space
+  0xe5e5,  // Google addition
+  0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008,
+  0x2009, 0x200a
+});
+
+void SegmenterUtils::GetUTF8Chars(const string &text,
+                                  vector<tensorflow::StringPiece> *chars) {
+  const char *start = text.c_str();
+  const char *end = text.c_str() + text.size();
+  while (start < end) {
+    int char_length = UniLib::OneCharLen(start);
+    chars->emplace_back(start, char_length);
+    start += char_length;
+  }
+}
+
+void SegmenterUtils::SetCharsAsTokens(
+    const string &text,
+    const vector<tensorflow::StringPiece> &chars,
+    Sentence *sentence) {
+  sentence->clear_token();
+  sentence->set_text(text);
+  for (int i = 0; i < chars.size(); ++i) {
+    Token *tok = sentence->add_token();
+    tok->set_word(chars[i].ToString());  // NOLINT
+    int start_byte, end_byte;
+    GetCharStartEndBytes(text, chars[i], &start_byte, &end_byte);
+    tok->set_start(start_byte);
+    tok->set_end(end_byte);
+  }
+}
+
+bool SegmenterUtils::IsValidSegment(const Sentence &sentence,
+                                    const Token &token) {
+  // Check that the token is not empty, both by string and by bytes.
+  if (token.word().empty()) return false;
+  if (token.start() > token.end()) return false;
+
+  // Check that the token boundaries lie inside the text.
+  if (token.start() < 0) return false;
+  if (token.end() >= sentence.text().size()) return false;
+
+  // Check that the token boundaries align with UTF-8 character boundaries.
+  const char s = sentence.text()[token.start()];
+  const char e = sentence.text()[token.end() + 1];
+  if (UniLib::IsTrailByte(s)) return false;
+  if (UniLib::IsTrailByte(e)) return false;
+  return true;
+}
+
+}  // namespace syntaxnet

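A short sketch of how the two helpers above compose; the wrapper function is illustrative only:

    #include <string>
    #include <vector>

    #include "syntaxnet/segmenter_utils.h"
    #include "syntaxnet/sentence.pb.h"

    syntaxnet::Sentence CharTokens(const std::string &text) {
      std::vector<tensorflow::StringPiece> chars;
      syntaxnet::SegmenterUtils::GetUTF8Chars(text, &chars);
      syntaxnet::Sentence sentence;
      // One token per UTF-8 character, with start/end byte offsets filled in.
      syntaxnet::SegmenterUtils::SetCharsAsTokens(text, chars, &sentence);
      return sentence;
    }
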
+ 93 - 0
syntaxnet/syntaxnet/segmenter_utils.h

@@ -0,0 +1,93 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef SYNTAXNET_SEGMENTER_UTILS_H_
+#define SYNTAXNET_SEGMENTER_UTILS_H_
+
+#include <string>
+#include <vector>
+#include <unordered_set>
+
+#include "syntaxnet/sentence.pb.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "util/utf8/unicodetext.h"
+
+namespace syntaxnet {
+
+// A set of common convenience functions.
+class SegmenterUtils {
+ public:
+  // Takes text and converts it into a vector where each element is a UTF-8
+  // character.
+  static void GetUTF8Chars(const string &text,
+                           vector<tensorflow::StringPiece> *chars);
+
+  // Sets tokens in the sentence so that each token is a single character.
+  // Assigns the start/end byte offsets.
+  //
+  // Any existing tokens in the sentence are cleared first.
+  static void SetCharsAsTokens(const string &text,
+                               const vector<tensorflow::StringPiece> &chars,
+                               Sentence *sentence);
+
+  // Returns true for UTF-8 characters that cannot be 'real' tokens. This is
+  // defined as any whitespace, line break or paragraph break.
+  static bool IsBreakChar(const string &word) {
+    if (word == "\n" || word == "\t") return true;
+    UnicodeText text;
+    text.PointToUTF8(word.c_str(), word.length());
+    CHECK_EQ(text.size(), 1);
+    return kBreakChars.find(*text.begin()) != kBreakChars.end();
+  }
+
+  // Returns the break level for the next token based on the current character.
+  static Token::BreakLevel BreakLevel(const string &word) {
+    UnicodeText text;
+    text.PointToUTF8(word.c_str(), word.length());
+    auto point = *text.begin();
+    if (word == "\n" || point == kLineSeparator) {
+      return Token::LINE_BREAK;
+    } else if (point == kParagraphSeparator) {
+      return Token::SENTENCE_BREAK;  // No PARAGRAPH_BREAK in sentence proto.
+    } else if (word == "\t" || kBreakChars.find(point) != kBreakChars.end()) {
+      return Token::SPACE_BREAK;
+    }
+    return Token::NO_BREAK;
+  }
+
+  // Convenience function for computing start/end byte offsets of a character
+  // StringPiece relative to original text.
+  static void GetCharStartEndBytes(const string &text,
+                                   tensorflow::StringPiece c,
+                                   int *start,
+                                   int *end) {
+    *start = c.data() - text.data();
+    *end = *start + c.size() - 1;
+  }
+
+  // Returns true if this segment is a valid segment. Currently checks that:
+  // 1) it is non-empty, 2) its boundaries lie within the sentence text, and
+  // 3) its boundaries align with UTF-8 character boundaries.
+  static bool IsValidSegment(const Sentence &sentence, const Token &token);
+
+  // Set of UTF-8 break characters.
+  static const std::unordered_set<int> kBreakChars;
+  static const int kLineSeparator = 0x2028;
+  static const int kParagraphSeparator = 0x2029;
+};
+
+}  // namespace syntaxnet
+
+#endif  // SYNTAXNET_SEGMENTER_UTILS_H_

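As a quick check on the inline helpers above, the following expectations hold (a sketch, using the CHECK macros that appear elsewhere in the codebase):

    CHECK(syntaxnet::SegmenterUtils::IsBreakChar(" "));  // 0x0020 is in kBreakChars.
    CHECK_EQ(syntaxnet::Token::LINE_BREAK,
             syntaxnet::SegmenterUtils::BreakLevel("\n"));
    CHECK_EQ(syntaxnet::Token::SPACE_BREAK,
             syntaxnet::SegmenterUtils::BreakLevel("\t"));
    CHECK_EQ(syntaxnet::Token::NO_BREAK,
             syntaxnet::SegmenterUtils::BreakLevel("a"));
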
+ 149 - 0
syntaxnet/syntaxnet/segmenter_utils_test.cc

@@ -0,0 +1,149 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/segmenter_utils.h"
+
+#include <string>
+#include <vector>
+
+#include "syntaxnet/char_properties.h"
+#include "syntaxnet/sentence.pb.h"
+#include <gmock/gmock.h>
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace syntaxnet {
+
+// Creates a Korean sentence and also initializes the token field.
+static Sentence GetKoSentence() {
+  Sentence sentence;
+
+  string text = "서울시는 2012년부터";
+
+  // Add tokens.
+  sentence.set_text(text);
+  Token *tok = sentence.add_token();
+  tok->set_word("서울시");
+  tok->set_start(0);
+  tok->set_end(8);
+  tok = sentence.add_token();
+  tok->set_word("는");
+  tok->set_start(9);
+  tok->set_end(11);
+  tok = sentence.add_token();
+  tok->set_word("2012");
+  tok->set_start(13);
+  tok->set_end(16);
+  tok = sentence.add_token();
+  tok->set_word("년");
+  tok->set_start(17);
+  tok->set_end(19);
+  tok = sentence.add_token();
+  tok->set_word("부터");
+  tok->set_start(20);
+  tok->set_end(25);
+
+  return sentence;
+}
+
+// Gets the start/end bytes of the given chars in the given text.
+static void GetStartEndBytes(const string &text,
+                             const vector<tensorflow::StringPiece> &chars,
+                             vector<int> *starts,
+                             vector<int> *ends) {
+  SegmenterUtils segment_utils;
+  for (const tensorflow::StringPiece &c : chars) {
+    int start; int end;
+    segment_utils.GetCharStartEndBytes(text, c, &start, &end);
+    starts->push_back(start);
+    ends->push_back(end);
+  }
+}
+
+// Test the GetChars function.
+TEST(SegmenterUtilsTest, GetCharsTest) {
+  // Create test sentence.
+  const Sentence sentence = GetKoSentence();
+  vector<tensorflow::StringPiece> chars;
+  SegmenterUtils::GetUTF8Chars(sentence.text(), &chars);
+
+  // Check the number of characters is correct.
+  CHECK_EQ(chars.size(), 12);
+
+  vector<int> starts;
+  vector<int> ends;
+  GetStartEndBytes(sentence.text(), chars, &starts, &ends);
+
+  // Check start positions.
+  CHECK_EQ(starts[0], 0);
+  CHECK_EQ(starts[1], 3);
+  CHECK_EQ(starts[2], 6);
+  CHECK_EQ(starts[3], 9);
+  CHECK_EQ(starts[4], 12);
+  CHECK_EQ(starts[5], 13);
+  CHECK_EQ(starts[6], 14);
+  CHECK_EQ(starts[7], 15);
+  CHECK_EQ(starts[8], 16);
+  CHECK_EQ(starts[9], 17);
+  CHECK_EQ(starts[10], 20);
+  CHECK_EQ(starts[11], 23);
+
+  // Check end positions.
+  CHECK_EQ(ends[0], 2);
+  CHECK_EQ(ends[1], 5);
+  CHECK_EQ(ends[2], 8);
+  CHECK_EQ(ends[3], 11);
+  CHECK_EQ(ends[4], 12);
+  CHECK_EQ(ends[5], 13);
+  CHECK_EQ(ends[6], 14);
+  CHECK_EQ(ends[7], 15);
+  CHECK_EQ(ends[8], 16);
+  CHECK_EQ(ends[9], 19);
+  CHECK_EQ(ends[10], 22);
+  CHECK_EQ(ends[11], 25);
+}
+
+// Test the SetCharsAsTokens function.
+TEST(SegmenterUtilsTest, SetCharsAsTokensTest) {
+  // Create test sentence.
+  const Sentence sentence = GetKoSentence();
+  vector<tensorflow::StringPiece> chars;
+  SegmenterUtils segment_utils;
+  segment_utils.GetUTF8Chars(sentence.text(), &chars);
+
+  vector<int> starts;
+  vector<int> ends;
+  GetStartEndBytes(sentence.text(), chars, &starts, &ends);
+
+  // Check that the new docs word, start and end positions are properly set.
+  Sentence new_sentence;
+  segment_utils.SetCharsAsTokens(sentence.text(), chars, &new_sentence);
+  CHECK_EQ(new_sentence.token_size(), chars.size());
+  for (int t = 0; t < new_sentence.token_size(); ++t) {
+    CHECK_EQ(new_sentence.token(t).word(), chars[t]);
+    CHECK_EQ(new_sentence.token(t).start(), starts[t]);
+    CHECK_EQ(new_sentence.token(t).end(), ends[t]);
+  }
+
+  // Re-running should remove the old tokens.
+  segment_utils.SetCharsAsTokens(sentence.text(), chars, &new_sentence);
+  CHECK_EQ(new_sentence.token_size(), chars.size());
+  for (int t = 0; t < new_sentence.token_size(); ++t) {
+    CHECK_EQ(new_sentence.token(t).word(), chars[t]);
+    CHECK_EQ(new_sentence.token(t).start(), starts[t]);
+    CHECK_EQ(new_sentence.token(t).end(), ends[t]);
+  }
+}
+
+}  // namespace syntaxnet

+ 15 - 0
syntaxnet/syntaxnet/sentence.proto

@@ -59,3 +59,18 @@ message Token {
 
   extensions 1000 to max;
 }
+
+// Stores information about the morphology of a token.
+message TokenMorphology {
+  extend Token {
+    optional TokenMorphology morphology = 63949837;
+  }
+
+  // Morphology is represented by a set of attribute values.
+  message Attribute {
+    required string name = 1;
+    required string value = 2;
+  }
+  // This attribute field is designated to hold a single disambiguated analysis.
+  repeated Attribute attribute = 3;
+};

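Because TokenMorphology extends Token through a proto2 extension rather than a regular field, it is accessed with the extension API. A sketch of the pattern, which also appears in the text_formats.cc changes below:

    syntaxnet::Token *token = sentence.mutable_token(0);
    syntaxnet::TokenMorphology *morph =
        token->MutableExtension(syntaxnet::TokenMorphology::morphology);
    syntaxnet::TokenMorphology::Attribute *att = morph->add_attribute();
    att->set_name("Number");
    att->set_value("Plural");
    const syntaxnet::TokenMorphology &read_back =
        token->GetExtension(syntaxnet::TokenMorphology::morphology);
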
+ 1 - 1
syntaxnet/syntaxnet/sentence_batch.cc

@@ -24,7 +24,7 @@ limitations under the License.
 namespace syntaxnet {
 
 void SentenceBatch::Init(TaskContext *context) {
-  reader_.reset(new TextReader(*context->GetInput(input_name_)));
+  reader_.reset(new TextReader(*context->GetInput(input_name_), context));
   size_ = 0;
 }
 

+ 233 - 3
syntaxnet/syntaxnet/sentence_features.cc

@@ -14,9 +14,11 @@ limitations under the License.
 ==============================================================================*/
 
 #include "syntaxnet/sentence_features.h"
-
+#include "syntaxnet/char_properties.h"
 #include "syntaxnet/registry.h"
 #include "util/utf8/unicodetext.h"
+#include "util/utf8/unilib.h"
+#include "util/utf8/unilib_utf8_utils.h"
 
 namespace syntaxnet {
 
@@ -55,6 +57,83 @@ string TermFrequencyMapFeature::WorkspaceName() const {
                                              min_freq_, max_num_terms_);
 }
 
+TermFrequencyMapSetFeature::~TermFrequencyMapSetFeature() {
+  if (term_map_ != nullptr) {
+    SharedStore::Release(term_map_);
+    term_map_ = nullptr;
+  }
+}
+
+void TermFrequencyMapSetFeature::Setup(TaskContext *context) {
+  context->GetInput(input_name_, "text", "");
+}
+
+void TermFrequencyMapSetFeature::Init(TaskContext *context) {
+  min_freq_ = GetIntParameter("min-freq", 0);
+  max_num_terms_ = GetIntParameter("max-num-terms", 0);
+  file_name_ = context->InputFile(*context->GetInput(input_name_));
+  term_map_ = SharedStoreUtils::GetWithDefaultName<TermFrequencyMap>(
+      file_name_, min_freq_, max_num_terms_);
+  TokenLookupSetFeature::Init(context);
+}
+
+string TermFrequencyMapSetFeature::WorkspaceName() const {
+  return SharedStoreUtils::CreateDefaultName(
+      "term-frequency-map-set", input_name_, min_freq_, max_num_terms_);
+}
+
+namespace {
+void GetUTF8Chars(const string &word, vector<tensorflow::StringPiece> *chars) {
+  UnicodeText text;
+  text.PointToUTF8(word.c_str(), word.size());
+  for (UnicodeText::const_iterator it = text.begin(); it != text.end(); ++it) {
+    chars->push_back(tensorflow::StringPiece(it.utf8_data(), it.utf8_length()));
+  }
+}
+
+int UTF8FirstLetterNumBytes(const char *utf8_str) {
+  if (*utf8_str == '\0') return 0;
+  return UniLib::OneCharLen(utf8_str);
+}
+
+}  // namespace
+
+void CharNgram::GetTokenIndices(const Token &token, vector<int> *values) const {
+  values->clear();
+  vector<tensorflow::StringPiece> char_sp;
+  if (use_terminators_) char_sp.push_back("^");
+  GetUTF8Chars(token.word(), &char_sp);
+  if (use_terminators_) char_sp.push_back("$");
+  for (int start = 0; start < char_sp.size(); ++start) {
+    string char_ngram;
+    for (int index = 0;
+         index < max_char_ngram_length_ && start + index < char_sp.size();
+         ++index) {
+      tensorflow::StringPiece c = char_sp[start + index];
+      if (c == " ") break;  // Never add char ngrams containing spaces.
+      tensorflow::strings::StrAppend(&char_ngram, c);
+      int value = LookupIndex(char_ngram);
+      if (value != -1) {  // Skip unknown values.
+        values->push_back(value);
+      }
+    }
+  }
+}
+
+void MorphologySet::GetTokenIndices(const Token &token,
+                                    vector<int> *values) const {
+  values->clear();
+  const TokenMorphology &token_morphology =
+      token.GetExtension(TokenMorphology::morphology);
+  for (const TokenMorphology::Attribute &att : token_morphology.attribute()) {
+    int value =
+        LookupIndex(tensorflow::strings::StrCat(att.name(), "=", att.value()));
+    if (value != -1) {  // Skip unknown values.
+      values->push_back(value);
+    }
+  }
+}
+
 string Hyphen::GetFeatureValueName(FeatureValue value) const {
   switch (value) {
     case NO_HYPHEN:
@@ -70,6 +149,152 @@ FeatureValue Hyphen::ComputeValue(const Token &token) const {
   return (word.find('-') < word.length() ? HAS_HYPHEN : NO_HYPHEN);
 }
 
+void Capitalization::Setup(TaskContext *context) {
+  utf8_ = (GetParameter("utf8") == "true");
+}
+
+// Runs ComputeValueWithFocus for each token in the sentence.
+void Capitalization::Preprocess(WorkspaceSet *workspaces,
+                                Sentence *sentence) const {
+  if (workspaces->Has<VectorIntWorkspace>(Workspace())) return;
+  VectorIntWorkspace *workspace =
+      new VectorIntWorkspace(sentence->token_size());
+  for (int i = 0; i < sentence->token_size(); ++i) {
+    const int value = ComputeValueWithFocus(sentence->token(i), i);
+    workspace->set_element(i, value);
+  }
+  workspaces->Set<VectorIntWorkspace>(Workspace(), workspace);
+}
+
+string Capitalization::GetFeatureValueName(FeatureValue value) const {
+  switch (value) {
+    case LOWERCASE:
+      return "LOWERCASE";
+    case UPPERCASE:
+      return "UPPERCASE";
+    case CAPITALIZED:
+      return "CAPITALIZED";
+    case CAPITALIZED_SENTENCE_INITIAL:
+      return "CAPITALIZED_SENTENCE_INITIAL";
+    case NON_ALPHABETIC:
+      return "NON_ALPHABETIC";
+  }
+  return "<INVALID>";
+}
+
+FeatureValue Capitalization::ComputeValueWithFocus(const Token &token,
+                                                   int focus) const {
+  const string &word = token.word();
+
+  // Check whether there is an uppercase or lowercase character.
+  bool has_upper = false;
+  bool has_lower = false;
+  if (utf8_) {
+    LOG(FATAL) << "Not implemented.";
+  } else {
+    const char *str = word.c_str();
+    for (int i = 0; i < word.length(); ++i) {
+      const char c = str[i];
+      has_upper = (has_upper || (c >= 'A' && c <= 'Z'));
+      has_lower = (has_lower || (c >= 'a' && c <= 'z'));
+    }
+  }
+
+  // Compute simple values.
+  if (!has_upper && has_lower) return LOWERCASE;
+  if (has_upper && !has_lower) return UPPERCASE;
+  if (!has_upper && !has_lower) return NON_ALPHABETIC;
+
+  // Else has_upper && has_lower; a normal capitalized word. Check whether the
+  // token is sentence-initial (focus == 0) to pick the category.
+  const bool sentence_initial = (focus == 0);
+  return sentence_initial ? CAPITALIZED_SENTENCE_INITIAL : CAPITALIZED;
+}
+
+string PunctuationAmount::GetFeatureValueName(FeatureValue value) const {
+  switch (value) {
+    case NO_PUNCTUATION:
+      return "NO_PUNCTUATION";
+    case SOME_PUNCTUATION:
+      return "SOME_PUNCTUATION";
+    case ALL_PUNCTUATION:
+      return "ALL_PUNCTUATION";
+  }
+  return "<INVALID>";
+}
+
+FeatureValue PunctuationAmount::ComputeValue(const Token &token) const {
+  const string &word = token.word();
+  bool has_punctuation = false;
+  bool all_punctuation = true;
+
+  const char *start = word.c_str();
+  const char *end = word.c_str() + word.size();
+  while (start < end) {
+    int char_length = UTF8FirstLetterNumBytes(start);
+    bool char_is_punct = is_punctuation_or_symbol(start, char_length);
+    all_punctuation &= char_is_punct;
+    has_punctuation |= char_is_punct;
+    if (!all_punctuation && has_punctuation) return SOME_PUNCTUATION;
+    start += char_length;
+  }
+  if (!all_punctuation) return NO_PUNCTUATION;
+  return ALL_PUNCTUATION;
+}
+
+string Quote::GetFeatureValueName(FeatureValue value) const {
+  switch (value) {
+    case NO_QUOTE:
+      return "NO_QUOTE";
+    case OPEN_QUOTE:
+      return "OPEN_QUOTE";
+    case CLOSE_QUOTE:
+      return "CLOSE_QUOTE";
+    case UNKNOWN_QUOTE:
+      return "UNKNOWN_QUOTE";
+  }
+  return "<INVALID>";
+}
+
+FeatureValue Quote::ComputeValue(const Token &token) const {
+  const string &word = token.word();
+
+  // Penn Treebank open and close quotes are multi-character.
+  if (word == "``") return OPEN_QUOTE;
+  if (word == "''") return CLOSE_QUOTE;
+  if (word.length() == 1) {
+    int char_len = UTF8FirstLetterNumBytes(word.c_str());
+    bool is_open = is_open_quote(word.c_str(), char_len);
+    bool is_close = is_close_quote(word.c_str(), char_len);
+    if (is_open && !is_close) return OPEN_QUOTE;
+    if (is_close && !is_open) return CLOSE_QUOTE;
+    if (is_open && is_close) return UNKNOWN_QUOTE;
+  }
+  return NO_QUOTE;
+}
+
+void Quote::Preprocess(WorkspaceSet *workspaces, Sentence *sentence) const {
+  if (workspaces->Has<VectorIntWorkspace>(Workspace())) return;
+  VectorIntWorkspace *workspace =
+      new VectorIntWorkspace(sentence->token_size());
+
+  // For a plain double quote ("), it is unknown whether it opens or closes a
+  // quotation without looking at the prior tokens in the sentence. The flag
+  // in_quote is true iff an odd number of " marks have been seen so far in the
+  // sentence (similar to the behavior of some tokenizers).
+  bool in_quote = false;
+  for (int i = 0; i < sentence->token_size(); ++i) {
+    int quote_type = ComputeValue(sentence->token(i));
+    if (quote_type == UNKNOWN_QUOTE) {
+      // Update based on in_quote and flip in_quote.
+      quote_type = in_quote ? CLOSE_QUOTE : OPEN_QUOTE;
+      in_quote = !in_quote;
+    }
+    workspace->set_element(i, quote_type);
+  }
+  workspaces->Set<VectorIntWorkspace>(Workspace(), workspace);
+}
+
 string Digit::GetFeatureValueName(FeatureValue value) const {
   switch (value) {
     case NO_DIGIT:
@@ -130,8 +355,7 @@ static AffixTable *CreateAffixTable(const string &filename,
 void AffixTableFeature::Setup(TaskContext *context) {
   context->GetInput(input_name_, "recordio", "affix-table");
   affix_length_ = GetIntParameter("length", 0);
-  CHECK_GE(affix_length_, 0)
-      << "Length must be specified for affix preprocessor.";
+  CHECK_GE(affix_length_, 0) << "Length must be specified for affix feature.";
   TokenLookupFeature::Setup(context);
 }
 
@@ -181,6 +405,7 @@ REGISTER_CLASS_REGISTRY("sentence+index feature function", SentenceFeature);
 
 // Register the features defined in the header.
 REGISTER_SENTENCE_IDX_FEATURE("word", Word);
+REGISTER_SENTENCE_IDX_FEATURE("char", Char);
 REGISTER_SENTENCE_IDX_FEATURE("lcword", LowercaseWord);
 REGISTER_SENTENCE_IDX_FEATURE("tag", Tag);
 REGISTER_SENTENCE_IDX_FEATURE("offset", Offset);
@@ -188,5 +413,10 @@ REGISTER_SENTENCE_IDX_FEATURE("hyphen", Hyphen);
 REGISTER_SENTENCE_IDX_FEATURE("digit", Digit);
 REGISTER_SENTENCE_IDX_FEATURE("prefix", PrefixFeature);
 REGISTER_SENTENCE_IDX_FEATURE("suffix", SuffixFeature);
+REGISTER_SENTENCE_IDX_FEATURE("char-ngram", CharNgram);
+REGISTER_SENTENCE_IDX_FEATURE("morphology-set", MorphologySet);
+REGISTER_SENTENCE_IDX_FEATURE("capitalization", Capitalization);
+REGISTER_SENTENCE_IDX_FEATURE("punctuation-amount", PunctuationAmount);
+REGISTER_SENTENCE_IDX_FEATURE("quote", Quote);
 
 }  // namespace syntaxnet

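A worked example of the Quote::Preprocess() disambiguation above, assuming the char properties classify a plain ASCII double quote as both open and close (hence UNKNOWN_QUOTE in isolation):

    // tokens:  He        said      "           hello     "            .
    // values:  NO_QUOTE  NO_QUOTE  OPEN_QUOTE  NO_QUOTE  CLOSE_QUOTE  NO_QUOTE
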
+ 329 - 5
syntaxnet/syntaxnet/sentence_features.h

@@ -23,6 +23,7 @@ limitations under the License.
 #include "syntaxnet/affix.h"
 #include "syntaxnet/feature_extractor.h"
 #include "syntaxnet/feature_types.h"
+#include "syntaxnet/segmenter_utils.h"
 #include "syntaxnet/shared_store.h"
 #include "syntaxnet/task_context.h"
 #include "syntaxnet/workspace.h"
@@ -85,6 +86,88 @@ class TokenLookupFeature : public SentenceFeature {
     return workspaces.Get<VectorIntWorkspace>(workspace_).element(focus);
   }
 
+  int Workspace() const { return workspace_; }
+
+ private:
+  int workspace_;
+};
+
+// A multi-purpose specialization of SentenceFeature. Processes the tokens in
+// a Sentence by looking up a value set for each token and storing it in a
+// VectorVectorIntWorkspace. Given NumValues() base values, an extra value is
+// reserved for positions outside the sentence ("<OUTSIDE>").
+class TokenLookupSetFeature : public SentenceFeature {
+ public:
+  void Init(TaskContext *context) override {
+    set_feature_type(new ResourceBasedFeatureType<TokenLookupSetFeature>(
+        name(), this, {{NumValues(), "<OUTSIDE>"}}));
+  }
+
+  // Number of unique values.
+  virtual int64 NumValues() const = 0;
+
+  // Given a position in a sentence and workspaces, looks up the corresponding
+  // feature value set. The index is relative to the start of the sentence.
+  virtual void LookupToken(const WorkspaceSet &workspaces,
+                           const Sentence &sentence, int index,
+                           vector<int> *values) const = 0;
+
+  // Given a feature value, returns a string representation.
+  virtual string GetFeatureValueName(int value) const = 0;
+
+  // Name of the shared workspace.
+  virtual string WorkspaceName() const = 0;
+
+  // TokenLookupSetFeatures use VectorVectorIntWorkspaces by default.
+  void RequestWorkspaces(WorkspaceRegistry *registry) override {
+    workspace_ = registry->Request<VectorVectorIntWorkspace>(WorkspaceName());
+  }
+
+  // Default preprocessing: looks up a value set for each token in the
+  // Sentence.
+  void Preprocess(WorkspaceSet *workspaces, Sentence *sentence) const override {
+    if (workspaces->Has<VectorVectorIntWorkspace>(workspace_)) return;
+    VectorVectorIntWorkspace *workspace =
+        new VectorVectorIntWorkspace(sentence->token_size());
+    for (int i = 0; i < sentence->token_size(); ++i) {
+      LookupToken(*workspaces, *sentence, i, workspace->mutable_elements(i));
+    }
+    workspaces->Set<VectorVectorIntWorkspace>(workspace_, workspace);
+  }
+
+  // Returns a pre-computed token value from the cache. This assumes the cache
+  // is populated.
+  const vector<int> &GetCachedValueSet(const WorkspaceSet &workspaces,
+                                       const Sentence &sentence,
+                                       int focus) const {
+    // Do bounds checking on focus.
+    CHECK_GE(focus, 0);
+    CHECK_LT(focus, sentence.token_size());
+
+    // Return value from cache.
+    return workspaces.Get<VectorVectorIntWorkspace>(workspace_).elements(focus);
+  }
+
+  // Adds any precomputed features at the given focus, if present.
+  void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
+                int focus, FeatureVector *result) const override {
+    if (focus >= 0 && focus < sentence.token_size()) {
+      const vector<int> &elements =
+          GetCachedValueSet(workspaces, sentence, focus);
+      for (auto &value : elements) {
+        result->add(this->feature_type(), value);
+      }
+    }
+  }
+
+  // Returns the precomputed value, or NumValues() for features outside
+  // the sentence.
+  FeatureValue Compute(const WorkspaceSet &workspaces, const Sentence &sentence,
+                       int focus, const FeatureVector *result) const override {
+    if (focus < 0 || focus >= sentence.token_size()) return NumValues();
+    return workspaces.Get<VectorIntWorkspace>(workspace_).element(focus);
+  }
+
  private:
   int workspace_;
 };
@@ -134,6 +217,83 @@ class TermFrequencyMapFeature : public TokenLookupFeature {
   int max_num_terms_;
 };
 
+// Specialization of the TokenLookupSetFeature class that uses a
+// TermFrequencyMap to perform the mapping. This takes two options: "min-freq"
+// (discard tokens whose frequency is below this threshold) and
+// "max-num-terms" (read in at most this many terms).
+class TermFrequencyMapSetFeature : public TokenLookupSetFeature {
+ public:
+  // Initializes with an empty name, since we need the options to compute the
+  // actual workspace name.
+  explicit TermFrequencyMapSetFeature(const string &input_name)
+      : input_name_(input_name), min_freq_(0), max_num_terms_(0) {}
+
+  // Releases shared resources.
+  ~TermFrequencyMapSetFeature() override;
+
+  // Computes the set of feature value indices for the given token.
+  virtual void GetTokenIndices(const Token &token,
+                               vector<int> *values) const = 0;
+
+  // Requests the resource inputs.
+  void Setup(TaskContext *context) override;
+
+  // Obtains resources using the shared store. At this point options are known
+  // so the full name can be computed.
+  void Init(TaskContext *context) override;
+
+  // Number of unique values.
+  int64 NumValues() const override { return term_map_->Size(); }
+
+  // Special value for strings not in the map.
+  FeatureValue UnknownValue() const { return term_map_->Size(); }
+
+  // Gets pointer to the underlying map.
+  const TermFrequencyMap *term_map() const { return term_map_; }
+
+  // Returns the term index, or -1 if the term is unknown. Used inside
+  // GetTokenIndices() specializations for convenience.
+  int LookupIndex(const string &term) const {
+    return term_map_->LookupIndex(term, -1);
+  }
+
+  // Given a position in a sentence and workspaces, looks up the corresponding
+  // feature value set. The index is relative to the start of the sentence.
+  void LookupToken(const WorkspaceSet &workspaces, const Sentence &sentence,
+                   int index, vector<int> *values) const override {
+    GetTokenIndices(sentence.token(index), values);
+  }
+
+  // Uses the TermFrequencyMap to lookup the string associated with a value.
+  string GetFeatureValueName(int value) const override {
+    if (value == UnknownValue()) return "<UNKNOWN>";
+    if (value >= 0 && value < NumValues()) {
+      return term_map_->GetTerm(value);
+    }
+    LOG(ERROR) << "Invalid feature value: " << value;
+    return "<INVALID>";
+  }
+
+  // Name of the shared workspace.
+  string WorkspaceName() const override;
+
+ private:
+  // Shortcut pointer to shared map. Not owned.
+  const TermFrequencyMap *term_map_ = nullptr;
+
+  // Name of the input for the term map.
+  string input_name_;
+
+  // Filename of the underlying resource.
+  string file_name_;
+
+  // Minimum frequency for term map.
+  int min_freq_;
+
+  // Maximum number of terms for term map.
+  int max_num_terms_;
+};
+
 class Word : public TermFrequencyMapFeature {
  public:
   Word() : TermFrequencyMapFeature("word-map") {}
@@ -144,6 +304,36 @@ class Word : public TermFrequencyMapFeature {
   }
 };
 
+class Char : public TermFrequencyMapFeature {
+ public:
+  Char() : TermFrequencyMapFeature("char-map") {}
+
+  FeatureValue ComputeValue(const Token &token) const override {
+    const string &form = token.word();
+    if (SegmenterUtils::IsBreakChar(form)) return BreakCharValue();
+    return term_map().LookupIndex(form, UnknownValue());
+  }
+
+  // Special value for breaks.
+  FeatureValue BreakCharValue() const { return term_map().Size(); }
+
+  // Special value for non-break strings not in the map.
+  FeatureValue UnknownValue() const { return term_map().Size() + 1; }
+
+  // Number of unique values.
+  int64 NumValues() const override { return term_map().Size() + 2; }
+
+  string GetFeatureValueName(FeatureValue value) const override {
+    if (value == BreakCharValue()) return "<BREAK_CHAR>";
+    if (value == UnknownValue()) return "<UNKNOWN>";
+    if (value >= 0 && value < term_map().Size()) {
+      return term_map().GetTerm(value);
+    }
+    LOG(ERROR) << "Invalid feature value: " << value;
+    return "<INVALID>";
+  }
+};
+
 class LowercaseWord : public TermFrequencyMapFeature {
  public:
   LowercaseWord() : TermFrequencyMapFeature("lc-word-map") {}
@@ -172,6 +362,47 @@ class Label : public TermFrequencyMapFeature {
   }
 };
 
+class CharNgram : public TermFrequencyMapSetFeature {
+ public:
+  CharNgram() : TermFrequencyMapSetFeature("char-ngram-map") {}
+  ~CharNgram() override {}
+
+  void Setup(TaskContext *context) override {
+    TermFrequencyMapSetFeature::Setup(context);
+    max_char_ngram_length_ = context->Get("lexicon_max_char_ngram_length", 3);
+    use_terminators_ =
+        context->Get("lexicon_char_ngram_include_terminators", false);
+  }
+
+  // Computes indices of the token's character ngrams.
+  void GetTokenIndices(const Token &token, vector<int> *values) const override;
+
+ private:
+  // Size parameter (n) for the ngrams.
+  int max_char_ngram_length_ = 3;
+
+  // Whether to pad the word with ^ and $ before extracting ngrams.
+  bool use_terminators_ = false;
+};
+
+class MorphologySet : public TermFrequencyMapSetFeature {
+ public:
+  MorphologySet() : TermFrequencyMapSetFeature("morphology-map") {}
+  ~MorphologySet() override {}
+
+  void Setup(TaskContext *context) override {
+    TermFrequencyMapSetFeature::Setup(context);
+  }
+
+  int64 NumValues() const override {
+    return term_map()->Size() - 1;
+  }
+
+  // Computes indices of the token's morphological attributes.
+  void GetTokenIndices(const Token &token, vector<int> *values) const override;
+};
+
 class LexicalCategoryFeature : public TokenLookupFeature {
  public:
   LexicalCategoryFeature(const string &name, int cardinality)
@@ -180,7 +411,7 @@ class LexicalCategoryFeature : public TokenLookupFeature {
 
   FeatureValue NumValues() const override { return cardinality_; }
 
-  // Returns the identifier for the workspace for this preprocessor.
+  // Returns the identifier for the workspace for this feature.
   string WorkspaceName() const override {
     return tensorflow::strings::StrCat(name_, ":", cardinality_);
   }
@@ -193,7 +424,7 @@ class LexicalCategoryFeature : public TokenLookupFeature {
   const int cardinality_;
 };
 
-// Preprocessor that computes whether a word has a hyphen or not.
+// Feature that computes whether a word has a hyphen or not.
 class Hyphen : public LexicalCategoryFeature {
  public:
   // Enumeration of values.
@@ -213,7 +444,100 @@ class Hyphen : public LexicalCategoryFeature {
   FeatureValue ComputeValue(const Token &token) const override;
 };
 
-// Preprocessor that computes whether a word has a hyphen or not.
+// Feature that categorizes the capitalization of the word. If the option
+// utf8=true is specified, lowercase and uppercase checks are to be done with
+// UTF8-compliant functions (this code path is not yet implemented).
+class Capitalization : public LexicalCategoryFeature {
+ public:
+  // Enumeration of values.
+  enum Category {
+    LOWERCASE = 0,                     // normal word
+    UPPERCASE = 1,                     // all-caps
+    CAPITALIZED = 2,                   // has one cap and one non-cap
+    CAPITALIZED_SENTENCE_INITIAL = 3,  // same as above but sentence-initial
+    NON_ALPHABETIC = 4,                // contains no alphabetic characters
+    CARDINALITY = 5,
+  };
+
+  // Default constructor.
+  Capitalization() : LexicalCategoryFeature("capitalization", CARDINALITY) {}
+
+  // Sets one of the options for the capitalization.
+  void Setup(TaskContext *context) override;
+
+  // Capitalization needs special preprocessing because token category can
+  // depend on whether the token is at the start of the sentence.
+  void Preprocess(WorkspaceSet *workspaces, Sentence *sentence) const override;
+
+  // Returns a string representation of the enum value.
+  string GetFeatureValueName(FeatureValue value) const override;
+
+  // Returns the category value for the token.
+  FeatureValue ComputeValue(const Token &token) const override {
+    LOG(FATAL) << "Capitalization should use ComputeValueWithFocus.";
+    return 0;
+  }
+
+  // Returns the category value for the token.
+  FeatureValue ComputeValueWithFocus(const Token &token, int focus) const;
+
+ private:
+  // Whether to use UTF8 compliant functions to check capitalization.
+  bool utf8_ = false;
+};
+
+// A ternary feature that computes how much punctuation the focus token
+// contains: none, some, or all of its characters.
+class PunctuationAmount : public LexicalCategoryFeature {
+ public:
+  // Enumeration of values.
+  enum Category {
+    NO_PUNCTUATION = 0,
+    SOME_PUNCTUATION = 1,
+    ALL_PUNCTUATION = 2,
+    CARDINALITY = 3,
+  };
+
+  // Default constructor.
+  PunctuationAmount()
+      : LexicalCategoryFeature("punctuation-amount", CARDINALITY) {}
+
+  // Returns a string representation of the enum value.
+  string GetFeatureValueName(FeatureValue value) const override;
+
+  // Returns the category value for the token.
+  FeatureValue ComputeValue(const Token &token) const override;
+};
+
+// A feature that returns whether the word is an open or close quotation
+// mark, based on its position relative to other quotation marks in the
+// sentence.
+class Quote : public LexicalCategoryFeature {
+ public:
+  // Enumeration of values.
+  enum Category {
+    NO_QUOTE = 0,
+    OPEN_QUOTE = 1,
+    CLOSE_QUOTE = 2,
+    UNKNOWN_QUOTE = 3,
+    CARDINALITY = 4,
+  };
+
+  // Default constructor.
+  Quote() : LexicalCategoryFeature("quote", CARDINALITY) {}
+
+  // Returns a string representation of the enum value.
+  string GetFeatureValueName(FeatureValue value) const override;
+
+  // Returns the category value for the token.
+  FeatureValue ComputeValue(const Token &token) const override;
+
+  // Override preprocess to compute open and close quotes from prior context of
+  // the sentence.
+  void Preprocess(WorkspaceSet *workspaces, Sentence *instance) const override;
+};
+
+// Feature that computes whether a word has digits or not.
 class Digit : public LexicalCategoryFeature {
  public:
   // Enumeration of values.
@@ -234,9 +558,9 @@ class Digit : public LexicalCategoryFeature {
   FeatureValue ComputeValue(const Token &token) const override;
 };
 
-// TokenLookupPreprocessor object to compute prefixes and suffixes of words. The
+// TokenLookupFeature object to compute prefixes and suffixes of words. The
 // AffixTable is stored in the SharedStore. This is very similar to the
-// implementation of TermFrequencyMapPreprocessor, but using an AffixTable to
+// implementation of TermFrequencyMapFeature, but using an AffixTable to
 // perform the lookups. There are only two specializations, for prefixes and
 // suffixes.
 class AffixTableFeature : public TokenLookupFeature {

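To make the CharNgram extraction order concrete: with the default maximum length of 3 and terminators disabled, the token "saw" produces the candidates below, in this order; candidates missing from the term map are dropped at lookup time (the char-ngram test later in this change depends on exactly this order):

    // start 's': s, sa, saw
    // start 'a': a, aw
    // start 'w': w
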
+ 123 - 4
syntaxnet/syntaxnet/sentence_features_test.cc

@@ -26,6 +26,7 @@ limitations under the License.
 #include "syntaxnet/utils.h"
 #include "syntaxnet/workspace.h"
 #include <gmock/gmock.h>
+#include "tensorflow/core/platform/test.h"
 
 using testing::UnorderedElementsAreArray;
 
@@ -83,6 +84,27 @@ class SentenceFeaturesTest : public ::testing::Test {
     return values;
   }
 
+  // Adds an input to the task context.
+  void AddInputToContext(const string &name, const string &file_pattern,
+                         const string &file_format,
+                         const string &record_format) {
+    TaskInput *input = context_.GetInput(name);
+    TaskInput::Part *part = input->add_part();
+    part->set_file_pattern(file_pattern);
+    part->set_file_format(file_format);
+    part->set_record_format(record_format);
+  }
+
+  // Checks that a vector workspace is equal to a target vector.
+  void CheckVectorWorkspace(const VectorIntWorkspace &workspace,
+                            vector<int> target) {
+    vector<int> src;
+    for (int i = 0; i < workspace.size(); ++i) {
+      src.push_back(workspace.element(i));
+    }
+    EXPECT_THAT(src, testing::ContainerEq(target));
+  }
+
   Sentence sentence_;
   WorkspaceSet workspaces_;
 
@@ -99,13 +121,18 @@ class CommonSentenceFeaturesTest : public SentenceFeaturesTest {
       : SentenceFeaturesTest(
             "text: 'I saw a man with a telescope.' "
             "token { word: 'I' start: 0 end: 0 tag: 'PRP' category: 'PRON'"
-            " head: 1 label: 'nsubj' break_level: NO_BREAK } "
+            "  head: 1 label: 'nsubj' break_level: NO_BREAK } "
             "token { word: 'saw' start: 2 end: 4 tag: 'VBD' category: 'VERB'"
-            " label: 'ROOT' break_level: SPACE_BREAK } "
+            "  label: 'ROOT' break_level: SPACE_BREAK } "
             "token { word: 'a' start: 6 end: 6 tag: 'DT' category: 'DET'"
-            " head: 3 label: 'det' break_level: SPACE_BREAK } "
+            "  head: 3 label: 'det' break_level: SPACE_BREAK } "
             "token { word: 'man' start: 8 end: 10 tag: 'NN' category: 'NOUN'"
-            " head: 1 label: 'dobj' break_level: SPACE_BREAK } "
+            "  head: 1 label: 'dobj' break_level: SPACE_BREAK"
+            "  [syntaxnet.TokenMorphology.morphology] { "
+            "    attribute { name:'morph' value:'Sg' } "
+            "    attribute { name:'morph' value:'Masc' } "
+            "  } "
+            "} "
             "token { word: 'with' start: 12 end: 15 tag: 'IN' category: 'ADP'"
             " head: 1 label: 'prep' break_level: SPACE_BREAK } "
             "token { word: 'a' start: 17 end: 17 tag: 'DT' category: 'DET'"
@@ -152,4 +179,96 @@ TEST_F(CommonSentenceFeaturesTest, OffsetPlusTag) {
   EXPECT_EQ("<OUTSIDE>", ExtractFeature(9));
 }
 
+TEST_F(CommonSentenceFeaturesTest, CharNgramFeature) {
+  TermFrequencyMap char_ngram_map;
+  char_ngram_map.Increment("a");
+  char_ngram_map.Increment("aw");
+  char_ngram_map.Increment("sa");
+  creators_.Add(
+      "char-ngram-map", "text", "",
+      [&char_ngram_map](const string &path) { char_ngram_map.Save(path); });
+
+  // Test that CharNgram works as expected.
+  PrepareFeature("char-ngram");
+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(-1), ","));
+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(0), ","));
+  EXPECT_EQ("sa,a,aw", utils::Join(ExtractMultiFeature(1), ","));
+  EXPECT_EQ("a", utils::Join(ExtractMultiFeature(2), ","));
+  EXPECT_EQ("a", utils::Join(ExtractMultiFeature(3), ","));
+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(8), ","));
+}
+
+TEST_F(CommonSentenceFeaturesTest, MorphologySetFeature) {
+  TermFrequencyMap morphology_map;
+  morphology_map.Increment("morph=Sg");
+  morphology_map.Increment("morph=Sg");
+  morphology_map.Increment("morph=Masc");
+  morphology_map.Increment("morph=Masc");
+  morphology_map.Increment("morph=Pl");
+  creators_.Add(
+      "morphology-map", "text", "",
+      [&morphology_map](const string &path) { morphology_map.Save(path); });
+
+  // Test that MorphologySet works as expected.
+  PrepareFeature("morphology-set");
+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(-1), ","));
+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(0), ","));
+  EXPECT_EQ("morph=Sg,morph=Masc", utils::Join(ExtractMultiFeature(3), ","));
+}
+
+TEST_F(CommonSentenceFeaturesTest, CapitalizationProcessesCorrectly) {
+  Capitalization feature;
+  feature.RequestWorkspaces(&registry_);
+  workspaces_.Reset(registry_);
+  feature.Preprocess(&workspaces_, &sentence_);
+
+  // Check the workspace contains what we expect.
+  EXPECT_TRUE(workspaces_.Has<VectorIntWorkspace>(feature.Workspace()));
+  const VectorIntWorkspace &workspace =
+      workspaces_.Get<VectorIntWorkspace>(feature.Workspace());
+  constexpr int UPPERCASE = Capitalization::UPPERCASE;
+  constexpr int LOWERCASE = Capitalization::LOWERCASE;
+  constexpr int NON_ALPHABETIC = Capitalization::NON_ALPHABETIC;
+  CheckVectorWorkspace(workspace,
+                       {UPPERCASE, LOWERCASE, LOWERCASE, LOWERCASE, LOWERCASE,
+                        LOWERCASE, LOWERCASE, NON_ALPHABETIC});
+}
+
+class CharFeatureTest : public SentenceFeaturesTest {
+ protected:
+  CharFeatureTest()
+      : SentenceFeaturesTest(
+          "text: '一 个 测 试 员  ' "
+          "token { word: '一' start: 0 end: 2 } "
+          "token { word: '个' start: 3 end: 5 } "
+          "token { word: '测' start: 6 end: 8 } "
+          "token { word: '试' start: 9 end: 11 } "
+          "token { word: '员' start: 12 end: 14 } "
+          "token { word: ' ' start: 15 end: 15 } "
+          "token { word: '\t' start: 16 end: 16 } ") {}
+};
+
+TEST_F(CharFeatureTest, CharFeature) {
+  TermFrequencyMap char_map;
+  char_map.Increment("一");
+  char_map.Increment("个");
+  char_map.Increment("试");
+  char_map.Increment("员");
+  creators_.Add(
+      "char-map", "text", "",
+      [&char_map](const string &path) { char_map.Save(path); });
+
+  // Test that Char works as expected.
+  PrepareFeature("char");
+  EXPECT_EQ("<OUTSIDE>", ExtractFeature(-1));
+  EXPECT_EQ("一", ExtractFeature(0));
+  EXPECT_EQ("个", ExtractFeature(1));
+  EXPECT_EQ("<UNKNOWN>", ExtractFeature(2));  // "测" is not in the char map.
+  EXPECT_EQ("试", ExtractFeature(3));
+  EXPECT_EQ("员", ExtractFeature(4));
+  EXPECT_EQ("<BREAK_CHAR>", ExtractFeature(5));
+  EXPECT_EQ("<BREAK_CHAR>", ExtractFeature(6));
+  EXPECT_EQ("<OUTSIDE>", ExtractFeature(7));
+}
+
 }  // namespace syntaxnet

+ 40 - 5
syntaxnet/syntaxnet/tagger_transitions.cc

@@ -25,8 +25,10 @@ limitations under the License.
 
 #include <string>
 
+#include "syntaxnet/parser_features.h"
 #include "syntaxnet/parser_state.h"
 #include "syntaxnet/parser_transitions.h"
+#include "syntaxnet/sentence_features.h"
 #include "syntaxnet/shared_store.h"
 #include "syntaxnet/task_context.h"
 #include "syntaxnet/term_frequency_map.h"
@@ -98,7 +100,9 @@ class TaggerTransitionState : public ParserTransitionState {
     for (size_t i = 0; i < tag_.size(); ++i) {
       Token *token = sentence->mutable_token(i);
       token->set_tag(TagAsString(Tag(i)));
-      token->set_category(tag_to_category_->GetCategory(token->tag()));
+      if (tag_to_category_) {
+        token->set_category(tag_to_category_->GetCategory(token->tag()));
+      }
     }
   }
 
@@ -146,6 +150,7 @@ class TaggerTransitionSystem : public ParserTransitionSystem {
   // Determines tag map location.
   void Setup(TaskContext *context) override {
     input_tag_map_ = context->GetInput("tag-map", "text", "");
+    join_category_to_pos_ = context->GetBoolParameter("join_category_to_pos");
     input_tag_to_category_ = context->GetInput("tag-to-category", "text", "");
   }
 
@@ -154,15 +159,21 @@ class TaggerTransitionSystem : public ParserTransitionSystem {
     const string tag_map_path = TaskContext::InputFile(*input_tag_map_);
     tag_map_ = SharedStoreUtils::GetWithDefaultName<TermFrequencyMap>(
         tag_map_path, 0, 0);
-    const string tag_to_category_path =
-        TaskContext::InputFile(*input_tag_to_category_);
-    tag_to_category_ = SharedStoreUtils::GetWithDefaultName<TagToCategoryMap>(
-        tag_to_category_path);
+    if (!join_category_to_pos_) {
+      const string tag_to_category_path =
+          TaskContext::InputFile(*input_tag_to_category_);
+      tag_to_category_ = SharedStoreUtils::GetWithDefaultName<TagToCategoryMap>(
+          tag_to_category_path);
+    }
   }
 
   // The SHIFT action uses the same value as the corresponding action type.
   static ParserAction ShiftAction(int tag) { return tag; }
 
+  // The tagger transition system doesn't look at the dependency tree, so it
+  // allows non-projective trees.
+  bool AllowsNonProjective() const override { return true; }
+
   // Returns the number of action types.
   int NumActionTypes() const override { return 1; }
 
@@ -251,8 +262,32 @@ class TaggerTransitionSystem : public ParserTransitionSystem {
 
   // Tag to category map. Owned through SharedStore.
   const TagToCategoryMap *tag_to_category_ = nullptr;
+
+  bool join_category_to_pos_ = false;
 };
 
 REGISTER_TRANSITION_SYSTEM("tagger", TaggerTransitionSystem);
 
+// Feature function for retrieving the tag assigned to a token by the tagger
+// transition system.
+class PredictedTagFeatureFunction
+    : public BasicParserSentenceFeatureFunction<Tag> {
+ public:
+  PredictedTagFeatureFunction() {}
+
+  // Gets the TaggerTransitionState from the parser state and reads the assigned
+  // tag at the focus index. Returns -1 if the focus is not within the sentence.
+  FeatureValue Compute(const WorkspaceSet &workspaces, const ParserState &state,
+                       int focus, const FeatureVector *result) const override {
+    if (focus < 0 || focus >= state.sentence().token_size()) return -1;
+    return static_cast<const TaggerTransitionState *>(state.transition_state())
+        ->Tag(focus);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(PredictedTagFeatureFunction);
+};
+
+REGISTER_PARSER_IDX_FEATURE_FUNCTION("pred-tag", PredictedTagFeatureFunction);
+
 }  // namespace syntaxnet

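The new "pred-tag" function exposes the tagger's running predictions to feature specs (hypothetical locator syntax: input.pred-tag, input(1).pred-tag). Unlike "tag", which reads the annotation already stored on the sentence, it reads the tags assigned so far by the transition system itself.
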
+ 6 - 0
syntaxnet/syntaxnet/testdata/context.pbtxt

@@ -61,6 +61,12 @@ input {
   }
 }
 input {
+  name: 'char-map'
+  Part {
+    file_pattern: 'OUTPATH/char-map'
+  }
+}
+input {
   name: 'prefix-table'
   Part {
     file_pattern: 'OUTPATH/prefix-table'

+ 107 - 7
syntaxnet/syntaxnet/text_formats.cc

@@ -63,6 +63,11 @@ class CoNLLSyntaxFormat : public DocumentFormat {
  public:
   CoNLLSyntaxFormat() {}
 
+  void Setup(TaskContext *context) override {
+    join_category_to_pos_ = context->GetBoolParameter("join_category_to_pos");
+    add_pos_as_attribute_ = context->GetBoolParameter("add_pos_as_attribute");
+  }
+
   // Reads up to the first empty line and returns false if end of file is reached.
   bool ReadRecord(tensorflow::io::InputBuffer *buffer,
                   string *record) override {
@@ -121,6 +126,7 @@ class CoNLLSyntaxFormat : public DocumentFormat {
       const string &word = fields[1];
       const string &cpostag = fields[3];
       const string &tag = fields[4];
+      const string &attributes = fields[5];
       const int head = utils::ParseUsing<int>(fields[6], 0, utils::ParseInt32);
       const string &label = fields[7];
 
@@ -139,6 +145,9 @@ class CoNLLSyntaxFormat : public DocumentFormat {
       if (!tag.empty()) token->set_tag(tag);
       if (!cpostag.empty()) token->set_category(cpostag);
       if (!label.empty()) token->set_label(label);
+      if (!attributes.empty()) AddMorphAttributes(attributes, token);
+      if (join_category_to_pos_) JoinCategoryToPos(token);
+      if (add_pos_as_attribute_) AddPosAsAttribute(token);
     }
 
     if (sentence->token_size() > 0) {
@@ -158,16 +167,18 @@ class CoNLLSyntaxFormat : public DocumentFormat {
     *key = sentence.docid();
     vector<string> lines;
     for (int i = 0; i < sentence.token_size(); ++i) {
+      Token token = sentence.token(i);
+      if (join_category_to_pos_) SplitCategoryFromPos(&token);
+      if (add_pos_as_attribute_) RemovePosFromAttributes(&token);
       vector<string> fields(10);
       fields[0] = tensorflow::strings::Printf("%d", i + 1);
-      fields[1] = sentence.token(i).word();
+      fields[1] = token.word();
       fields[2] = "_";
-      fields[3] = sentence.token(i).category();
-      fields[4] = sentence.token(i).tag();
-      fields[5] = "_";
-      fields[6] =
-          tensorflow::strings::Printf("%d", sentence.token(i).head() + 1);
-      fields[7] = sentence.token(i).label();
+      fields[3] = token.category();
+      fields[4] = token.tag();
+      fields[5] = GetMorphAttributes(token);
+      fields[6] = tensorflow::strings::Printf("%d", token.head() + 1);
+      fields[7] = token.label();
       fields[8] = "_";
       fields[9] = "_";
       lines.push_back(utils::Join(fields, "\t"));
@@ -176,6 +187,95 @@ class CoNLLSyntaxFormat : public DocumentFormat {
   }
 
  private:
+  // Creates a TokenMorphology object out of a list of attribute values of the
+  // form: a1=v1|a2=v2|... or v1|v2|...
+  void AddMorphAttributes(const string &attributes, Token *token) {
+    TokenMorphology *morph =
+        token->MutableExtension(TokenMorphology::morphology);
+    vector<string> att_vals = utils::Split(attributes, '|');
+    for (int i = 0; i < att_vals.size(); ++i) {
+      vector<string> att_val = utils::Split(att_vals[i], '=');
+      CHECK_LE(att_val.size(), 2)
+          << "Error parsing morphology features column; expected format "
+          << "a1=v1|a2=v2|... or v1|v2|..., got: " << attributes;
+
+      // Format is either:
+      //   1) a1=v1|a2=v2..., e.g., Czech CoNLL data, or,
+      //   2) v1|v2|..., e.g., German CoNLL data.
+      const pair<string, string> name_value =
+          att_val.size() == 2 ? std::make_pair(att_val[0], att_val[1])
+                              : std::make_pair(att_val[0], "on");
+
+      // We currently don't expect an empty attribute value, but might have an
+      // empty attribute name due to data input errors.
+      if (name_value.second.empty()) {
+        LOG(WARNING) << "Invalid attributes string: " << attributes
+                     << " for token: " << token->ShortDebugString();
+        continue;
+      }
+      if (!name_value.first.empty()) {
+        TokenMorphology::Attribute *attribute = morph->add_attribute();
+        attribute->set_name(name_value.first);
+        attribute->set_value(name_value.second);
+      }
+    }
+  }
+
+  // Creates a list of attribute values of the form a1=v1|a2=v2|... or v1|v2|...
+  // from a TokenMorphology object.
+  string GetMorphAttributes(const Token &token) {
+    const TokenMorphology &morph =
+        token.GetExtension(TokenMorphology::morphology);
+    if (morph.attribute_size() == 0) return "_";
+    string attributes;
+    for (const TokenMorphology::Attribute &attribute : morph.attribute()) {
+      if (!attributes.empty()) tensorflow::strings::StrAppend(&attributes, "|");
+      tensorflow::strings::StrAppend(&attributes, attribute.name());
+      if (attribute.value() != "on") {
+        tensorflow::strings::StrAppend(&attributes, "=", attribute.value());
+      }
+    }
+    return attributes;
+  }
+
+  void JoinCategoryToPos(Token *token) {
+    token->set_tag(
+        tensorflow::strings::StrCat(token->category(), "++", token->tag()));
+    token->clear_category();
+  }
+
+  void SplitCategoryFromPos(Token *token) {
+    const string &tag = token->tag();
+    const size_t pos = tag.find("++");
+    if (pos != string::npos) {
+      token->set_category(tag.substr(0, pos));
+      token->set_tag(tag.substr(pos + 2));
+    }
+  }
+
+  void AddPosAsAttribute(Token *token) {
+    if (!token->tag().empty()) {
+      TokenMorphology *morph =
+          token->MutableExtension(TokenMorphology::morphology);
+      TokenMorphology::Attribute *attribute = morph->add_attribute();
+      attribute->set_name("fPOS");
+      attribute->set_value(token->tag());
+    }
+  }
+
+  void RemovePosFromAttributes(Token *token) {
+    // Assumes the "fPOS" attribute, if present, is the last one. The size
+    // check guards tokens with no morphology at all, for which rbegin()
+    // would otherwise be dereferenced on an empty repeated field.
+    TokenMorphology *morph =
+        token->MutableExtension(TokenMorphology::morphology);
+    if (morph->attribute_size() > 0 &&
+        morph->attribute().rbegin()->name() == "fPOS") {
+      morph->mutable_attribute()->RemoveLast();
+    }
+  }
+
+  bool join_category_to_pos_ = false;
+  bool add_pos_as_attribute_ = false;
+
   TF_DISALLOW_COPY_AND_ASSIGN(CoNLLSyntaxFormat);
 };
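
To make the round-trip concrete, here is a standalone sketch of the column parsing that `AddMorphAttributes` performs. This is a re-implementation for illustration, not the code above; `ParseMorphColumn` and the sample strings are invented, and only the two column formats and the "on" convention come from the patch:

```cpp
#include <string>
#include <utility>
#include <vector>

// Splitting on '|' and then on '=' yields (name, value) pairs; a bare value
// v is stored as (v, "on"), which is exactly the convention that
// GetMorphAttributes reverses when it omits "=on" on output.
std::vector<std::pair<std::string, std::string>> ParseMorphColumn(
    const std::string &column) {
  std::vector<std::pair<std::string, std::string>> result;
  if (column.empty() || column == "_") return result;  // empty CoNLL column
  size_t start = 0;
  while (start <= column.size()) {
    size_t bar = column.find('|', start);
    if (bar == std::string::npos) bar = column.size();
    const std::string item = column.substr(start, bar - start);
    start = bar + 1;
    if (item.empty()) continue;  // tolerate stray separators
    const size_t eq = item.find('=');
    if (eq == std::string::npos) {
      result.emplace_back(item, "on");  // bare values: v1|v2|...
    } else {
      result.emplace_back(item.substr(0, eq),
                          item.substr(eq + 1));  // pairs: a1=v1|a2=v2|...
    }
  }
  return result;
}

// ParseMorphColumn("Case=Nom|Gender=Masc")
//     -> {{"Case", "Nom"}, {"Gender", "Masc"}}
// ParseMorphColumn("Nom|Masc")
//     -> {{"Nom", "on"}, {"Masc", "on"}}
```

The "on" placeholder is what lets bare values round-trip: `GetMorphAttributes` omits "=on" on output, so "Nom|Masc" is written back exactly as it was read. The "++" separator plays the same reversible role for `join_category_to_pos`: category "NOUN" and tag "NN" are stored as the single tag "NOUN++NN" and split again at the first "++" when writing.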
 

+ 59 - 1
syntaxnet/syntaxnet/utils.h

@@ -62,7 +62,7 @@ string Join(const std::vector<T> &s, const char *sep) {
   return result;
 }
 
-string JoinPath(std::initializer_list<StringPiece> paths);
+string JoinPath(std::initializer_list<tensorflow::StringPiece> paths);
 
 size_t RemoveLeadingWhitespace(tensorflow::StringPiece *text);
 
@@ -165,6 +165,64 @@ class PunctuationUtil {
 
 void NormalizeDigits(string *form);
 
+// Helper type to mark missing c-tor argument types
+// for Type's c-tor in LazyStaticPtr<Type, ...>.
+struct NoArg {};
+
+template <typename Type, typename Arg1 = NoArg, typename Arg2 = NoArg,
+          typename Arg3 = NoArg>
+class LazyStaticPtr {
+ public:
+  typedef Type element_type;  // per smart pointer convention
+
+  // Pretend to be a pointer to Type (never NULL due to on-demand creation):
+  Type &operator*() const { return *get(); }
+  Type *operator->() const { return get(); }
+
+  // Named accessor/initializer:
+  Type *get() const {
+    if (!ptr_) Initialize(this);
+    return ptr_;
+  }
+
+ public:
+  // All the data is public and LazyStaticPtr has no constructors so that we can
+  // initialize LazyStaticPtr objects with the "= { arg_value, ... }" syntax.
+  // Clients of LazyStaticPtr must not access the data members directly.
+
+  // Arguments for Type's c-tor
+  // (unused NoArg-typed arguments consume either no space, or 1 byte to
+  //  ensure address uniqueness):
+  Arg1 arg1_;
+  Arg2 arg2_;
+  Arg3 arg3_;
+
+  // The object we lazily create and hand out.
+  mutable Type *ptr_;
+
+ private:
+  template <typename A1, typename A2, typename A3>
+  static Type *Factory(const A1 &a1, const A2 &a2, const A3 &a3) {
+    return new Type(a1, a2, a3);
+  }
+
+  template <typename A1, typename A2>
+  static Type *Factory(const A1 &a1, const A2 &a2, NoArg a3) {
+    return new Type(a1, a2);
+  }
+
+  template <typename A1>
+  static Type *Factory(const A1 &a1, NoArg a2, NoArg a3) {
+    return new Type(a1);
+  }
+
+  static Type *Factory(NoArg a1, NoArg a2, NoArg a3) { return new Type(); }
+
+  static void Initialize(const LazyStaticPtr *lsp) {
+    lsp->ptr_ = Factory(lsp->arg1_, lsp->arg2_, lsp->arg3_);
+  }
+};
+
 }  // namespace utils
 }  // namespace syntaxnet
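
`LazyStaticPtr` is kept constructor-free on purpose: as an aggregate it can live in static storage, be initialized with the "= { arg_value, ... }" syntax at load time, and defer construction of `Type` to the first dereference. A usage sketch under stated assumptions (`ExpensiveTable` and its argument are invented; note also that `Initialize()` is not synchronized, so first use should happen before spawning threads):

```cpp
#include <string>

// Hypothetical payload type; anything with a matching constructor works.
struct ExpensiveTable {
  explicit ExpensiveTable(const char *name) { /* slow one-time build */ }
  int Lookup(const std::string &key) const { return 0; }
};

// Aggregate initialization: arg1_ becomes "tags"; the remaining members,
// including ptr_, are zero-initialized. No dynamic initializer runs at
// program startup.
static syntaxnet::utils::LazyStaticPtr<ExpensiveTable, const char *>
    kTable = {"tags"};

int Frequency(const std::string &key) {
  // The first dereference triggers Factory(arg1_) -> new ExpensiveTable("tags");
  // later calls reuse the same object.
  return kTable->Lookup(key);
}
```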
 

+ 2 - 0
syntaxnet/syntaxnet/workspace.h

@@ -185,6 +185,8 @@ class VectorIntWorkspace : public Workspace {
   // Sets the i'th element.
   void set_element(int i, int value) { elements_[i] = value; }
 
+  // Returns the number of elements in the workspace.
+  int size() const { return elements_.size(); }
+
  private:
   // The enclosed vector.
   vector<int> elements_;

+ 6 - 0
syntaxnet/util/utf8/unicodetext.h

@@ -462,6 +462,12 @@ inline string UnicodeTextToUTF8(const UnicodeText& t) {
   return string(t.utf8_data(), t.utf8_length());
 }
 
+// This template function declaration is used in defining arraysize.
+// Note that the function doesn't need an implementation, as we only
+// use its type.
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+#define arraysize(array) (sizeof(ArraySizeHelper(array)))
 
 // For debugging.  Return a string of integers, written in uppercase
 // hex (%X), corresponding to the codepoints within the text. Each
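
The `arraysize()` idiom added to this header works because `ArraySizeHelper` is only declared, never defined: its return type is a reference to `char[N]`, so `sizeof` applied to a call expression yields N at compile time and no object code is generated. A short sketch of what it buys over the naive `sizeof(a)/sizeof(a[0])` (illustrative):

```cpp
int codepoints[16];
static_assert(arraysize(codepoints) == 16,
              "arraysize yields the element count, not the byte count");

// Type safety: a pointer will not compile, because no array reference
// T(&)[N] can bind to it, whereas sizeof(p)/sizeof(p[0]) would silently
// compute sizeof(int*)/sizeof(int).
// int *p = codepoints;
// arraysize(p);  // error: no matching function for call to 'ArraySizeHelper'
```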

+ 0 - 4
syntaxnet/util/utf8/unicodetext_unittest.cc

@@ -25,10 +25,6 @@
 
 namespace {
 
-template <typename T, size_t N>
-char (&ArraySizeHelper(T (&array)[N]))[N];
-#define arraysize(array) (sizeof(ArraySizeHelper(array)))
-
 class UnicodeTextTest : public testing::Test {
  protected:
   UnicodeTextTest() : empty_text_() {

+ 14 - 0
syntaxnet/util/utf8/unilib_utf8_utils.h

@@ -21,6 +21,7 @@
 // They are also exported from unilib.h for legacy reasons.
 
 #include "syntaxnet/base.h"
+#include "third_party/utf/utf.h"
 
 namespace UniLib {
 
@@ -32,6 +33,19 @@ inline bool IsValidCodepoint(char32 c) {
     || (c >= 0xE000 && c <= 0x10FFFF);
 }
 
+// Returns true if 'str' is the start of a structurally valid UTF-8
+// sequence and is not a surrogate codepoint. Returns false if str.empty()
+// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function
+// will access only the first n bytes of str, where n is
+// UniLib::OneCharLen(str[0]) (between 1 and 4).
+inline bool IsUTF8ValidCodepoint(StringPiece str) {
+  char32 c;
+  int consumed;
+  // It's OK if str.length() > consumed.
+  return !str.empty()
+      && isvalidcharntorune(str.data(), str.size(), &c, &consumed)
+      && IsValidCodepoint(c);
+}
+
 // Returns the length (number of bytes) of the Unicode code point
 // starting at src, based on inspecting just that one byte. This
 // requires that src point to a well-formed UTF-8 string; the result
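
As a closing sketch (not part of the patch), the new predicate composes with `OneCharLen()` into a whole-buffer validator; the function name is invented and the include path abbreviated:

```cpp
#include "util/utf8/unilib_utf8_utils.h"

// Walks the buffer one code point at a time. IsUTF8ValidCodepoint already
// rejects truncated trailing sequences (str.length() < OneCharLen(str[0])),
// so the loop cannot read past the end of the buffer.
bool AllCodepointsValid(StringPiece text) {
  while (!text.empty()) {
    if (!UniLib::IsUTF8ValidCodepoint(text)) return false;
    text.remove_prefix(UniLib::OneCharLen(text.data()));
  }
  return true;
}
```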