9 rokov pred · 64675fc72f
--- a/syntaxnet/README.md
+++ b/syntaxnet/README.md
@@ -107,8 +107,8 @@ Bazel should complete reporting all tests passed.
 
				 You can also compile SyntaxNet in a [Docker](https://www.docker.com/what-docker)
			
 
				 container using this [Dockerfile](Dockerfile).
			
 
				 
			
 
				-**Note:** If you are running Docker on OSX, make sure that you have enough memory allocated
			
 
				-for your Docker VM.
			
 
				+**Note:** If you are running Docker on OSX, make sure that you have enough
			
 
				+memory allocated for your Docker VM.
			
 
				 
			
 
				 ## Getting Started
			
 
				 
			
@@ -612,6 +612,7 @@ Original authors of the code in this package include (in alphabetical order):
 
				 *   David Weiss
			
 
				 *   Emily Pitler
			
 
				 *   Greg Coppola
			
 
				+*   Ji Ma
			
 
				 *   Keith Hall
			
 
				 *   Kuzman Ganchev
			
 
				 *   Michael Collins
			
--- a/syntaxnet/syntaxnet/BUILD
+++ b/syntaxnet/syntaxnet/BUILD
@@ -159,6 +159,31 @@ cc_library(
 
				 )
			
 
				 
			
 
				 cc_library(
			
 
				+    name = "char_properties",
			
 
				+    srcs = ["char_properties.cc"],
			
 
				+    hdrs = ["char_properties.h"],
			
 
				+    deps = [
			
 
				+        ":registry",
			
 
				+        ":utils",
			
 
				+        "//util/utf8:unicodetext",
			
 
				+    ],
			
 
				+    alwayslink = 1,
			
 
				+)
			
 
				+
			
 
				+cc_library(
			
 
				+    name = "segmenter_utils",
			
 
				+    srcs = ["segmenter_utils.cc"],
			
 
				+    hdrs = ["segmenter_utils.h"],
			
 
				+    deps = [
			
 
				+        ":base",
			
 
				+        ":char_properties",
			
 
				+        ":sentence_proto",
			
 
				+        "//util/utf8:unicodetext",
			
 
				+    ],
			
 
				+    alwayslink = 1,
			
 
				+)
			
 
				+
			
 
				+cc_library(
			
 
				     name = "feature_extractor",
			
 
				     srcs = ["feature_extractor.cc"],
			
 
				     hdrs = [
			
@@ -199,6 +224,7 @@ cc_library(
 
				         ":affix",
			
 
				         ":feature_extractor",
			
 
				         ":registry",
			
 
				+        ":segmenter_utils",
			
 
				     ],
			
 
				 )
			
 
				 
			
@@ -251,24 +277,50 @@ cc_library(
 
				 )
			
 
				 
			
 
				 cc_library(
			
 
				+    name = "morphology_label_set",
			
 
				+    srcs = ["morphology_label_set.cc"],
			
 
				+    hdrs = ["morphology_label_set.h"],
			
 
				+    deps = [
			
 
				+        ":document_format",
			
 
				+        ":feature_extractor",
			
 
				+        ":proto_io",
			
 
				+        ":registry",
			
 
				+        ":sentence_proto",
			
 
				+        ":utils",
			
 
				+    ],
			
 
				+)
			
 
				+
			
 
				+cc_library(
			
 
				     name = "parser_transitions",
			
 
				     srcs = [
			
 
				         "arc_standard_transitions.cc",
			
 
				+        "binary_segment_state.cc",
			
 
				+        "binary_segment_transitions.cc",
			
 
				+        "morpher_transitions.cc",
			
 
				+        "parser_features.cc",
			
 
				         "parser_state.cc",
			
 
				         "parser_transitions.cc",
			
 
				         "tagger_transitions.cc",
			
 
				     ],
			
 
				     hdrs = [
			
 
				+        "binary_segment_state.h",
			
 
				+        "parser_features.h",
			
 
				         "parser_state.h",
			
 
				         "parser_transitions.h",
			
 
				     ],
			
 
				     deps = [
			
 
				+        ":affix",
			
 
				+        ":feature_extractor",
			
 
				         ":kbest_syntax_proto",
			
 
				+        ":morphology_label_set",
			
 
				         ":registry",
			
 
				+        ":segmenter_utils",
			
 
				+        ":sentence_features",
			
 
				         ":sentence_proto",
			
 
				         ":shared_store",
			
 
				         ":task_context",
			
 
				         ":term_frequency_map",
			
 
				+        ":workspace",
			
 
				     ],
			
 
				     alwayslink = 1,
			
 
				 )
			
@@ -289,29 +341,11 @@ cc_library(
 
				 )
			
 
				 
			
 
				 cc_library(
			
 
				-    name = "parser_features",
			
 
				-    srcs = ["parser_features.cc"],
			
 
				-    hdrs = ["parser_features.h"],
			
 
				-    deps = [
			
 
				-        ":affix",
			
 
				-        ":feature_extractor",
			
 
				-        ":parser_transitions",
			
 
				-        ":registry",
			
 
				-        ":sentence_features",
			
 
				-        ":task_context",
			
 
				-        ":term_frequency_map",
			
 
				-        ":workspace",
			
 
				-    ],
			
 
				-    alwayslink = 1,
			
 
				-)
			
 
				-
			
 
				-cc_library(
			
 
				     name = "embedding_feature_extractor",
			
 
				     srcs = ["embedding_feature_extractor.cc"],
			
 
				     hdrs = ["embedding_feature_extractor.h"],
			
 
				     deps = [
			
 
				         ":feature_extractor",
			
 
				-        ":parser_features",
			
 
				         ":parser_transitions",
			
 
				         ":sparse_proto",
			
 
				         ":task_context",
			
@@ -326,7 +360,6 @@ cc_library(
 
				     deps = [
			
 
				         ":embedding_feature_extractor",
			
 
				         ":feature_extractor",
			
 
				-        ":parser_features",
			
 
				         ":parser_transitions",
			
 
				         ":sentence_proto",
			
 
				         ":sparse_proto",
			
@@ -344,7 +377,6 @@ cc_library(
 
				         "reader_ops.cc",
			
 
				     ],
			
 
				     deps = [
			
 
				-        ":parser_features",
			
 
				         ":parser_transitions",
			
 
				         ":sentence_batch",
			
 
				         ":sentence_proto",
			
@@ -360,7 +392,6 @@ cc_library(
 
				     srcs = ["document_filters.cc"],
			
 
				     deps = [
			
 
				         ":document_format",
			
 
				-        ":parser_features",
			
 
				         ":parser_transitions",
			
 
				         ":sentence_batch",
			
 
				         ":sentence_proto",
			
@@ -376,8 +407,8 @@ cc_library(
 
				     deps = [
			
 
				         ":dictionary_proto",
			
 
				         ":document_format",
			
 
				-        ":parser_features",
			
 
				         ":parser_transitions",
			
 
				+        ":segmenter_utils",
			
 
				         ":sentence_batch",
			
 
				         ":sentence_proto",
			
 
				         ":task_context",
			
@@ -439,6 +470,18 @@ filegroup(
 
				 )
			
 
				 
			
 
				 cc_test(
			
 
				+    name = "binary_segment_state_test",
			
 
				+    size = "small",
			
 
				+    srcs = ["binary_segment_state_test.cc"],
			
 
				+    deps = [
			
 
				+        ":base",
			
 
				+        ":parser_transitions",
			
 
				+        ":term_frequency_map",
			
 
				+        ":test_main",
			
 
				+    ],
			
 
				+)
			
 
				+
			
 
				+cc_test(
			
 
				     name = "shared_store_test",
			
 
				     size = "small",
			
 
				     srcs = ["shared_store_test.cc"],
			
@@ -449,6 +492,26 @@ cc_test(
 
				 )
			
 
				 
			
 
				 cc_test(
			
 
				+    name = "char_properties_test",
			
 
				+    srcs = ["char_properties_test.cc"],
			
 
				+    deps = [
			
 
				+        ":char_properties",
			
 
				+        ":test_main",
			
 
				+    ],
			
 
				+)
			
 
				+
			
 
				+cc_test(
			
 
				+    name = "segmenter_utils_test",
			
 
				+    srcs = ["segmenter_utils_test.cc"],
			
 
				+    deps = [
			
 
				+        ":base",
			
 
				+        ":segmenter_utils",
			
 
				+        ":sentence_proto",
			
 
				+        ":test_main",
			
 
				+    ],
			
 
				+)
			
 
				+
			
 
				+cc_test(
			
 
				     name = "sentence_features_test",
			
 
				     size = "medium",
			
 
				     srcs = ["sentence_features_test.cc"],
			
@@ -466,6 +529,15 @@ cc_test(
 
				 )
			
 
				 
			
 
				 cc_test(
			
 
				+    name = "morphology_label_set_test",
			
 
				+    srcs = ["morphology_label_set_test.cc"],
			
 
				+    deps = [
			
 
				+        ":morphology_label_set",
			
 
				+        ":test_main",
			
 
				+    ],
			
 
				+)
			
 
				+
			
 
				+cc_test(
			
 
				     name = "arc_standard_transitions_test",
			
 
				     size = "small",
			
 
				     srcs = ["arc_standard_transitions_test.cc"],
			
@@ -480,6 +552,17 @@ cc_test(
 
				 )
			
 
				 
			
 
				 cc_test(
			
 
				+    name = "binary_segment_transitions_test",
			
 
				+    size = "small",
			
 
				+    srcs = ["binary_segment_transitions_test.cc"],
			
 
				+    deps = [
			
 
				+        ":parser_transitions",
			
 
				+        ":sentence_proto",
			
 
				+        ":test_main",
			
 
				+    ],
			
 
				+)
			
 
				+
			
 
				+cc_test(
			
 
				     name = "tagger_transitions_test",
			
 
				     size = "small",
			
 
				     srcs = ["tagger_transitions_test.cc"],
			
@@ -499,7 +582,6 @@ cc_test(
 
				     srcs = ["parser_features_test.cc"],
			
 
				     deps = [
			
 
				         ":feature_extractor",
			
 
				-        ":parser_features",
			
 
				         ":parser_transitions",
			
 
				         ":populate_test_inputs",
			
 
				         ":sentence_proto",
			
--- a/syntaxnet/syntaxnet/binary_segment_state.cc
+++ b/syntaxnet/syntaxnet/binary_segment_state.cc
@@ -0,0 +1,102 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+#include "syntaxnet/binary_segment_state.h"
			
 
				+
			
 
				+#include <string>
			
 
				+#include "syntaxnet/segmenter_utils.h"
			
 
				+#include "syntaxnet/sentence.pb.h"
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+ParserTransitionState *BinarySegmentState::Clone() const {
			
 
				+  return new BinarySegmentState();
			
 
				+}
			
 
				+
			
 
				+string BinarySegmentState::ToString(const ParserState &state) const {
			
 
				+  string str("[");
			
 
				+  for (int i = NumStarts(state) - 1; i >=0; --i) {
			
 
				+    int start = LastStart(i, state);
			
 
				+    int end = 0;
			
 
				+    if (i - 1 >= 0) {
			
 
				+      end = LastStart(i - 1, state) - 1;
			
 
				+    } else if (state.EndOfInput()) {
			
 
				+      end = state.sentence().token_size() - 1;
			
 
				+    } else {
			
 
				+      end = state.Next() - 1;
			
 
				+    }
			
 
				+    for (int k = start; k <= end; ++k) {
			
 
				+      str.append(state.GetToken(k).word());
			
 
				+    }
			
 
				+    if (i >= 1) str.append(" ");
			
 
				+  }
			
 
				+
			
 
				+  str.append("] ");
			
 
				+  for (int i = state.Next(); i < state.NumTokens(); ++i) {
			
 
				+    str.append(state.GetToken(i).word());
			
 
				+  }
			
 
				+  return str;
			
 
				+}
			
 
				+
			
 
				+void BinarySegmentState::AddParseToDocument(const ParserState &state,
			
 
				+                                            bool rewrite_root_labels,
			
 
				+                                            Sentence *sentence) const {
			
 
				+  if (sentence->token_size() == 0) return;
			
 
				+  vector<bool> is_starts(sentence->token_size(), false);
			
 
				+  for (int i = 0; i < NumStarts(state); ++i) {
			
 
				+    is_starts[LastStart(i, state)] = true;
			
 
				+  }
			
 
				+
			
 
				+  // Break level of the current token is determined based on its previous token.
			
 
				+  Token::BreakLevel break_level = Token::NO_BREAK;
			
 
				+  bool is_first_token = true;
			
 
				+  Sentence new_sentence;
			
 
				+  for (int i = 0; i < sentence->token_size(); ++i) {
			
 
				+    const Token &token = sentence->token(i);
			
 
				+    const string &word = token.word();
			
 
				+    bool is_break = SegmenterUtils::IsBreakChar(word);
			
 
				+    if (is_starts[i] || is_first_token) {
			
 
				+      if (!is_break) {
			
 
				+        // The current character is the first char of a new token/word.
			
 
				+        Token *new_token = new_sentence.add_token();
			
 
				+        new_token->set_start(token.start());
			
 
				+        new_token->set_end(token.end());
			
 
				+        new_token->set_word(word);
			
 
				+
			
 
				+        // For the first token, keep the old break level to make sure that the
			
 
				+        // number of sentences stays unchanged.
			
 
				+        new_token->set_break_level(break_level);
			
 
				+        is_first_token = false;
			
 
				+      }
			
 
				+    } else {
			
 
				+      // Append the character to the previous token.
			
 
				+      if (!is_break) {
			
 
				+        int index = new_sentence.token_size() - 1;
			
 
				+        auto *last_token = new_sentence.mutable_token(index);
			
 
				+        last_token->mutable_word()->append(word);
			
 
				+        last_token->set_end(token.end());
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    // Update break level. Note we do not introduce new sentences in the
			
 
				+    // transition system, thus anything beyond a line break would be reduced
			
 
				+    // to line break.
			
 
				+    break_level = is_break ? SegmenterUtils::BreakLevel(word) : Token::NO_BREAK;
			
 
				+    if (break_level >= Token::LINE_BREAK) break_level = Token::LINE_BREAK;
			
 
				+  }
			
 
				+  sentence->mutable_token()->Swap(new_sentence.mutable_token());
			
 
				+}
			
 
				+
			
 
				+}  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/binary_segment_state.h
+++ b/syntaxnet/syntaxnet/binary_segment_state.h
@@ -0,0 +1,99 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+#ifndef SYNTAXNET_BINARY_SEGMENT_STATE_H_
			
 
				+#define SYNTAXNET_BINARY_SEGMENT_STATE_H_
			
 
				+
			
 
				+#include "syntaxnet/parser_state.h"
			
 
				+#include "syntaxnet/parser_transitions.h"
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+class Sentence;
			
 
				+
			
 
				+// Parser state for binary segmentation transition system. The input of the
			
 
				+// system is a sequence of utf8 characters that are to be segmented into tokens.
			
 
				+// The system contains two types of transitions/actions:
			
 
				+//  -START: the token at input is the first character of a new word.
			
 
				+//  -MERGE: the token at input is to be merged with its previous token.
			
 
				+//
			
 
				+// A BinarySegmentState is used to store segmentation histories that can be used
			
 
				+// as features. In addition, it also provides the functionality to add
			
 
				+// segmentation results to the document. The function assumes that sentences in
			
 
				+// a document are processed in left-to-right order. See also the comments of
			
 
				+// the FinishDocument function for explanation.
			
 
				+//
			
 
				+// Note on spaces:
			
 
				+// Spaces, or more generally break-characters, should never be any part of a
			
 
				+// word, and the START/MERGE of spaces would be ignored. In addition, if a space
			
 
				+// starts a new word, then the actual first char of that word is the first
			
 
				+// non-space token following the space.
			
 
				+// Some examples:
			
 
				+//  -chars:  ' ' A B
			
 
				+//  -tags:    S  M M
			
 
				+//  -result: 'AB'
			
 
				+//
			
 
				+//  -chars:  A ' ' B
			
 
				+//  -tags:   S  M  M
			
 
				+//  -result: 'AB'
			
 
				+//
			
 
				+//  -chars:  A ' ' B
			
 
				+//  -tags:   S  S  M
			
 
				+//  -result: 'AB'
			
 
				+//
			
 
				+//  -chars:  A  B  ' '
			
 
				+//  -tags:   S  S  M
			
 
				+//  -result: 'A', 'B'
			
 
				+class BinarySegmentState : public ParserTransitionState {
			
 
				+ public:
			
 
				+  ParserTransitionState *Clone() const override;
			
 
				+  void Init(ParserState *state) override {}
			
 
				+
			
 
				+  // Returns the number of start tokens that have already been identified. In
			
 
				+  // other words, number of start tokens between the first token of the sentence
			
 
				+  // and state.Input(), with state.Input() excluded.
			
 
				+  static int NumStarts(const ParserState &state) {
			
 
				+    return state.StackSize();
			
 
				+  }
			
 
				+
			
 
				+  // Returns the index of the k-th most recent start token.
			
 
				+  static int LastStart(int k, const ParserState &state) {
			
 
				+    DCHECK_GE(k, 0);
			
 
				+    DCHECK_LT(k, NumStarts(state));
			
 
				+    return state.Stack(k);
			
 
				+  }
			
 
				+
			
 
				+  // Adds the token at given index as a new start token.
			
 
				+  static void AddStart(int index, ParserState *state) {
			
 
				+    state->Push(index);
			
 
				+  }
			
 
				+
			
 
				+  // Adds segmentation results to the given sentence.
			
 
				+  void AddParseToDocument(const ParserState &state,
			
 
				+                          bool rewrite_root_labels,
			
 
				+                          Sentence *sentence) const override;
			
 
				+
			
 
				+  // Whether a parsed token should be considered correct for evaluation.
			
 
				+  bool IsTokenCorrect(const ParserState &state, int index) const override {
			
 
				+    return true;
			
 
				+  }
			
 
				+
			
 
				+  // Returns a human readable string representation of this state.
			
 
				+  string ToString(const ParserState &state) const override;
			
 
				+};
			
 
				+
			
 
				+}  // namespace syntaxnet
			
 
				+
			
 
				+#endif  // SYNTAXNET_BINARY_SEGMENT_STATE_H_
			
--- a/syntaxnet/syntaxnet/binary_segment_state_test.cc
+++ b/syntaxnet/syntaxnet/binary_segment_state_test.cc
@@ -0,0 +1,218 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+#include "syntaxnet/binary_segment_state.h"
			
 
				+
			
 
				+#include <memory>
			
 
				+
			
 
				+#include "syntaxnet/base.h"
			
 
				+#include "syntaxnet/sentence.pb.h"
			
 
				+#include "syntaxnet/term_frequency_map.h"
			
 
				+#include "tensorflow/core/platform/test.h"
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+class BinarySegmentStateTest : public ::testing::Test {
			
 
				+ protected:
			
 
				+  void SetUp() override {
			
 
				+    // Prepare a sentence.
			
 
				+    const char *str_sentence = "text: '测试 的 句子' "
			
 
				+        "token { word: '测' start: 0 end: 2 } "
			
 
				+        "token { word: '试' start: 3 end: 5 } "
			
 
				+        "token { word: ' ' start: 6 end: 6 } "
			
 
				+        "token { word: '的' start: 7 end: 9 } "
			
 
				+        "token { word: ' ' start: 10 end: 10 } "
			
 
				+        "token { word: '句' start: 11 end: 13 } "
			
 
				+        "token { word: '子' start: 14 end: 16 } ";
			
 
				+    sentence_ = std::unique_ptr<Sentence>(new Sentence());
			
 
				+    TextFormat::ParseFromString(str_sentence, sentence_.get());
			
 
				+  }
			
 
				+
			
 
				+  // The test document, parse tree, and sentence.
			
 
				+  std::unique_ptr<Sentence> sentence_;
			
 
				+  TermFrequencyMap label_map_;
			
 
				+};
			
 
				+
			
 
				+TEST_F(BinarySegmentStateTest, AddStartLastStartNumStartsTest) {
			
 
				+  BinarySegmentState *segment_state = new BinarySegmentState();
			
 
				+  ParserState state(sentence_.get(), segment_state, &label_map_);
			
 
				+
			
 
				+  // Test segment_state initialized with zero starts.
			
 
				+  EXPECT_EQ(0, segment_state->NumStarts(state));
			
 
				+
			
 
				+  // Adding the first token as a start token.
			
 
				+  segment_state->AddStart(0, &state);
			
 
				+  ASSERT_EQ(1, segment_state->NumStarts(state));
			
 
				+  EXPECT_EQ(0, segment_state->LastStart(0, state));
			
 
				+
			
 
				+  // Adding more starts.
			
 
				+  segment_state->AddStart(2, &state);
			
 
				+  segment_state->AddStart(3, &state);
			
 
				+  segment_state->AddStart(4, &state);
			
 
				+  segment_state->AddStart(5, &state);
			
 
				+  ASSERT_EQ(5, segment_state->NumStarts(state));
			
 
				+  EXPECT_EQ(5, segment_state->LastStart(0, state));
			
 
				+  EXPECT_EQ(4, segment_state->LastStart(1, state));
			
 
				+  EXPECT_EQ(3, segment_state->LastStart(2, state));
			
 
				+  EXPECT_EQ(2, segment_state->LastStart(3, state));
			
 
				+  EXPECT_EQ(0, segment_state->LastStart(4, state));
			
 
				+}
			
 
				+
			
 
				+TEST_F(BinarySegmentStateTest, AddParseToDocumentTest) {
			
 
				+  BinarySegmentState *segment_state = new BinarySegmentState();
			
 
				+  ParserState state(sentence_.get(), segment_state, &label_map_);
			
 
				+
			
 
				+  // Test gold segmentation.
			
 
				+  // 0   1   2    3   4   5   6
			
 
				+  // 测  试  ' '  的  ' '  句  子
			
 
				+  // S   M   S    S   S   S   M
			
 
				+  segment_state->AddStart(0, &state);
			
 
				+  segment_state->AddStart(2, &state);
			
 
				+  segment_state->AddStart(3, &state);
			
 
				+  segment_state->AddStart(4, &state);
			
 
				+  segment_state->AddStart(5, &state);
			
 
				+  Sentence sentence_with_annotation = *sentence_;
			
 
				+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
			
 
				+
			
 
				+  // Test the number of tokens as well as the start/end byte-offsets of each
			
 
				+  // token.
			
 
				+  ASSERT_EQ(3, sentence_with_annotation.token_size());
			
 
				+
			
 
				+  // The first token is 测试.
			
 
				+  EXPECT_EQ(0, sentence_with_annotation.token(0).start());
			
 
				+  EXPECT_EQ(5, sentence_with_annotation.token(0).end());
			
 
				+
			
 
				+  // The second token is 的.
			
 
				+  EXPECT_EQ(7, sentence_with_annotation.token(1).start());
			
 
				+  EXPECT_EQ(9, sentence_with_annotation.token(1).end());
			
 
				+
			
 
				+  // The third token is 句子.
			
 
				+  EXPECT_EQ(11, sentence_with_annotation.token(2).start());
			
 
				+  EXPECT_EQ(16, sentence_with_annotation.token(2).end());
			
 
				+
			
 
				+  // Test merge space to other tokens. Since spaces, or more generally break
			
 
				+  // characters, should never be a part of any word, they are skipped no matter
			
 
				+  // how they are tagged.
			
 
				+  // 0   1   2    3   4   5   6
			
 
				+  // 测  试  ' '  的  ' '  句  子
			
 
				+  // S   M   M    S   M   M   M
			
 
				+  while (!state.StackEmpty()) state.Pop();
			
 
				+  segment_state->AddStart(0, &state);
			
 
				+  segment_state->AddStart(3, &state);
			
 
				+  sentence_with_annotation = *sentence_;
			
 
				+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
			
 
				+
			
 
				+  ASSERT_EQ(2, sentence_with_annotation.token_size());
			
 
				+
			
 
				+  // The first token is 测试. Even if a space is tagged as "merge", it is not
			
 
				+  // attached to its previous word.
			
 
				+  EXPECT_EQ(0, sentence_with_annotation.token(0).start());
			
 
				+  EXPECT_EQ(5, sentence_with_annotation.token(0).end());
			
 
				+
			
 
				+  // The second token is 的句子.
			
 
				+  EXPECT_EQ(7, sentence_with_annotation.token(1).start());
			
 
				+  EXPECT_EQ(16, sentence_with_annotation.token(1).end());
			
 
				+
			
 
				+  // Test merge a token to space tokens. In such case, the current token would
			
 
				+  // be merged to the first non-space token on its left side.
			
 
				+  // 0   1   2    3   4   5   6
			
 
				+  // 测  试  ' '  的  ' '  句  子
			
 
				+  // S   M   S    M   S   M   M
			
 
				+  while (!state.StackEmpty()) state.Pop();
			
 
				+  segment_state->AddStart(0, &state);
			
 
				+  segment_state->AddStart(2, &state);
			
 
				+  segment_state->AddStart(4, &state);
			
 
				+  sentence_with_annotation = *sentence_;
			
 
				+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
			
 
				+  ASSERT_EQ(1, sentence_with_annotation.token_size());
			
 
				+  EXPECT_EQ(0, sentence_with_annotation.token(0).start());
			
 
				+  EXPECT_EQ(16, sentence_with_annotation.token(0).end());
			
 
				+}
			
 
				+
			
 
				+TEST_F(BinarySegmentStateTest, SpaceDocumentTest) {
			
 
				+  const char *str_sentence = "text: ' \t\t' "
			
 
				+      "token { word: ' ' start: 0 end: 0 } "
			
 
				+      "token { word: '\t' start: 1 end: 1 } "
			
 
				+      "token { word: '\t' start: 2 end: 2 } ";
			
 
				+  TextFormat::ParseFromString(str_sentence, sentence_.get());
			
 
				+  BinarySegmentState *segment_state = new BinarySegmentState();
			
 
				+  ParserState state(sentence_.get(), segment_state, &label_map_);
			
 
				+
			
 
				+  // Break-chars should always be skipped, no matter how they are tagged.
			
 
				+  // 0    1     2
			
 
				+  //' '   '\t'  '\t'
			
 
				+  // M    M     M
			
 
				+  Sentence sentence_with_annotation = *sentence_;
			
 
				+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
			
 
				+  ASSERT_EQ(0, sentence_with_annotation.token_size());
			
 
				+
			
 
				+  // 0    1     2
			
 
				+  //' '   '\t'  '\t'
			
 
				+  // S    S     S
			
 
				+  segment_state->AddStart(0, &state);
			
 
				+  segment_state->AddStart(1, &state);
			
 
				+  segment_state->AddStart(2, &state);
			
 
				+  sentence_with_annotation = *sentence_;
			
 
				+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
			
 
				+  ASSERT_EQ(0, sentence_with_annotation.token_size());
			
 
				+}
			
 
				+
			
 
				+TEST_F(BinarySegmentStateTest, DocumentBeginWithSpaceTest) {
			
 
				+  const char *str_sentence = "text: ' 空格' "
			
 
				+      "token { word: ' ' start: 0 end: 0 } "
			
 
				+      "token { word: '空' start: 1 end: 3 } "
			
 
				+      "token { word: '格' start: 4 end: 6 } ";
			
 
				+  TextFormat::ParseFromString(str_sentence, sentence_.get());
			
 
				+  BinarySegmentState *segment_state = new BinarySegmentState();
			
 
				+  ParserState state(sentence_.get(), segment_state, &label_map_);
			
 
				+
			
 
				+  // 0    1    2
			
 
				+  //' '   空   格
			
 
				+  // M    M    M
			
 
				+  Sentence sentence_with_annotation = *sentence_;
			
 
				+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
			
 
				+
			
 
				+  ASSERT_EQ(1, sentence_with_annotation.token_size());
			
 
				+
			
 
				+  // The first token is 空格.
			
 
				+  EXPECT_EQ(1, sentence_with_annotation.token(0).start());
			
 
				+  EXPECT_EQ(6, sentence_with_annotation.token(0).end());
			
 
				+
			
 
				+  // 0    1    2
			
 
				+  //' '   空   格
			
 
				+  // S    M    M
			
 
				+  while (!state.StackEmpty()) state.Pop();
			
 
				+  segment_state->AddStart(0, &state);
			
 
				+  sentence_with_annotation = *sentence_;
			
 
				+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
			
 
				+
			
 
				+  ASSERT_EQ(1, sentence_with_annotation.token_size());
			
 
				+
			
 
				+  // The first token is 空格.
			
 
				+  EXPECT_EQ(1, sentence_with_annotation.token(0).start());
			
 
				+  EXPECT_EQ(6, sentence_with_annotation.token(0).end());
			
 
				+}
			
 
				+
			
 
				+TEST_F(BinarySegmentStateTest, EmptyDocumentTest) {
			
 
				+  const char *str_sentence = "text: '' ";
			
 
				+  TextFormat::ParseFromString(str_sentence, sentence_.get());
			
 
				+  BinarySegmentState *segment_state = new BinarySegmentState();
			
 
				+  ParserState state(sentence_.get(), segment_state, &label_map_);
			
 
				+  Sentence sentence_with_annotation = *sentence_;
			
 
				+  segment_state->AddParseToDocument(state, false, &sentence_with_annotation);
			
 
				+  ASSERT_EQ(0, sentence_with_annotation.token_size());
			
 
				+}
			
 
				+
			
 
				+}  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/binary_segment_transitions.cc
+++ b/syntaxnet/syntaxnet/binary_segment_transitions.cc
@@ -0,0 +1,121 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+#include "syntaxnet/binary_segment_state.h"
			
 
				+#include "syntaxnet/parser_state.h"
			
 
				+#include "syntaxnet/parser_transitions.h"
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+// Given an input of utf8 characters, the BinarySegmentTransitionSystem
			
 
				+// conducts word segmentation by performing one of the following two actions:
			
 
				+//  -START: starts a new word with the token at state.input, and also advances
			
 
				+//          the state.input.
			
 
				+//  -MERGE: adds the token at state.input to its previous word, and also
			
 
				+//          advances state.input.
			
 
				+//
			
 
				+// Also see syntaxnet/binary_segment_state.h for examples on handling spaces.
			
 
				+class BinarySegmentTransitionSystem : public ParserTransitionSystem {
			
 
				+ public:
			
 
				+  BinarySegmentTransitionSystem() {}
			
 
				+
			
 
				+  // Returns a freshly allocated BinarySegmentState; train_mode is not used.
			
 
				+  ParserTransitionState *NewTransitionState(bool train_mode) const override {
			
 
				+    return new BinarySegmentState();
			
 
				+  }
			
 
				+
			
 
				+  // Action types for the segmentation-transition system.
			
 
				+  enum ParserActionType {
			
 
				+    START = 0,
			
 
				+    MERGE = 1,
			
 
				+    CARDINAL = 2
			
 
				+  };
			
 
				+
			
 
				+  // Action ids as plain ints, taken from the enum above rather than repeating
			
 
				+  // its numeric values.
			
 
				+  static int StartAction() { return START; }
			
 
				+  static int MergeAction() { return MERGE; }
			
 
				+
			
 
				+  // The system always starts a new word by default.
			
 
				+  ParserAction GetDefaultAction(const ParserState &state) const override {
			
 
				+    return StartAction();
			
 
				+  }
			
 
				+
			
 
				+  // Returns the number of action types.
			
 
				+  int NumActionTypes() const override {
			
 
				+    return CARDINAL;
			
 
				+  }
			
 
				+
			
 
				+  // Returns the number of possible actions. num_labels is not used: the count
			
 
				+  // always equals the number of action types.
			
 
				+  int NumActions(int num_labels) const override {
			
 
				+    return CARDINAL;
			
 
				+  }
			
 
				+
			
 
				+  // Returns the next gold action for a given state according to the underlying
			
 
				+  // annotated sentence. The training data for the transition system is created
			
 
				+  // by the binary-segmenter-data task. If a token's break_level is NO_BREAK,
			
 
				+  // then it is a MERGE, START otherwise. The only exception is that the first
			
 
				+  // token in a sentence for the transition system is always a START.
			
 
				+  ParserAction GetNextGoldAction(const ParserState &state) const override {
			
 
				+    if (state.Next() == 0) return StartAction();
			
 
				+    const Token &token = state.GetToken(state.Next());
			
 
				+    return (token.break_level() != Token::NO_BREAK ?
			
 
				+        StartAction() : MergeAction());
			
 
				+  }
			
 
				+
			
 
				+  // Both START and MERGE can be applied to any tokens in the sentence.
			
 
				+  bool IsAllowedAction(
			
 
				+      ParserAction action, const ParserState &state) const override {
			
 
				+    return true;
			
 
				+  }
			
 
				+
			
 
				+  // Performs the specified action on a given parser state, without adding the
			
 
				+  // action to the state's history. A START (or any negative action id) records
			
 
				+  // the current input index through BinarySegmentState::AddStart; every action
			
 
				+  // then advances the input by one token.
			
 
				+  void PerformActionWithoutHistory(
			
 
				+      ParserAction action, ParserState *state) const override {
			
 
				+    if (ActsAsStart(action)) {
			
 
				+      MutableTransitionState(state)->AddStart(state->Next(), state);
			
 
				+    }
			
 
				+    state->Advance();
			
 
				+  }
			
 
				+
			
 
				+  // Allows backoff to best allowable transition.
			
 
				+  bool BackOffToBestAllowableTransition() const override { return true; }
			
 
				+
			
 
				+  // A state is a deterministic state iff no tokens have been consumed.
			
 
				+  bool IsDeterministicState(const ParserState &state) const override {
			
 
				+    return state.Next() == 0;
			
 
				+  }
			
 
				+
			
 
				+  // For binary segmentation, a state is a final state iff all tokens have been
			
 
				+  // consumed.
			
 
				+  bool IsFinalState(const ParserState &state) const override {
			
 
				+    return state.EndOfInput();
			
 
				+  }
			
 
				+
			
 
				+  // Returns a string representation of a parser action. Negative action ids
			
 
				+  // are reported as "START" because that is how they are executed.
			
 
				+  string ActionAsString(
			
 
				+      ParserAction action, const ParserState &state) const override {
			
 
				+    return ActsAsStart(action) ? "START" : "MERGE";
			
 
				+  }
			
 
				+
			
 
				+  // Downcasts the TransitionState in ParserState to a BinarySegmentState.
			
 
				+  static BinarySegmentState *MutableTransitionState(ParserState *state) {
			
 
				+    return static_cast<BinarySegmentState *>(state->mutable_transition_state());
			
 
				+  }
			
 
				+
			
 
				+ private:
			
 
				+  // True for the START id and for any negative id; shared by
			
 
				+  // PerformActionWithoutHistory and ActionAsString so the two stay in agreement.
			
 
				+  static bool ActsAsStart(ParserAction action) {
			
 
				+    return action < 0 || action == StartAction();
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+// Registers the class under the name "binary-segment-transitions"; presumably
			
 
				+// this is the name ParserTransitionSystem::Create() resolves - confirm in
			
 
				+// parser_transitions.h.
			
 
				+REGISTER_TRANSITION_SYSTEM("binary-segment-transitions",
			
 
				+                           BinarySegmentTransitionSystem);
			
 
				+
			
 
				+}  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/binary_segment_transitions_test.cc
+++ b/syntaxnet/syntaxnet/binary_segment_transitions_test.cc
@@ -0,0 +1,111 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+#include "syntaxnet/binary_segment_state.h"
			
 
				+#include "syntaxnet/parser_state.h"
			
 
				+#include "syntaxnet/parser_transitions.h"
			
 
				+#include "syntaxnet/term_frequency_map.h"
			
 
				+#include "tensorflow/core/platform/test.h"
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+// Test fixture for the "binary-segment-transitions" system. SetUp() creates the
			
 
				+// system by name and parses a seven-token sentence that includes two space
			
 
				+// tokens.
			
 
				+class SegmentationTransitionTest : public ::testing::Test {
			
 
				+ protected:
			
 
				+  void SetUp() override {
			
 
				+    transition_system_.reset(
			
 
				+        ParserTransitionSystem::Create("binary-segment-transitions"));
			
 
				+
			
 
				+    // Prepare a sentence.
			
 
				+    const char *str_sentence = "text: '因为 有 这样' "
			
 
				+        "token { word: '因' start: 0 end: 2 break_level: SPACE_BREAK } "
			
 
				+        "token { word: '为' start: 3 end: 5 break_level: NO_BREAK } "
			
 
				+        "token { word: ' ' start: 6 end: 6 break_level: SPACE_BREAK } "
			
 
				+        "token { word: '有' start: 7 end: 9 break_level: SPACE_BREAK } "
			
 
				+        "token { word: ' ' start: 10 end: 10 break_level: SPACE_BREAK } "
			
 
				+        "token { word: '这' start: 11 end: 13 break_level: SPACE_BREAK } "
			
 
				+        "token { word: '样' start: 14 end: 16 break_level: NO_BREAK } ";
			
 
				+    sentence_.reset(new Sentence());
			
 
				+    TextFormat::ParseFromString(str_sentence, sentence_.get());
			
 
				+  }
			
 
				+
			
 
				+  // Asserts that the parser stack holds exactly the values in |target|,
			
 
				+  // compared position by position from Stack(0) upwards.
			
 
				+  void CheckStarts(const ParserState &state, const vector<int> &target) {
			
 
				+    // Cast so both sides of the comparison are signed.
			
 
				+    ASSERT_EQ(state.StackSize(), static_cast<int>(target.size()));
			
 
				+    for (int i = 0; i < state.StackSize(); ++i) {
			
 
				+      EXPECT_EQ(state.Stack(i), target[i]);
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // The sentence under test, the transition system being exercised, and a
			
 
				+  // default-constructed label map that is handed to each ParserState.
			
 
				+  std::unique_ptr<Sentence> sentence_;
			
 
				+  std::unique_ptr<ParserTransitionSystem> transition_system_;
			
 
				+  TermFrequencyMap label_map_;
			
 
				+};
			
 
				+
			
 
				+// Following the gold actions must reproduce the segmentation encoded by the
			
 
				+// break levels of the fixture sentence.
			
 
				+TEST_F(SegmentationTransitionTest, GoldNextActionTest) {
			
 
				+  BinarySegmentState *gold_state = static_cast<BinarySegmentState *>(
			
 
				+      transition_system_->NewTransitionState(true));
			
 
				+  ParserState parser_state(sentence_.get(), gold_state, &label_map_);
			
 
				+
			
 
				+  // Apply the gold action at every step until a final state is reached.
			
 
				+  while (!transition_system_->IsFinalState(parser_state)) {
			
 
				+    transition_system_->PerformActionWithoutHistory(
			
 
				+        transition_system_->GetNextGoldAction(parser_state), &parser_state);
			
 
				+  }
			
 
				+
			
 
				+  // Test STARTs.
			
 
				+  CheckStarts(parser_state, {5, 4, 3, 2, 0});
			
 
				+
			
 
				+  // Write the segmentation back, then check the word and the start/end
			
 
				+  // annotation of each resulting token.
			
 
				+  gold_state->AddParseToDocument(parser_state, false, sentence_.get());
			
 
				+  ASSERT_EQ(sentence_->token_size(), 3);
			
 
				+
			
 
				+  EXPECT_EQ(sentence_->token(0).word(), "因为");
			
 
				+  EXPECT_EQ(sentence_->token(0).start(), 0);
			
 
				+  EXPECT_EQ(sentence_->token(0).end(), 5);
			
 
				+
			
 
				+  EXPECT_EQ(sentence_->token(1).word(), "有");
			
 
				+  EXPECT_EQ(sentence_->token(1).start(), 7);
			
 
				+  EXPECT_EQ(sentence_->token(1).end(), 9);
			
 
				+
			
 
				+  EXPECT_EQ(sentence_->token(2).word(), "这样");
			
 
				+  EXPECT_EQ(sentence_->token(2).start(), 11);
			
 
				+  EXPECT_EQ(sentence_->token(2).end(), 16);
			
 
				+}
			
 
				+
			
 
				+// Applying only the default action must make every input token a START.
			
 
				+TEST_F(SegmentationTransitionTest, DefaultActionTest) {
			
 
				+  BinarySegmentState *default_state = static_cast<BinarySegmentState *>(
			
 
				+      transition_system_->NewTransitionState(true));
			
 
				+  ParserState parser_state(sentence_.get(), default_state, &label_map_);
			
 
				+
			
 
				+  // Do segmentation by following the default actions.
			
 
				+  while (!transition_system_->IsFinalState(parser_state)) {
			
 
				+    transition_system_->PerformActionWithoutHistory(
			
 
				+        transition_system_->GetDefaultAction(parser_state), &parser_state);
			
 
				+  }
			
 
				+
			
 
				+  // Every character should be START.
			
 
				+  CheckStarts(parser_state, {6, 5, 4, 3, 2, 1, 0});
			
 
				+
			
 
				+  // Every non-space character should be a word.
			
 
				+  default_state->AddParseToDocument(parser_state, false, sentence_.get());
			
 
				+  ASSERT_EQ(sentence_->token_size(), 5);
			
 
				+  EXPECT_EQ(sentence_->token(0).word(), "因");
			
 
				+  EXPECT_EQ(sentence_->token(1).word(), "为");
			
 
				+  EXPECT_EQ(sentence_->token(2).word(), "有");
			
 
				+  EXPECT_EQ(sentence_->token(3).word(), "这");
			
 
				+  EXPECT_EQ(sentence_->token(4).word(), "样");
			
 
				+}
			
 
				+
			
 
				+}  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/char_properties.cc
+++ b/syntaxnet/syntaxnet/char_properties.cc
@@ -0,0 +1,845 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+// char_properties.cc - define is_X() tests for various character properties
			
 
				+//
			
 
				+// See char_properties.h for how to write a character property.
			
 
				+//
			
 
				+// References for the char sets below:
			
 
				+//
			
 
				+// . http://www.unicode.org/Public/UNIDATA/PropList.txt
			
 
				+//
			
 
				+//   Large (but not exhaustive) list of Unicode chars and their "properties"
			
 
				+//   (e.g., the property "Pi" = an initial quote punctuation char).
			
 
				+//
			
 
				+// . http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
			
 
				+//
			
 
				+//   Defines the list of properties, such as "Pi", used in the above list.
			
 
				+//
			
 
				+// . http://www.unipad.org/unimap/index.php?param_char=XXXX&page=detail
			
 
				+//
			
 
				+//   Gives detail about a particular character code.
			
 
				+//   XXXX is a 4-hex-digit Unicode character code.
			
 
				+//
			
 
				+// . http://www.unicode.org/Public/UNIDATA/UCD.html
			
 
				+//
			
 
				+//   General reference for Unicode characters.
			
 
				+//
			
 
				+
			
 
				+#include "syntaxnet/char_properties.h"
			
 
				+
			
 
				+#include <ctype.h>  // for ispunct, isspace
			
 
				+#include <memory>
			
 
				+#include <utility>
			
 
				+#include <vector>  // for vector
			
 
				+
			
 
				+#include "tensorflow/core/lib/strings/str_util.h"
			
 
				+#include "tensorflow/core/lib/strings/stringprintf.h"
			
 
				+#include "third_party/utf/utf.h"      // for runetochar, ::UTFmax, Rune
			
 
				+#include "util/utf8/unilib.h"  // for IsValidCodepoint, etc
			
 
				+#include "util/utf8/unilib_utf8_utils.h"
			
 
				+
			
 
				+//============================================================
			
 
				+// CharPropertyImplementation
			
 
				+//
			
 
				+
			
 
				+// A CharPropertyImplementation stores a set of Unicode characters,
			
 
				+// encoded in UTF-8, as a trie.  The trie is represented as a vector
			
 
				+// of nodes.  Each node is a 256-element array that specifies what to
			
 
				+// do with one byte of the UTF-8 sequence.  Each element n of a node
			
 
				+// is one of:
			
 
				+//  n = 0,  indicating that the Property is not true of any
			
 
				+//          character whose UTF-8 encoding includes this byte at
			
 
				+//          this position
			
 
				+//  n = -1, indicating that the Property is true for the UTF-8 sequence
			
 
				+//          that ends with this byte.
			
 
				+//  n > 0,  indicating the index of the row that describes the
			
 
				+//          remaining bytes in the UTF-8 sequence.
			
 
				+//
			
 
				+// The only operation that needs to be fast is HoldsFor, which tests
			
 
				+// whether a character has a given property. We use each byte of the
			
 
				+// character's UTF-8 encoding to index into a row. If the value is 0,
			
 
				+// then the property is not true for the character. (We might discover
			
 
				+// this even before getting to the end of the sequence.) If the value
			
 
				+// is -1, then the property is true for this character. Otherwise,
			
 
				+// the value is the index of another row, which we index using the next
			
 
				+// byte in the sequence, and so on. The design of UTF-8 prevents
			
 
				+// ambiguities here; no prefix of a UTF-8 sequence is a valid UTF-8
			
 
				+// sequence.
			
 
				+//
			
 
				+// While it is possible to implement an iterator for this representation,
			
 
				+// it is much easier to use unordered_set<char32> for this purpose. In fact, we
			
 
				+// would use that as the entire representation, were it not for concerns
			
 
				+// that HoldsFor might be slower.
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+// Storage behind a CharProperty: the member code points plus a byte-wise trie
			
 
				+// over their UTF-8 encodings.
			
 
				+struct CharPropertyImplementation {
			
 
				+  // Code points of all members; filled by CharProperty::AddChar and walked by
			
 
				+  // CharProperty::NextElementAfter.
			
 
				+  unordered_set<char32> chars;
			
 
				+  // Trie rows. rows[0] is the root; every row has 256 entries, one per byte
			
 
				+  // value (0 = not a member, -1 = member ends here, >0 = index of next row).
			
 
				+  vector<vector<int> > rows;
			
 
				+  // Starts with only the root row, whose entries are all 0.
			
 
				+  CharPropertyImplementation() {
			
 
				+    rows.reserve(10);
			
 
				+    rows.resize(1);
			
 
				+    rows[0].resize(256, 0);
			
 
				+  }
			
 
				+  // Inserts the len-byte sequence at buf into the trie, appending a new row for
			
 
				+  // each non-final byte that has no row yet. Re-adding an existing sequence is
			
 
				+  // harmless. CHECK-fails when the sequence is a proper prefix of, or a proper
			
 
				+  // extension of, a sequence that was added earlier.
			
 
				+  void AddChar(char *buf, int len) {
			
 
				+    int n = 0;  // row index
			
 
				+    for (int i = 0; i < len; ++i) {
			
 
				+      int ch = reinterpret_cast<unsigned char *>(buf)[i];
			
 
				+      int m = rows[n][ch];
			
 
				+      if (m > 0) {
			
 
				+        // A row already exists for this byte: follow it. It must not be the
			
 
				+        // final byte, or this sequence would be a prefix of a longer one.
			
 
				+        CHECK_LT(i, len - 1)
			
 
				+            << " : " << (i + 1) << "-byte UTF-8 sequence "
			
 
				+            << "(" << tensorflow::str_util::CEscape(string(buf, i + 1)) << ")"
			
 
				+            << " is prefix of previously-seen UTF-8 sequence(s)";
			
 
				+        n = m;
			
 
				+      } else if (i == len - 1) {
			
 
				+        // Final byte: mark the sequence as a member.
			
 
				+        rows[n][ch] = -1;
			
 
				+      } else {
			
 
				+        // Non-final byte without a row. The entry must be 0; a -1 here means a
			
 
				+        // shorter sequence already ends at this byte.
			
 
				+        CHECK_EQ(m, 0) << " : UTF-8 sequence is extension of previously-seen "
			
 
				+                       << (i + 1) << "-byte UTF-8 sequence "
			
 
				+                       << "("
			
 
				+                       << tensorflow::str_util::CEscape(string(buf, i + 1))
			
 
				+                       << ")";
			
 
				+        int a = rows.size();
			
 
				+        rows.resize(a + 1);
			
 
				+        rows[a].resize(256, 0);
			
 
				+        rows[n][ch] = a;
			
 
				+        n = a;
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Returns true iff the UTF-8 sequence starting at buf is a member. Reads up
			
 
				+  // to four bytes and stops at the first 0 or -1 entry; no length is passed, so
			
 
				+  // the caller is responsible for buf holding a complete sequence. Both
			
 
				+  // CharProperty::HoldsFor overloads validate their input before calling this.
			
 
				+  bool HoldsFor(const char *buf) const {
			
 
				+    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(buf);
			
 
				+
			
 
				+    // Lookup each byte of the UTF-8 sequence, starting in row 0.
			
 
				+    int n = rows[0][*bytes];
			
 
				+    if (n == 0) return false;
			
 
				+    if (n == -1) return true;
			
 
				+
			
 
				+    // If the value is not 0 or -1, then it is the index of the row for the
			
 
				+    // second byte in the sequence.
			
 
				+    n = rows[n][*++bytes];
			
 
				+    if (n == 0) return false;
			
 
				+    if (n == -1) return true;
			
 
				+    n = rows[n][*++bytes];  // Likewise for the third byte.
			
 
				+    if (n == 0) return false;
			
 
				+    if (n == -1) return true;
			
 
				+    n = rows[n][*++bytes];  // Likewise for the fourth byte.
			
 
				+    if (n == 0) return false;
			
 
				+
			
 
				+    // Since there can be at most 4 bytes in the sequence, n must be -1.
			
 
				+    return true;
			
 
				+
			
 
				+    // Implementation note: it is possible (and perhaps clearer) to write this
			
 
				+    // code as a loop, "for (int i = 0; i < 4; ++i) ...", but the TestHoldsFor
			
 
				+    // benchmark results indicate that doing so produces slower code for
			
 
				+    // anything other than short 7-bit ASCII strings (< 512 bytes). This is
			
 
				+    // mysterious, since the compiler unrolls the loop, producing code that
			
 
				+    // is almost the same as what we have here, except for the shortcut on
			
 
				+    // the 4th byte.
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				+//============================================================
			
 
				+// CharProperty - a property that holds for selected Unicode chars
			
 
				+//
			
 
				+
			
 
				+// Builds the property |name| from the |num_unicodes| entries of the spec array
			
 
				+// |unicodes|; the entries are interpreted by AddCharSpec.
			
 
				+CharProperty::CharProperty(const char *name, const int *unicodes,
			
 
				+                           int num_unicodes)
			
 
				+    : name_(name), impl_(new CharPropertyImplementation) {
			
 
				+  AddCharSpec(unicodes, num_unicodes);
			
 
				+}
			
 
				+
			
 
				+// Builds the property |name| and lets |init_fn| populate it; the function is
			
 
				+// called once with this object.
			
 
				+CharProperty::CharProperty(const char *name, CharPropertyInitializer *init_fn)
			
 
				+    : name_(name), impl_(new CharPropertyImplementation) {
			
 
				+  init_fn(this);
			
 
				+}
			
 
				+
			
 
				+// Frees the implementation object allocated by the constructors.
			
 
				+CharProperty::~CharProperty() { delete impl_; }
			
 
				+
			
 
				+// Adds code point |c| to the property: CHECK-validates it, records it in the
			
 
				+// member set, and inserts its UTF-8 encoding into the trie.
			
 
				+void CharProperty::AddChar(int c) {
			
 
				+  CheckUnicodeVal(c);
			
 
				+  impl_->chars.insert(c);
			
 
				+
			
 
				+  // Encode the code point; runetochar returns the number of bytes written.
			
 
				+  Rune rune = c;
			
 
				+  char utf8[UTFmax];
			
 
				+  const int num_bytes = runetochar(utf8, &rune);
			
 
				+  impl_->AddChar(utf8, num_bytes);
			
 
				+}
			
 
				+
			
 
				+// Adds every code point in the inclusive range [c1, c2]; does nothing when
			
 
				+// c1 > c2.
			
 
				+void CharProperty::AddCharRange(int c1, int c2) {
			
 
				+  int current = c1;
			
 
				+  while (current <= c2) {
			
 
				+    AddChar(current);
			
 
				+    ++current;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// Adds each value in [0, 256) that |pred| accepts.
			
 
				+void CharProperty::AddAsciiPredicate(AsciiPredicate *pred) {
			
 
				+  for (int candidate = 0; candidate < 256; ++candidate) {
			
 
				+    if (!(*pred)(candidate)) continue;
			
 
				+    AddChar(candidate);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// Adds every member of the property registered as |propname| to this one.
			
 
				+// CHECK-fails when no such property can be looked up.
			
 
				+void CharProperty::AddCharProperty(const char *propname) {
			
 
				+  const CharProperty *prop = CharProperty::Lookup(propname);
			
 
				+  CHECK(prop != NULL) << ": unknown char property \"" << propname
			
 
				+                      << "\" in " << name_;
			
 
				+
			
 
				+  // NextElementAfter(-1) yields the first member and -1 marks the end.
			
 
				+  for (int member = prop->NextElementAfter(-1); member >= 0;
			
 
				+       member = prop->NextElementAfter(member)) {
			
 
				+    AddChar(member);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// Adds the characters described by the |num_unicodes| entries of |unicodes|.
			
 
				+// A run of four entries {kPreUnicodeRange, lower, upper, kPostUnicodeRange}
			
 
				+// denotes the inclusive range [lower, upper]; any other entry is a single
			
 
				+// code point. CHECK-fails on a range whose lower bound exceeds its upper bound.
			
 
				+void CharProperty::AddCharSpec(const int *unicodes, int num_unicodes) {
			
 
				+  int pos = 0;
			
 
				+  while (pos < num_unicodes) {
			
 
				+    // The bounds test comes first so unicodes[pos + 3] is never read past the
			
 
				+    // end of the array.
			
 
				+    const bool is_range = pos + 3 < num_unicodes &&
			
 
				+                          unicodes[pos] == kPreUnicodeRange &&
			
 
				+                          unicodes[pos + 3] == kPostUnicodeRange;
			
 
				+    if (!is_range) {
			
 
				+      AddChar(unicodes[pos]);
			
 
				+      ++pos;
			
 
				+      continue;
			
 
				+    }
			
 
				+
			
 
				+    // Range of unicode values
			
 
				+    int lower = unicodes[pos + 1];
			
 
				+    int upper = unicodes[pos + 2];
			
 
				+    CHECK(lower <= upper) << ": invalid char range in " << name_
			
 
				+                          << ": [" << UnicodeToString(lower) << ", "
			
 
				+                          << UnicodeToString(upper) << "]";
			
 
				+    AddCharRange(lower, upper);
			
 
				+    pos += 4;  // skip both markers and both bounds
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// Returns true iff code point |c| is valid and is a member of the property.
			
 
				+bool CharProperty::HoldsFor(int c) const {
			
 
				+  if (UniLib::IsValidCodepoint(c)) {
			
 
				+    // Encode to UTF-8 and query the trie.
			
 
				+    Rune rune = c;
			
 
				+    char utf8[UTFmax];
			
 
				+    runetochar(utf8, &rune);
			
 
				+    return impl_->HoldsFor(utf8);
			
 
				+  }
			
 
				+  return false;
			
 
				+}
			
 
				+
			
 
				+// Returns true iff the |len| bytes at |str| form one valid UTF-8 code point
			
 
				+// that is a member of the property.
			
 
				+bool CharProperty::HoldsFor(const char *str, int len) const {
			
 
				+  if (len <= 0) return false;
			
 
				+  // UniLib::IsUTF8ValidCodepoint also checks for structural validity.
			
 
				+  if (!UniLib::IsUTF8ValidCodepoint(StringPiece(str, len))) return false;
			
 
				+  return impl_->HoldsFor(str);
			
 
				+}
			
 
				+
			
 
				+// Enumerates the members of the property. Pass -1 to get the first member,
			
 
				+// then pass each returned value to get the next one; -1 is returned once all
			
 
				+// members have been produced. The members are kept in an unordered_set, so
			
 
				+// they come back in the container's unspecified iteration order, not in
			
 
				+// ascending code-point order. Expects c == -1 or HoldsFor(c).
			
 
				+int CharProperty::NextElementAfter(int c) const {
			
 
				+  DCHECK(c == -1 || HoldsFor(c));
			
 
				+  const unordered_set<char32> &members = impl_->chars;
			
 
				+  unordered_set<char32>::const_iterator it;
			
 
				+  if (c < 0) {
			
 
				+    it = members.begin();
			
 
				+  } else {
			
 
				+    // Step past c; an unknown c leaves the iterator at end().
			
 
				+    it = members.find(static_cast<char32>(c));
			
 
				+    if (it != members.end()) ++it;
			
 
				+  }
			
 
				+  if (it == members.end()) return -1;
			
 
				+  return *it;
			
 
				+}
			
 
				+
			
 
				+// Registry for CharPropertyWrapper classes; CharProperty::Lookup obtains
			
 
				+// wrappers by name through CharPropertyWrapper::Create.
			
 
				+REGISTER_CLASS_REGISTRY("char property wrapper", CharPropertyWrapper);
			
 
				+
			
 
				+// Returns the CharProperty provided by the wrapper registered as |subclass|,
			
 
				+// or NULL (after logging an error) when no wrapper can be created.
			
 
				+const CharProperty *CharProperty::Lookup(const char *subclass) {
			
 
				+  // The wrapper is only a means of reaching its CharProperty; it is destroyed
			
 
				+  // when this function returns.
			
 
				+  // NOTE(review): the returned pointer outlives the wrapper, so the property
			
 
				+  // is presumably not owned by it - confirm in char_properties.h.
			
 
				+  std::unique_ptr<CharPropertyWrapper> wrapper(
			
 
				+      CharPropertyWrapper::Create(subclass));
			
 
				+  if (wrapper) return wrapper->GetCharProperty();
			
 
				+
			
 
				+  LOG(ERROR) << "CharPropertyWrapper not found for subclass: "
			
 
				+             << "\"" << subclass << "\"";
			
 
				+  return NULL;
			
 
				+}
			
 
				+
			
 
				+// CHECK-fails, naming this property and the offending value, unless |c| is a
			
 
				+// valid code point.
			
 
				+void CharProperty::CheckUnicodeVal(int c) const {
			
 
				+  CHECK(UniLib::IsValidCodepoint(c))
			
 
				+      << "Unicode in " << name_
			
 
				+      << " out of range: " << UnicodeToString(c);
			
 
				+}
			
 
				+
			
 
				+// Converts a Unicode value to a string (for error messages).
			
 
				+string CharProperty::UnicodeToString(int c) {
			
 
				+  const char *fmt;
			
 
				+
			
 
				+  if (c < 0) {
			
 
				+    fmt = "%d";      // out-of-range
			
 
				+  } else if (c <= 0x7f) {
			
 
				+    fmt = "'%c'";    // ascii
			
 
				+  } else if (c <= 0xffff) {
			
 
				+    fmt = "0x%04X";  // 4 hex digits
			
 
				+  } else {
			
 
				+    fmt = "0x%X";    // also out-of-range
			
 
				+  }
			
 
				+
			
 
				+  return tensorflow::strings::Printf(fmt, c);
			
 
				+}
			
 
				+
			
 
				+//======================================================================
			
 
				+// Expression-level punctuation
			
 
				+//
			
 
				+
			
 
				+// Punctuation that starts a sentence.
			
 
				+DEFINE_CHAR_PROPERTY_AS_SET(start_sentence_punc,
			
 
				+  0x00A1,  // Spanish inverted exclamation mark
			
 
				+  0x00BF,  // Spanish inverted question mark
			
 
				+)
			
 
				+
			
 
				+// Punctuation that ends a sentence.
			
 
				+// Based on: http://www.unicode.org/unicode/reports/tr29/#Sentence_Boundaries
			
 
				+DEFINE_CHAR_PROPERTY_AS_SET(end_sentence_punc,
			
 
				+  '.',
			
 
				+  '!',
			
 
				+  '?',
			
 
				+  0x055C,  // Armenian exclamation mark
			
 
				+  0x055E,  // Armenian question mark
			
 
				+  0x0589,  // Armenian full stop
			
 
				+  0x061F,  // Arabic question mark
			
 
				+  0x06D4,  // Arabic full stop
			
 
				+  0x0700,  // Syriac end of paragraph
			
 
				+  0x0701,  // Syriac supralinear full stop
			
 
				+  0x0702,  // Syriac sublinear full stop
			
 
				+  RANGE(0x0964, 0x0965),  // Devanagari danda..Devanagari double danda
			
 
				+  0x1362,  // Ethiopic full stop
			
 
				+  0x1367,  // Ethiopic question mark
			
 
				+  0x1368,  // Ethiopic paragraph separator
			
 
				+  0x104A,  // Myanmar sign little section
			
 
				+  0x104B,  // Myanmar sign section
			
 
				+  0x166E,  // Canadian syllabics full stop
			
 
				+  0x17d4,  // Khmer sign khan
			
 
				+  0x1803,  // Mongolian full stop
			
 
				+  0x1809,  // Mongolian Manchu full stop
			
 
				+  0x1944,  // Limbu exclamation mark
			
 
				+  0x1945,  // Limbu question mark
			
 
				+  0x203C,  // double exclamation mark
			
 
				+  0x203D,  // interrobang
			
 
				+  0x2047,  // double question mark
			
 
				+  0x2048,  // question exclamation mark
			
 
				+  0x2049,  // exclamation question mark
			
 
				+  0x3002,  // ideographic full stop
			
 
				+  0x037E,  // Greek question mark
			
 
				+  0xFE52,  // small full stop
			
 
				+  0xFE56,  // small question mark
			
 
				+  0xFE57,  // small exclamation mark
			
 
				+  0xFF01,  // fullwidth exclamation mark
			
 
				+  0xFF0E,  // fullwidth full stop
			
 
				+  0xFF1F,  // fullwidth question mark
			
 
				+  0xFF61,  // halfwidth ideographic full stop
			
 
				+  0x2026,  // ellipsis
			
 
				+)
			
 
				+
			
 
				+// Punctuation, such as parens, that opens a "nested expression" of text.
			
 
				+DEFINE_CHAR_PROPERTY_AS_SET(open_expr_punc,
			
 
				+  '(',
			
 
				+  '[',
			
 
				+  '<',
			
 
				+  '{',
			
 
				+  0x207D,  // superscript left parenthesis
			
 
				+  0x208D,  // subscript left parenthesis
			
 
				+  0x27E6,  // mathematical left white square bracket
			
 
				+  0x27E8,  // mathematical left angle bracket
			
 
				+  0x27EA,  // mathematical left double angle bracket
			
 
				+  0x2983,  // left white curly bracket
			
 
				+  0x2985,  // left white parenthesis
			
 
				+  0x2987,  // Z notation left image bracket
			
 
				+  0x2989,  // Z notation left binding bracket
			
 
				+  0x298B,  // left square bracket with underbar
			
 
				+  0x298D,  // left square bracket with tick in top corner
			
 
				+  0x298F,  // left square bracket with tick in bottom corner
			
 
				+  0x2991,  // left angle bracket with dot
			
 
				+  0x2993,  // left arc less-than bracket
			
 
				+  0x2995,  // double left arc greater-than bracket
			
 
				+  0x2997,  // left black tortoise shell bracket
			
 
				+  0x29D8,  // left wiggly fence
			
 
				+  0x29DA,  // left double wiggly fence
			
 
				+  0x29FC,  // left-pointing curved angle bracket
			
 
				+  0x3008,  // CJK left angle bracket
			
 
				+  0x300A,  // CJK left double angle bracket
			
 
				+  0x3010,  // CJK left black lenticular bracket
			
 
				+  0x3014,  // CJK left tortoise shell bracket
			
 
				+  0x3016,  // CJK left white lenticular bracket
			
 
				+  0x3018,  // CJK left white tortoise shell bracket
			
 
				+  0x301A,  // CJK left white square bracket
			
 
				+  0xFD3E,  // Ornate left parenthesis
			
 
				+  0xFE59,  // small left parenthesis
			
 
				+  0xFE5B,  // small left curly bracket
			
 
				+  0xFF08,  // fullwidth left parenthesis
			
 
				+  0xFF3B,  // fullwidth left square bracket
			
 
				+  0xFF5B,  // fullwidth left curly bracket
			
 
				+)
			
 
				+
			
 
				+// Punctuation, such as parens, that closes a "nested expression" of text.
			
 
				+DEFINE_CHAR_PROPERTY_AS_SET(close_expr_punc,
			
 
				+  ')',
			
 
				+  ']',
			
 
				+  '>',
			
 
				+  '}',
			
 
				+  0x207E,  // superscript right parenthesis
			
 
				+  0x208E,  // subscript right parenthesis
			
 
				+  0x27E7,  // mathematical right white square bracket
			
 
				+  0x27E9,  // mathematical right angle bracket
			
 
				+  0x27EB,  // mathematical right double angle bracket
			
 
				+  0x2984,  // right white curly bracket
			
 
				+  0x2986,  // right white parenthesis
			
 
				+  0x2988,  // Z notation right image bracket
			
 
				+  0x298A,  // Z notation right binding bracket
			
 
				+  0x298C,  // right square bracket with underbar
			
 
				+  0x298E,  // right square bracket with tick in top corner
			
 
				+  0x2990,  // right square bracket with tick in bottom corner
			
 
				+  0x2992,  // right angle bracket with dot
			
 
				+  0x2994,  // right arc greater-than bracket
			
 
				+  0x2996,  // double right arc less-than bracket
			
 
				+  0x2998,  // right black tortoise shell bracket
			
 
				+  0x29D9,  // right wiggly fence
			
 
				+  0x29DB,  // right double wiggly fence
			
 
				+  0x29FD,  // right-pointing curved angle bracket
			
 
				+  0x3009,  // CJK right angle bracket
			
 
				+  0x300B,  // CJK right double angle bracket
			
 
				+  0x3011,  // CJK right black lenticular bracket
			
 
				+  0x3015,  // CJK right tortoise shell bracket
			
 
				+  0x3017,  // CJK right white lenticular bracket
			
 
				+  0x3019,  // CJK right white tortoise shell bracket
			
 
				+  0x301B,  // CJK right white square bracket
			
 
				+  0xFD3F,  // Ornate right parenthesis
			
 
				+  0xFE5A,  // small right parenthesis
			
 
				+  0xFE5C,  // small right curly bracket
			
 
				+  0xFF09,  // fullwidth right parenthesis
			
 
				+  0xFF3D,  // fullwidth right square bracket
			
 
				+  0xFF5D,  // fullwidth right curly bracket
			
 
				+)
			
 
				+
			
 
				+// Chars that open a quotation.
				+// Based on: http://www.unicode.org/uni2book/ch06.pdf
				+//
				+// NB: Many entries are also listed in close_quote.  The ASCII and fullwidth
				+// quotes carry no direction, and, as the per-entry notes say, several marks
				+// open a quotation in some languages while closing it in others.
				+DEFINE_CHAR_PROPERTY_AS_SET(open_quote,
				+  '"',
				+  '\'',
				+  '`',
				+  0xFF07,  // fullwidth apostrophe
				+  0xFF02,  // fullwidth quotation mark
				+  0x2018,  // left single quotation mark (English, others)
				+  0x201C,  // left double quotation mark (English, others)
				+  0x201B,  // single high-reversed-9 quotation mark (PropList.txt)
				+  0x201A,  // single low-9 quotation mark (Czech, German, Slovak)
				+  0x201E,  // double low-9 quotation mark (Czech, German, Slovak)
				+  0x201F,  // double high-reversed-9 quotation mark (PropList.txt)
				+  0x2019,  // right single quotation mark (Danish, Finnish, Swedish, Norw.)
				+  0x201D,  // right double quotation mark (Danish, Finnish, Swedish, Norw.)
				+  0x2039,  // single left-pointing angle quotation mark (French, others)
				+  0x00AB,  // left-pointing double angle quotation mark (French, others)
				+  0x203A,  // single right-pointing angle quotation mark (Slovenian, others)
				+  0x00BB,  // right-pointing double angle quotation mark (Slovenian, others)
				+  0x300C,  // left corner bracket (East Asian languages)
				+  0xFE41,  // presentation form for vertical left corner bracket
				+  0xFF62,  // halfwidth left corner bracket (East Asian languages)
				+  0x300E,  // left white corner bracket (East Asian languages)
				+  0xFE43,  // presentation form for vertical left white corner bracket
				+  0x301D,  // reversed double prime quotation mark (East Asian langs, horiz.)
				+)
			
 
				+
			
 
				+// Chars that close a quotation.
				+// Based on: http://www.unicode.org/uni2book/ch06.pdf
				+//
				+// NB: Many entries are also listed in open_quote.  The ASCII and fullwidth
				+// quotes carry no direction, and, as the per-entry notes say, several marks
				+// close a quotation in some languages while opening it in others.
				+DEFINE_CHAR_PROPERTY_AS_SET(close_quote,
				+  '\'',
				+  '"',
				+  '`',
				+  0xFF07,  // fullwidth apostrophe
				+  0xFF02,  // fullwidth quotation mark
				+  0x2019,  // right single quotation mark (English, others)
				+  0x201D,  // right double quotation mark (English, others)
				+  0x2018,  // left single quotation mark (Czech, German, Slovak)
				+  0x201C,  // left double quotation mark (Czech, German, Slovak)
				+  0x203A,  // single right-pointing angle quotation mark (French, others)
				+  0x00BB,  // right-pointing double angle quotation mark (French, others)
				+  0x2039,  // single left-pointing angle quotation mark (Slovenian, others)
				+  0x00AB,  // left-pointing double angle quotation mark (Slovenian, others)
				+  0x300D,  // right corner bracket (East Asian languages)
				+  0xfe42,  // presentation form for vertical right corner bracket
				+  0xFF63,  // halfwidth right corner bracket (East Asian languages)
				+  0x300F,  // right white corner bracket (East Asian languages)
				+  0xfe44,  // presentation form for vertical right white corner bracket
				+  0x301F,  // low double prime quotation mark (East Asian languages)
				+  0x301E,  // close double prime (East Asian languages written horizontally)
				+)
			
 
				+
			
 
				+// open_punc: any char that opens either a nested expression or a quotation.
				+// Built by adding the chars of open_expr_punc and then those of open_quote.
				+DEFINE_CHAR_PROPERTY(open_punc, property) {
				+  property->AddCharProperty("open_expr_punc");
				+  property->AddCharProperty("open_quote");
				+}
			
 
				+
			
 
				+// close_punc: any char that closes either a nested expression or a
				+// quotation.  Built by adding the chars of close_expr_punc and then those
				+// of close_quote.
				+DEFINE_CHAR_PROPERTY(close_punc, property) {
				+  property->AddCharProperty("close_expr_punc");
				+  property->AddCharProperty("close_quote");
				+}
			
 
				+
			
 
				+// leading_sentence_punc: punctuation that may appear at the very start of
				+// a sentence -- the chars of open_punc plus those of start_sentence_punc.
				+DEFINE_CHAR_PROPERTY(leading_sentence_punc, property) {
				+  property->AddCharProperty("open_punc");
				+  property->AddCharProperty("start_sentence_punc");
				+}
			
 
				+
			
 
				+// trailing_sentence_punc: punctuation that may appear at the very end of a
				+// sentence -- the chars of close_punc plus those of end_sentence_punc.
				+DEFINE_CHAR_PROPERTY(trailing_sentence_punc, property) {
				+  property->AddCharProperty("close_punc");
				+  property->AddCharProperty("end_sentence_punc");
				+}
			
 
				+
			
 
				+//======================================================================
			
 
				+// Special symbols
			
 
				+//
			
 
				+
			
 
				+// Currency symbols.
				+// From: http://www.unicode.org/charts/PDF/U20A0.pdf
				+//
				+// The plain cents sign (U+00A2) is deliberately left out because it
				+// typically follows the amount; it is listed in token_suffix_symbol instead.
				+DEFINE_CHAR_PROPERTY_AS_SET(currency_symbol,
				+  '$',
				+  // 0x00A2,  // cents (NB: typically FOLLOWS the amount)
				+  0x00A3,  // pounds and liras
				+  0x00A4,  // general currency sign
				+  0x00A5,  // yen or yuan
				+  0x0192,  // Dutch florin (latin small letter "f" with hook)
				+  0x09F2,  // Bengali rupee mark
				+  0x09F3,  // Bengali rupee sign
				+  0x0AF1,  // Gujarati rupee sign
				+  0x0BF9,  // Tamil rupee sign
				+  0x0E3F,  // Thai baht
				+  0x17DB,  // Khmer riel
				+  0x20A0,  // alternative euro sign
				+  0x20A1,  // Costa Rica, El Salvador (colon sign)
				+  0x20A2,  // Brazilian cruzeiro
				+  0x20A3,  // French Franc
				+  0x20A4,  // alternative lira sign
				+  0x20A5,  // mill sign (USA 1/10 cent)
				+  0x20A6,  // Nigerian Naira
				+  0x20A7,  // Spanish peseta
				+  0x20A8,  // Indian rupee
				+  0x20A9,  // Korean won
				+  0x20AA,  // Israeli new sheqel
				+  0x20AB,  // Vietnam dong
				+  0x20AC,  // euro sign
				+  0x20AD,  // Laotian kip
				+  0x20AE,  // Mongolian tugrik
				+  0x20AF,  // Greek drachma
				+  0x20B0,  // German penny
				+  0x20B1,  // Philippine peso (Mexican peso uses "$")
				+  0x2133,  // Old German mark (script capital M)
				+  0xFDFC,  // rial sign
				+  0xFFE0,  // fullwidth cents
				+  0xFFE1,  // fullwidth pounds
				+  0xFFE5,  // fullwidth Japanese yen
				+  0xFFE6,  // fullwidth Korean won
				+)
			
 
				+
			
 
				+// Chinese bookquotes.
				+// They look like "<<" and ">>" except that they are single UTF8 chars
				+// (U+300A, U+300B). These are used in Chinese as special
				+// punctuation, referring to the title of a book, an article, a movie,
				+// etc.  For example: "cellphone" means cellphone, but <<cellphone>>
				+// means (exclusively) the movie.
				+//
				+// open_bookquote holds for the opening mark only (U+300A).
				+DEFINE_CHAR_PROPERTY_AS_SET(open_bookquote,
				+ 0x300A
				+)
			
 
				+
			
 
				+// close_bookquote holds for the closing Chinese bookquote only (U+300B).
				+DEFINE_CHAR_PROPERTY_AS_SET(close_bookquote,
				+ 0x300B
				+)
			
 
				+
			
 
				+//======================================================================
			
 
				+// Token-level punctuation
			
 
				+//
			
 
				+
			
 
				+// Token-prefix symbols, excluding currency symbols -- glom on
				+// to following token (esp. if no space after).
				+// Currently just the number sign and the numero sign.
				+DEFINE_CHAR_PROPERTY_AS_SET(noncurrency_token_prefix_symbol,
				+  '#',
				+  0x2116,  // numero sign ("No")
				+)
			
 
				+
			
 
				+// token_prefix_symbol: symbols that attach to the token that follows them
				+// (esp. if no space after).  Made up of the currency symbols plus the
				+// non-currency prefix symbols.
				+DEFINE_CHAR_PROPERTY(token_prefix_symbol, property) {
				+  property->AddCharProperty("currency_symbol");
				+  property->AddCharProperty("noncurrency_token_prefix_symbol");
				+}
			
 
				+
			
 
				+// Token-suffix symbols -- glom on to preceding token (esp. if no space before)
				+// Covers percent-like signs, the cents sign, ordinal indicators, unit signs,
				+// copyright/trademark marks and footnote daggers.
				+DEFINE_CHAR_PROPERTY_AS_SET(token_suffix_symbol,
				+  '%',
				+  0x066A,  // Arabic percent sign
				+  0x2030,  // per mille
				+  0x2031,  // per ten thousand
				+  0x00A2,  // cents sign
				+  0x2125,  // ounces sign
				+  0x00AA,  // feminine ordinal indicator (Spanish)
				+  0x00BA,  // masculine ordinal indicator (Spanish)
				+  0x00B0,  // degrees
				+  0x2109,  // degrees Fahrenheit
				+  0x2103,  // degrees Celsius
				+  0x2126,  // ohms
				+  0x212A,  // Kelvin
				+  0x212B,  // Angstroms ("A" with circle on top)
				+  0x00A9,  // copyright
				+  0x2117,  // sound recording copyright (circled "P")
				+  0x2122,  // trade mark
				+  0x00AE,  // registered trade mark
				+  0x2120,  // service mark
				+  0x2106,  // cada una ("c/a" == "each" in Spanish)
				+  0x2020,  // dagger (can be used for footnotes)
				+  0x2021,  // double dagger (can be used for footnotes)
				+)
			
 
				+
			
 
				+// Subscripts: the contiguous block U+2080..U+208E, i.e. the subscript
				+// digits 0-9 followed by subscript "+", "-", "=", "(" and ")".
				+DEFINE_CHAR_PROPERTY_AS_SET(subscript_symbol,
				+  RANGE(0x2080, 0x2089),  // subscript 0..subscript 9
				+  RANGE(0x208A, 0x208E),  // subscript "+", "-", "=", "(", ")"
				+)
			
 
				+
			
 
				+// Superscripts.  Superscript 1, 2 and 3 are the Latin-1 code points
				+// U+00B9, U+00B2 and U+00B3; every other entry lies in U+2070..U+207F.
				+DEFINE_CHAR_PROPERTY_AS_SET(superscript_symbol,
				+  RANGE(0x00B2, 0x00B3),  // superscript 2..superscript 3
				+  0x00B9,                 // superscript 1
				+  RANGE(0x2070, 0x2071),  // superscript 0, superscript Latin small "i"
				+  RANGE(0x2074, 0x2079),  // superscript 4..superscript 9
				+  RANGE(0x207A, 0x207E),  // superscript "+", "-", "=", "(", ")"
				+  0x207F,                 // superscript Latin small "n"
				+)
			
 
				+
			
 
				+//======================================================================
			
 
				+// General punctuation
			
 
				+//
			
 
				+
			
 
				+// Connector punctuation
				+// Code Pc from http://www.unicode.org/Public/UNIDATA/PropList.txt
				+// NB: This list is not necessarily exhaustive.
				+// Only the three code points below are covered.
				+DEFINE_CHAR_PROPERTY_AS_SET(connector_punc,
				+  0x30fb,  // Katakana middle dot
				+  0xff65,  // halfwidth Katakana middle dot
				+  0x2040,  // character tie
				+)
			
 
				+
			
 
				+// Dashes
				+// Code Pd from http://www.unicode.org/Public/UNIDATA/PropList.txt
				+// NB: This list is not necessarily exhaustive.
				+// Besides hyphens and dashes proper, the list also includes '~', the
				+// swung/wave/wavy dashes and the minus signs.
				+DEFINE_CHAR_PROPERTY_AS_SET(dash_punc,
				+  '-',
				+  '~',
				+  0x058a,  // Armenian hyphen
				+  0x1806,  // Mongolian todo soft hyphen
				+  RANGE(0x2010, 0x2015),  // hyphen..horizontal bar
				+  0x2053,  // swung dash -- from Table 6-3 of Unicode book
				+  0x207b,  // superscript minus
				+  0x208b,  // subscript minus
				+  0x2212,  // minus sign
				+  0x301c,  // wave dash
				+  0x3030,  // wavy dash
				+  RANGE(0xfe31, 0xfe32),  // presentation form for vertical em dash..en dash
				+  0xfe58,  // small em dash
				+  0xfe63,  // small hyphen-minus
				+  0xff0d,  // fullwidth hyphen-minus
				+)
			
 
				+
			
 
				+// Other punctuation
				+// Code Po from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
				+// NB: This list is not exhaustive.
				+// Mostly commas, colons, semicolons and similar marks from various scripts.
				+DEFINE_CHAR_PROPERTY_AS_SET(other_punc,
				+  ',',
				+  ':',
				+  ';',
				+  0x00b7,  // middle dot
				+  0x0387,  // Greek ano teleia
				+  0x05c3,  // Hebrew punctuation sof pasuq
				+  0x060c,  // Arabic comma
				+  0x061b,  // Arabic semicolon
				+  0x066b,  // Arabic decimal separator
				+  0x066c,  // Arabic thousands separator
				+  RANGE(0x0703, 0x70a),  // Syriac contraction and others (U+0703..U+070A)
				+  0x070c,  // Syriac harklean metobelus
				+  0x0e5a,  // Thai character angkhankhu
				+  0x0e5b,  // Thai character khomut
				+  0x0f08,  // Tibetan mark sbrul shad
				+  RANGE(0x0f0d, 0x0f12),  // Tibetan mark shad..Tibetan mark rgya gram shad
				+  0x1361,  // Ethiopic wordspace
				+  RANGE(0x1363, 0x1366),  // other Ethiopic chars
				+  0x166d,  // Canadian syllabics chi sign
				+  RANGE(0x16eb, 0x16ed),  // Runic single punctuation..Runic cross punctuation
				+  RANGE(0x17d5, 0x17d6),  // Khmer sign camnuc pii huuh and other
				+  0x17da,  // Khmer sign koomut
				+  0x1802,  // Mongolian comma
				+  RANGE(0x1804, 0x1805),  // Mongolian four dots and other
				+  0x1808,  // Mongolian manchu comma
				+  0x3001,  // ideographic comma
				+  RANGE(0xfe50, 0xfe51),  // small comma and others
				+  RANGE(0xfe54, 0xfe55),  // small semicolon and other
				+  0xff0c,  // fullwidth comma
				+  RANGE(0xff0e, 0xff0f),  // fullwidth stop..fullwidth solidus
				+  RANGE(0xff1a, 0xff1b),  // fullwidth colon..fullwidth semicolon
				+  0xff64,  // halfwidth ideographic comma
				+  0x2016,  // double vertical line
				+  RANGE(0x2032, 0x2034),  // prime..triple prime
				+  0xfe61,  // small asterisk
				+  0xfe68,  // small reverse solidus
				+  0xff3c,  // fullwidth reverse solidus
				+)
			
 
				+
			
 
				+// All punctuation.
				+// Code P from http://www.unicode.org/Public/UNIDATA/PropList.txt
				+// NB: This list is not necessarily exhaustive.
				+//
				+// Assembled from the punctuation properties named below, plus the chars
				+// selected by the ispunct() predicate.
				+DEFINE_CHAR_PROPERTY(punctuation, property) {
				+  // Constituent properties, added in the order listed.
				+  const char *const kConstituents[] = {
				+    "open_punc",
				+    "close_punc",
				+    "leading_sentence_punc",
				+    "trailing_sentence_punc",
				+    "connector_punc",
				+    "dash_punc",
				+    "other_punc",
				+  };
				+  const int kNumConstituents =
				+      sizeof(kConstituents) / sizeof(kConstituents[0]);
				+  for (int i = 0; i < kNumConstituents; ++i) {
				+    property->AddCharProperty(kConstituents[i]);
				+  }
				+  property->AddAsciiPredicate(&ispunct);
				+}
			
 
				+
			
 
				+//======================================================================
			
 
				+// Separators
			
 
				+//
			
 
				+
			
 
				+// Line separators
				+// Code Zl from http://www.unicode.org/Public/UNIDATA/PropList.txt
				+// NB: This list is not necessarily exhaustive.
				+// Holds for U+2028 only; ASCII line breaks are not listed here.
				+DEFINE_CHAR_PROPERTY_AS_SET(line_separator,
				+  0x2028,                           // line separator
				+)
			
 
				+
			
 
				+// Paragraph separators
				+// Code Zp from http://www.unicode.org/Public/UNIDATA/PropList.txt
				+// NB: This list is not necessarily exhaustive.
				+// Holds for U+2029 only.
				+DEFINE_CHAR_PROPERTY_AS_SET(paragraph_separator,
				+  0x2029,                           // paragraph separator
				+)
			
 
				+
			
 
				+// Space separators
				+// Code Zs from http://www.unicode.org/Public/UNIDATA/PropList.txt
				+// NB: This list is not necessarily exhaustive.
				+// The final entry (U+E5E5) is a private-use code point, not part of Zs.
				+DEFINE_CHAR_PROPERTY_AS_SET(space_separator,
				+  0x0020,                           // space
				+  0x00a0,                           // no-break space
				+  0x1680,                           // Ogham space mark
				+  0x180e,                           // Mongolian vowel separator
				+  RANGE(0x2000, 0x200a),            // en quad..hair space
				+  0x202f,                           // narrow no-break space
				+  0x205f,                           // medium mathematical space
				+  0x3000,                           // ideographic space
				+
				+  // Google additions
				+  0xe5e5,                           // "private" char used as space in Chinese
				+)
			
 
				+
			
 
				+// Separators -- all line, paragraph, and space separators.
				+// Code Z from http://www.unicode.org/Public/UNIDATA/PropList.txt
				+// NB: This list is not necessarily exhaustive.
				+//
				+// In addition to the three separator properties, the chars selected by the
				+// isspace() predicate are included.
				+DEFINE_CHAR_PROPERTY(separator, property) {
				+  property->AddCharProperty("line_separator");
				+  property->AddCharProperty("paragraph_separator");
				+  property->AddCharProperty("space_separator");
				+  property->AddAsciiPredicate(&isspace);
				+}
			
 
				+
			
 
				+//======================================================================
			
 
				+// Alphanumeric Characters
			
 
				+//
			
 
				+
			
 
				+// Digits
				+// Only the three decimal-digit ranges below are covered; digits of other
				+// scripts are not listed.
				+DEFINE_CHAR_PROPERTY_AS_SET(digit,
				+  RANGE('0', '9'),
				+  RANGE(0x0660, 0x0669),  // Arabic-Indic digits
				+  RANGE(0x06F0, 0x06F9),  // Eastern Arabic-Indic digits
				+)
			
 
				+
			
 
				+//======================================================================
			
 
				+// Japanese Katakana
			
 
				+//
			
 
				+
			
 
				+// Katakana: the shared Katakana-Hiragana (semi-)voiced sound marks plus the
				+// fullwidth and halfwidth Katakana ranges.
				+DEFINE_CHAR_PROPERTY_AS_SET(katakana,
				+  0x3099,  // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
				+  0x309A,  // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
				+  0x309B,  // KATAKANA-HIRAGANA VOICED SOUND MARK
				+  0x309C,  // KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
				+  RANGE(0x30A0, 0x30FF),  // Fullwidth Katakana
				+  RANGE(0xFF65, 0xFF9F),  // Halfwidth Katakana
				+)
			
 
				+
			
 
				+//======================================================================
			
 
				+// BiDi Directional Formatting Codes
			
 
				+//
			
 
				+
			
 
				+// Explicit directional marks, embeddings and overrides.
				+// See http://www.unicode.org/reports/tr9/ for a description of Bidi
				+// and http://www.unicode.org/charts/PDF/U2000.pdf for the character codes.
				+DEFINE_CHAR_PROPERTY_AS_SET(directional_formatting_code,
				+  0x200E,  // LRM (Left-to-Right Mark)
				+  0x200F,  // RLM (Right-to-Left Mark)
				+  0x202A,  // LRE (Left-to-Right Embedding)
				+  0x202B,  // RLE (Right-to-Left Embedding)
				+  0x202C,  // PDF (Pop Directional Formatting)
				+  0x202D,  // LRO (Left-to-Right Override)
				+  0x202E,  // RLO (Right-to-Left Override)
				+)
			
 
				+
			
 
				+//======================================================================
			
 
				+// Special collections
			
 
				+//
			
 
				+
			
 
				+// punctuation_or_symbol: punctuation, sub-/superscripts and the token
				+// prefix/suffix symbols.
				+// NB: This does not check for all punctuation and symbols in the
				+// standard; just those listed in our code. See the definitions in
				+// char_properties.cc
				+DEFINE_CHAR_PROPERTY(punctuation_or_symbol, property) {
				+  // Constituent properties, added in the order listed.
				+  const char *const kConstituents[] = {
				+    "punctuation",
				+    "subscript_symbol",
				+    "superscript_symbol",
				+    "token_prefix_symbol",
				+    "token_suffix_symbol",
				+  };
				+  const int kNumConstituents =
				+      sizeof(kConstituents) / sizeof(kConstituents[0]);
				+  for (int i = 0; i < kNumConstituents; ++i) {
				+    property->AddCharProperty(kConstituents[i]);
				+  }
				+}
			
 
				+
			
 
				+}  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/char_properties.h
+++ b/syntaxnet/syntaxnet/char_properties.h
@@ -0,0 +1,362 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+// char_properties.h - define is_X() tests for various character properties
			
 
				+//
			
 
				+// Character properties can be defined in two ways:
			
 
				+//
			
 
				+// (1) Set-based:
			
 
				+//
			
 
				+//     Enumerate the chars that have the property.  Example:
			
 
				+//
			
 
				+//       DEFINE_CHAR_PROPERTY_AS_SET(my_fave,
			
 
				+//         RANGE('0', '9'),
			
 
				+//         '\'',
			
 
				+//         0x00BF,   // Spanish inverted question mark
			
 
				+//       )
			
 
				+//
			
 
				+//     Characters are expressed as Unicode code points; note that ascii codes
			
 
				+//     are a subset.  RANGE() specifies an inclusive range of code points.
			
 
				+//
			
 
				+//     This defines two functions:
			
 
				+//
			
 
				+//       bool is_my_fave(const char *str, int len)
			
 
				+//       bool is_my_fave(int c)
			
 
				+//
			
 
				+//     Each returns true for precisely the 12 characters specified above.
			
 
				+//     Each takes a *single* UTF8 char as its argument -- the first expresses
			
 
				+//     it as a char * and a length, the second as a Unicode code point.
			
 
				+//     Please do not pass a string of multiple UTF8 chars to the first one.
			
 
				+//
			
 
				+//     To make is_my_fave() externally accessible, put in your .h file:
			
 
				+//
			
 
				+//       DECLARE_CHAR_PROPERTY(my_fave)
			
 
				+//
			
 
				+// (2) Function-based:
			
 
				+//
			
 
				+//     Specify a function that assigns the desired chars to a CharProperty
			
 
				+//     object.  Example:
			
 
				+//
			
 
				+//       DEFINE_CHAR_PROPERTY(my_other_fave, prop) {
			
 
				+//         for (int i = '0'; i <= '9'; i += 2) {
			
 
				+//           prop->AddChar(i);
			
 
				+//         }
			
 
				+//         prop->AddAsciiPredicate(&ispunct);
			
 
				+//         prop->AddCharProperty("currency_symbol");
			
 
				+//       }
			
 
				+//
			
 
				+//     This defines a function of one arg: CharProperty *prop.  The function
			
 
				+//     calls various CharProperty methods to populate the prop.  The last call
			
 
				+//     above, AddCharProperty(), adds the chars from another char property
			
 
				+//     ("currency_symbol").
			
 
				+//
			
 
				+//     As in the set-based case, put a DECLARE_CHAR_PROPERTY(my_other_fave)
			
 
				+//     in your .h if you want is_my_other_fave() to be externally accessible.
			
 
				+//
			
 
				+
			
 
				+#ifndef SYNTAXNET_CHAR_PROPERTIES_H_
			
 
				+#define SYNTAXNET_CHAR_PROPERTIES_H_
			
 
				+
			
 
				+#include <string>  // for string
			
 
				+
			
 
				+#include "syntaxnet/registry.h"
			
 
				+#include "syntaxnet/utils.h"
			
 
				+
			
 
				+// =====================================================================
			
 
				+// Registry for accessing CharProperties by name
			
 
				+//
			
 
				+// This is for internal use by the CharProperty class and macros; callers
			
 
				+// should not use it explicitly.
			
 
				+//
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+class CharProperty;   // forward declaration
			
 
				+
			
 
				+// Wrapper around a CharProperty, allowing it to be stored in a registry.
				+// Concrete wrappers are generated by REGISTER_CHAR_PROPERTY, one per
				+// property; each implements GetCharProperty() to hand back its property.
				+struct CharPropertyWrapper : RegisterableClass<CharPropertyWrapper> {
				+  virtual ~CharPropertyWrapper() { }
				+  // Returns the wrapped CharProperty.
				+  virtual CharProperty *GetCharProperty() = 0;
				+};
			
 
				+
			
 
				+// Registers `component` as a CharPropertyWrapper by forwarding to
				+// REGISTER_CLASS_COMPONENT.  `type` is the registration key;
				+// REGISTER_CHAR_PROPERTY passes the stringized property name for it.
				+#define REGISTER_CHAR_PROPERTY_WRAPPER(type, component) \
				+  REGISTER_CLASS_COMPONENT(CharPropertyWrapper, type, component)
			
 
				+
			
 
				+// Generates a CharPropertyWrapper subclass named <name>CharPropertyWrapper
				+// whose GetCharProperty() returns lsp.get(), and registers it under the
				+// stringized property name.
				+//
				+//   lsp:  object holding the CharProperty; must provide get().
				+//   name: property name, used both as an identifier prefix and as the
				+//         registration key.
				+//
				+// GetCharProperty() is marked `override` so that any future change to the
				+// base-class signature is diagnosed at compile time instead of silently
				+// leaving the generated struct abstract or unrelated.
				+#define REGISTER_CHAR_PROPERTY(lsp, name)                            \
				+  struct name##CharPropertyWrapper : public CharPropertyWrapper {    \
				+    CharProperty *GetCharProperty() override { return lsp.get(); }   \
				+  };                                                                 \
				+  REGISTER_CHAR_PROPERTY_WRAPPER(#name, name##CharPropertyWrapper)
			
 
				+
			
 
				+// =====================================================================
			
 
				+// Macros for defining character properties
			
 
				+//
			
 
				+
			
 
				+// Emits the two is_<name>() overloads for a char property.  One takes a
				+// single UTF8 character as (str, len), the other a Unicode code point; both
				+// simply forward to HoldsFor() on `lsp`.
				+#define DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(lsp, name)                         \
				+  bool is_##name(const char *str, int len) { return lsp->HoldsFor(str, len); } \
				+  bool is_##name(int c) { return lsp->HoldsFor(c); }
			
 
				+
			
 
				+// Define a char property by enumerating the unicode char points,
				+// or RANGE()s thereof, for which it holds.  Example:
				+//
				+//   DEFINE_CHAR_PROPERTY_AS_SET(my_fave,
				+//     'q',
				+//     RANGE('0', '9'),
				+//     0x20AB,
				+//   )
				+//
				+// Expansion, in order:
				+//   1. a static int array k_<name>_unicodes holding the listed values;
				+//   2. a lazily constructed CharProperty <name>_char_property built from
				+//      the property name, that array and its length;
				+//   3. registration of the property under its name;
				+//   4. the is_<name>() functions.
				+//
				+// The code points are accepted through the standard variadic-macro form
				+// ("..." with __VA_ARGS__) rather than the GNU-only named form
				+// ("unicodes..."), so the macro does not depend on a compiler extension.
				+#define DEFINE_CHAR_PROPERTY_AS_SET(name, ...)                                 \
				+  static const int k_##name##_unicodes[] = {__VA_ARGS__};                      \
				+  static utils::LazyStaticPtr<CharProperty, const char *, const int *, size_t> \
				+      name##_char_property = {#name, k_##name##_unicodes,                      \
				+                              arraysize(k_##name##_unicodes)};                 \
				+  REGISTER_CHAR_PROPERTY(name##_char_property, name);                          \
				+  DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(name##_char_property, name)
			
 
				+
			
 
				+// Specify a range (inclusive) of Unicode character values.
				+// Example: RANGE('0', '9') specifies the 10 digits.
				+// For use as an element in a DEFINE_CHAR_PROPERTY_AS_SET() list.
				+//
				+// RANGE(lower, upper) expands to four consecutive array elements:
				+// kPreUnicodeRange, lower, upper, kPostUnicodeRange.  The two markers are
				+// negative, so they differ from every non-negative code point in the list.
				+static const int kPreUnicodeRange = -1;
				+static const int kPostUnicodeRange = -2;
				+#define RANGE(lower, upper) \
				+  kPreUnicodeRange, lower, upper, kPostUnicodeRange
			
 
				+
			
 
				+// A function to initialize a CharProperty.
				+// This is a function type (not a pointer type); DEFINE_CHAR_PROPERTY and
				+// the function-based CharProperty constructor use CharPropertyInitializer *.
				+typedef void CharPropertyInitializer(CharProperty *prop);
			
 
				+
			
 
				+// Define a char property by specifying a block of code that initializes it.
				+// Example:
				+//
				+//   DEFINE_CHAR_PROPERTY(my_other_fave, prop) {
				+//     for (int i = '0'; i <= '9'; i += 2) {
				+//       prop->AddChar(i);
				+//     }
				+//     prop->AddAsciiPredicate(&ispunct);
				+//     prop->AddCharProperty("currency_symbol");
				+//   }
				+//
				+// Expansion, in order:
				+//   1. a forward declaration of init_<name>_char_property();
				+//   2. a lazily constructed CharProperty <name>_char_property built from
				+//      the property name and a pointer to that initializer;
				+//   3. registration of the property under its name;
				+//   4. the is_<name>() functions;
				+//   5. the header of the initializer's definition -- the macro ends there on
				+//      purpose, so the { ... } block written after the macro invocation
				+//      becomes the initializer's body and `charpropvar` names its parameter.
				+#define DEFINE_CHAR_PROPERTY(name, charpropvar)                       \
				+  static void init_##name##_char_property(CharProperty *charpropvar); \
				+  static utils::LazyStaticPtr<CharProperty, const char *,             \
				+                              CharPropertyInitializer *>              \
				+      name##_char_property = {#name, &init_##name##_char_property};   \
				+  REGISTER_CHAR_PROPERTY(name##_char_property, name);                 \
				+  DEFINE_IS_X_CHAR_PROPERTY_FUNCTIONS(name##_char_property, name)     \
				+  static void init_##name##_char_property(CharProperty *charpropvar)
			
 
				+
			
 
				+// =====================================================================
			
 
				+// Macro for declaring character properties
			
 
				+//
			
 
				+
			
 
				+// Declares the two is_<name>() functions of a char property defined
				+// elsewhere with DEFINE_CHAR_PROPERTY_AS_SET or DEFINE_CHAR_PROPERTY:
				+//
				+//   bool is_<name>(const char *str, int len);  // single UTF8 char
				+//   bool is_<name>(int c);                     // Unicode code point
				+//
				+// The macro supplies its own terminating semicolons, so it may be used
				+// with or without a trailing ';'.  The last line deliberately has no
				+// line-continuation backslash: a trailing one would splice whatever
				+// non-blank line follows the definition into the macro body.
				+#define DECLARE_CHAR_PROPERTY(name)                \
				+  extern bool is_##name(const char *str, int len); \
				+  extern bool is_##name(int c);
			
 
				+
			
 
				+// ===========================================================
			
 
				+// CharProperty - a property that holds for selected Unicode chars
			
 
				+//
			
 
				+// A CharProperty is semantically equivalent to set<char32>.
			
 
				+//
			
 
				+// The characters for which a CharProperty holds are represented as a trie,
			
 
				+// i.e., a tree that is indexed by successive bytes of the UTF-8 encoding
			
 
				+// of the characters.  This permits fast lookup (HoldsFor).
			
 
				+//
			
 
				+
			
 
				+// A function that defines a subset of [0..255], e.g., isspace.
				+// This is a function type; CharProperty::AddAsciiPredicate() takes an
				+// AsciiPredicate *.
				+typedef int AsciiPredicate(int c);
			
 
				+
			
 
				+class CharProperty {
				+ public:
				+  // Constructor for set-based char properties.
				+  // `unicodes` points at `num_unicodes` ints in the format produced by
				+  // DEFINE_CHAR_PROPERTY_AS_SET: code points, with each RANGE() contributing
				+  // kPreUnicodeRange, lower, upper, kPostUnicodeRange.
				+  CharProperty(const char *name, const int *unicodes, int num_unicodes);
				+
				+  // Constructor for function-based char properties.
				+  // NOTE(review): presumably `init_fn` is invoked on the new object to
				+  // populate it -- confirm in char_properties.cc.
				+  CharProperty(const char *name, CharPropertyInitializer *init_fn);
				+
				+  virtual ~CharProperty();
				+
				+  // Various ways of adding chars to a CharProperty; for use only in
				+  // CharPropertyInitializer functions.
				+  //
				+  // AddChar:           adds a single Unicode code point.
				+  // AddCharRange:      adds the code points from c1 to c2 (presumably
				+  //                    inclusive, like RANGE() -- confirm in the .cc).
				+  // AddAsciiPredicate: adds chars selected by `pred`, e.g. &ispunct.
				+  // AddCharProperty:   adds the chars of the char property called `name`.
				+  // AddCharSpec:       adds chars from an array of `num_unicodes` ints
				+  //                    (presumably the same format as the set-based
				+  //                    constructor's `unicodes` -- confirm in the .cc).
				+  void AddChar(int c);
				+  void AddCharRange(int c1, int c2);
				+  void AddAsciiPredicate(AsciiPredicate *pred);
				+  void AddCharProperty(const char *name);
				+  void AddCharSpec(const int *unicodes, int num_unicodes);
				+
				+  // Return true iff the CharProperty holds for a single given UTF8 char.
				+  // `str` points at the encoded char and `len` is its length; do not pass
				+  // a string of several UTF8 chars.
				+  bool HoldsFor(const char *str, int len) const;
				+
				+  // Return true iff the CharProperty holds for a single given Unicode char.
				+  bool HoldsFor(int c) const;
				+
				+  // You can use this to enumerate the set elements (it was easier
				+  // than defining a real iterator).  Returns -1 if there are no more.
				+  // Call with -1 to get the first element.  Expects c == -1 or HoldsFor(c).
				+  int NextElementAfter(int c) const;
				+
				+  // Return NULL or the CharProperty with the given name.  Looks up the name
				+  // in a CharProperty registry.
				+  static const CharProperty *Lookup(const char *name);
				+
				+ private:
				+  // NOTE(review): presumably validates that `c` is an acceptable Unicode
				+  // value -- the behavior on failure is defined in char_properties.cc.
				+  void CheckUnicodeVal(int c) const;
				+  // Returns a string form of code point `c`; the exact format is defined in
				+  // char_properties.cc.
				+  static string UnicodeToString(int c);
				+
				+  // Name of this property, as passed to the constructor.
				+  const char *name_;
				+  // Opaque implementation; the class comment describes it as a trie indexed
				+  // by successive UTF-8 bytes.
				+  struct CharPropertyImplementation *impl_;
				+
				+  TF_DISALLOW_COPY_AND_ASSIGN(CharProperty);
				+};
			
 
				+
			
 
				+//======================================================================
			
 
				+// Expression-level punctuation
			
 
				+//
			
 
				+
			
 
				+// Each declaration below exposes is_<name>(const char *str, int len) and
				+// is_<name>(int c) for the named property.
				+
				+// Punctuation that starts a sentence.
				+DECLARE_CHAR_PROPERTY(start_sentence_punc);
				+
				+// Punctuation that ends a sentence.
				+DECLARE_CHAR_PROPERTY(end_sentence_punc);
				+
				+// Punctuation, such as parens, that opens a "nested expression" of text.
				+DECLARE_CHAR_PROPERTY(open_expr_punc);
				+
				+// Punctuation, such as parens, that closes a "nested expression" of text.
				+DECLARE_CHAR_PROPERTY(close_expr_punc);
				+
				+// Chars that open a quotation.
				+DECLARE_CHAR_PROPERTY(open_quote);
				+
				+// Chars that close a quotation.
				+DECLARE_CHAR_PROPERTY(close_quote);
				+
				+// Punctuation chars that open an expression or a quotation.
				+DECLARE_CHAR_PROPERTY(open_punc);
				+
				+// Punctuation chars that close an expression or a quotation.
				+DECLARE_CHAR_PROPERTY(close_punc);
				+
				+// Punctuation chars that can come at the beginning of a sentence.
				+DECLARE_CHAR_PROPERTY(leading_sentence_punc);
				+
				+// Punctuation chars that can come at the end of a sentence.
				+DECLARE_CHAR_PROPERTY(trailing_sentence_punc);
				+
				+//======================================================================
				+// Token-level punctuation
				+//
				+
				+// Token-prefix symbols -- glom on to following token
				+// (esp. if no space after) -- except for currency symbols.
				+DECLARE_CHAR_PROPERTY(noncurrency_token_prefix_symbol);
				+
				+// Token-prefix symbols -- glom on to following token (esp. if no space after).
				+// Currency symbols plus the non-currency prefix symbols.
				+DECLARE_CHAR_PROPERTY(token_prefix_symbol);
				+
				+// Token-suffix symbols -- glom on to preceding token (esp. if no space
				+// before).
				+DECLARE_CHAR_PROPERTY(token_suffix_symbol);
				+
				+// Subscripts.
				+DECLARE_CHAR_PROPERTY(subscript_symbol);
				+
				+// Superscripts.
				+DECLARE_CHAR_PROPERTY(superscript_symbol);
				+
				+//======================================================================
				+// General punctuation
				+//
				+
				+// Connector punctuation.
				+DECLARE_CHAR_PROPERTY(connector_punc);
				+
				+// Dashes.
				+DECLARE_CHAR_PROPERTY(dash_punc);
				+
				+// Other punctuation.
				+DECLARE_CHAR_PROPERTY(other_punc);
				+
				+// All punctuation: the punctuation properties above plus the chars selected
				+// by ispunct().
				+DECLARE_CHAR_PROPERTY(punctuation);
				+
				+//======================================================================
				+// Special symbols
				+//
				+
				+// Currency symbols.
				+DECLARE_CHAR_PROPERTY(currency_symbol);
				+
				+// Chinese bookquotes (U+300A opens, U+300B closes).
				+DECLARE_CHAR_PROPERTY(open_bookquote);
				+DECLARE_CHAR_PROPERTY(close_bookquote);
				+
				+//======================================================================
				+// Separators
				+//
				+
				+// Line separators.
				+DECLARE_CHAR_PROPERTY(line_separator);
				+
				+// Paragraph separators.
				+DECLARE_CHAR_PROPERTY(paragraph_separator);
				+
				+// Space separators.
				+DECLARE_CHAR_PROPERTY(space_separator);
				+
				+// Separators -- all line, paragraph, and space separators, plus the chars
				+// selected by isspace().
				+DECLARE_CHAR_PROPERTY(separator);
				+
				+//======================================================================
				+// Alphanumeric Characters
				+//
				+
				+// Digits (ASCII, Arabic-Indic and Eastern Arabic-Indic only).
				+DECLARE_CHAR_PROPERTY(digit);
				+
				+// Japanese Katakana.
				+DECLARE_CHAR_PROPERTY(katakana);
				+
				+//======================================================================
				+// BiDi Directional Formatting Codes
				+//
				+
				+// Explicit directional formatting codes (LRM, RLM, LRE, RLE, PDF, LRO, RLO)
				+// used by the bidirectional algorithm.
				+//
				+// Note: Use this only to classify characters. To actually determine
				+// directionality of BiDi text, look under i18n/bidi.
				+//
				+// See http://www.unicode.org/reports/tr9/ for a description of the algorithm
				+// and http://www.unicode.org/charts/PDF/U2000.pdf for the character codes.
				+DECLARE_CHAR_PROPERTY(directional_formatting_code);
				+
				+//======================================================================
				+// Special collections
				+//
				+
				+// Punctuation, subscripts, superscripts and token prefix/suffix symbols.
				+// NB: This does not check for all punctuation and symbols in the standard;
				+// just those listed in our code. See the definitions in char_properties.cc.
				+DECLARE_CHAR_PROPERTY(punctuation_or_symbol);
			
 
				+
			
 
				+}  // namespace syntaxnet
			
 
				+
			
 
				+#endif  // SYNTAXNET_CHAR_PROPERTIES_H_
			
--- a/syntaxnet/syntaxnet/char_properties_test.cc
+++ b/syntaxnet/syntaxnet/char_properties_test.cc
@@ -0,0 +1,364 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+// Tests for char_properties.cc:
			
 
				+//
			
 
				+// (1) Test the DEFINE_CHAR_PROPERTY_AS_SET and DEFINE_CHAR_PROPERTY macros
			
 
				+//     by defining a few fake char properties and verifying their contents.
			
 
				+//
			
 
				+// (2) Test the char properties defined in char_properties.cc by spot-checking
			
 
				+//     a few chars.
			
 
				+//
			
 
				+
			
 
				+#include "syntaxnet/char_properties.h"
			
 
				+
			
 
				+#include <ctype.h>   // for ispunct, isspace
				+#include <string.h>  // for memcpy
				+#include <map>
				+#include <set>
				+#include <utility>
				+#include <vector>
			
 
				+
			
 
				+#include <gmock/gmock.h>  // for ContainerEq, EXPECT_THAT
			
 
				+#include "tensorflow/core/platform/test.h"
			
 
				+#include "third_party/utf/utf.h"
			
 
				+#include "util/utf8/unilib.h"  // for IsValidCodepoint, etc
			
 
				+#include "util/utf8/unilib_utf8_utils.h"
			
 
				+
			
 
				+using ::testing::ContainerEq;
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+// Invalid UTF-8 bytes are decoded as the Replacement Character, U+FFFD
			
 
				+// (which is also Runeerror). Invalid code points are encoded in UTF-8
			
 
				+// with the UTF-8 representation of the Replacement Character.
			
 
				+static const char ReplacementCharacterUTF8[3] = {'\xEF', '\xBF', '\xBD'};  // Exactly 3 bytes, no trailing NUL.
			
 
				+
			
 
				+// ====================================================================
			
 
				+// CharPropertiesTest
			
 
				+//
			
 
				+
			
 
				+// Test fixture that accumulates a set of expected code points and compares
				+// it against CharProperty instances looked up by name.
				+class CharPropertiesTest : public testing::Test {
				+ protected:
				+  // Adds every code point in |chars| to the collected set.
				+  void CollectChars(const std::set<char32> &chars) {
				+    collected_set_.insert(chars.begin(), chars.end());
				+  }
				+
				+  // Adds the first |len| elements of |arr| to the collected set.
				+  void CollectArray(const char32 arr[], int len) {
				+    collected_set_.insert(arr, arr + len);
				+  }
				+
				+  // Adds every valid code point in [0, 0x10FFFF] for which the CharProperty
				+  // registered under |name| holds.  Records a fatal failure and returns
				+  // early if no such property exists.
				+  void CollectCharProperty(const char *name) {
				+    const CharProperty *prop = CharProperty::Lookup(name);
				+    ASSERT_TRUE(prop != nullptr) << "for " << name;
				+
				+    for (char32 c = 0; c <= 0x10FFFF; ++c) {
				+      if (UniLib::IsValidCodepoint(c) && prop->HoldsFor(c)) {
				+        collected_set_.insert(c);
				+      }
				+    }
				+  }
				+
				+  // Adds every value in [0, 255] for which |pred| returns non-zero.
				+  void CollectAsciiPredicate(AsciiPredicate *pred) {
				+    for (char32 c = 0; c < 256; ++c) {
				+      if ((*pred)(c)) {
				+        collected_set_.insert(c);
				+      }
				+    }
				+  }
				+
				+  // Expects the named char property to hold for precisely the chars in the
				+  // collected set.  Records a fatal failure and returns early if the
				+  // property does not exist.
				+  void ExpectCharPropertyEqualsCollectedSet(const char *name) {
				+    const CharProperty *prop = CharProperty::Lookup(name);
				+    ASSERT_TRUE(prop != nullptr) << "for " << name;
				+
				+    // The property must hold for all collected chars.  Exercises both
				+    // signatures of CharProperty::HoldsFor(); each failure names the
				+    // property and the offending code point.
				+    for (const char32 c : collected_set_) {
				+      // UTF-8 (pointer, length) overload.
				+      const string utf8_char = EncodeAsUTF8(&c, 1);
				+      EXPECT_TRUE(prop->HoldsFor(utf8_char.c_str(), utf8_char.size()))
				+          << "for " << name << ", code point " << static_cast<int>(c);
				+
				+      // Integer code point overload.
				+      EXPECT_TRUE(prop->HoldsFor(static_cast<int>(c)))
				+          << "for " << name << ", code point " << static_cast<int>(c);
				+    }
				+
				+    // The property must hold for precisely the collected chars.  Somewhat
				+    // redundant with the loop above, but exercises
				+    // CharProperty::NextElementAfter(), which is iterated until it returns a
				+    // negative value.
				+    std::set<char32> actual_chars;
				+    int c = -1;
				+    while ((c = prop->NextElementAfter(c)) >= 0) {
				+      actual_chars.insert(static_cast<char32>(c));
				+    }
				+    EXPECT_THAT(actual_chars, ContainerEq(collected_set_))
				+        << " for " << name;
				+  }
				+
				+  // Expects the named char property to hold for at least the chars in the
				+  // collected set.  Records a fatal failure and returns early if the
				+  // property does not exist.
				+  void ExpectCharPropertyContainsCollectedSet(const char *name) {
				+    const CharProperty *prop = CharProperty::Lookup(name);
				+    ASSERT_TRUE(prop != nullptr) << "for " << name;
				+
				+    for (const char32 c : collected_set_) {
				+      EXPECT_TRUE(prop->HoldsFor(static_cast<int>(c)))
				+          << "for " << name << ", code point " << static_cast<int>(c);
				+    }
				+  }
				+
				+  // Returns the concatenated UTF-8 encodings of the |size| code points
				+  // starting at |in|.
				+  string EncodeAsUTF8(const char32 *in, int size) {
				+    string out;
				+    out.reserve(size);
				+    for (int i = 0; i < size; ++i) {
				+      char buf[UTFmax];
				+      const int len = EncodeAsUTF8Char(in[i], buf);
				+      out.append(buf, len);
				+    }
				+    return out;
				+  }
				+
				+  // Writes the UTF-8 encoding of |in| to |out| and returns the number of
				+  // bytes written.  Invalid code points are written as the Replacement
				+  // Character.
				+  int EncodeAsUTF8Char(char32 in, char *out) {
				+    if (UniLib::IsValidCodepoint(in)) {
				+      return runetochar(out, &in);
				+    }
				+    // Use the array's own size so the copy length cannot drift from it.
				+    memcpy(out, ReplacementCharacterUTF8, sizeof(ReplacementCharacterUTF8));
				+    return static_cast<int>(sizeof(ReplacementCharacterUTF8));
				+  }
				+
				+ private:
				+  // Code points gathered by the Collect*() helpers above.
				+  std::set<char32> collected_set_;
				+};
			
 
				+
			
 
				+//======================================================================
			
 
				+// Declarations of the sample character sets below
			
 
				+// (to test the DECLARE_CHAR_PROPERTY() macro)
			
 
				+//
			
 
				+
			
 
				+DECLARE_CHAR_PROPERTY(test_digit);  // Defined below with DEFINE_CHAR_PROPERTY_AS_SET.
			
 
				+DECLARE_CHAR_PROPERTY(test_wavy_dash);  // Defined below with DEFINE_CHAR_PROPERTY_AS_SET.
			
 
				+DECLARE_CHAR_PROPERTY(test_digit_or_wavy_dash);  // Defined below with DEFINE_CHAR_PROPERTY.
			
 
				+DECLARE_CHAR_PROPERTY(test_punctuation_plus);  // Defined below with DEFINE_CHAR_PROPERTY.
			
 
				+
			
 
				+//======================================================================
			
 
				+// Definitions of sample character sets
			
 
				+//
			
 
				+
			
 
				+// Digits: the ASCII characters '0' through '9', given as one RANGE entry.
			
 
				+DEFINE_CHAR_PROPERTY_AS_SET(test_digit,
			
 
				+  RANGE('0', '9'),
			
 
				+)
			
 
				+
			
 
				+// Wavy dashes: the ASCII tilde plus two CJK dash code points, listed one by one.
			
 
				+DEFINE_CHAR_PROPERTY_AS_SET(test_wavy_dash,
			
 
				+  '~',  // tilde (U+007E)
			
 
				+  0x301C,  // wave dash
			
 
				+  0x3030,  // wavy dash
			
 
				+)
			
 
				+
			
 
				+// Digits or wavy dashes: built by adding the two properties above by name.
			
 
				+DEFINE_CHAR_PROPERTY(test_digit_or_wavy_dash, prop) {
			
 
				+  prop->AddCharProperty("test_digit");
			
 
				+  prop->AddCharProperty("test_wavy_dash");
			
 
				+}
			
 
				+
			
 
				+// Punctuation plus the letters 'a'-'j'; exercises each Add*() method once.
			
 
				+DEFINE_CHAR_PROPERTY(test_punctuation_plus, prop) {
			
 
				+  prop->AddChar('a');  // single char
			
 
				+  prop->AddCharRange('b', 'b');  // range holding one char
			
 
				+  prop->AddCharRange('c', 'e');  // range holding several chars
			
 
				+  static const int kUnicodes[] = {'f', RANGE('g', 'i'), 'j'};  // spec mixing single chars and a RANGE
			
 
				+  prop->AddCharSpec(kUnicodes, arraysize(kUnicodes));
			
 
				+  prop->AddCharProperty("punctuation");  // another property, referenced by name
			
 
				+}
			
 
				+
			
 
				+//====================================================================
			
 
				+// Another form of the character sets above -- for verification
			
 
				+//
			
 
				+
			
 
				+const char32 kTestDigit[] = {  // Expected members of test_digit, listed explicitly.
			
 
				+  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
			
 
				+};
			
 
				+
			
 
				+const char32 kTestWavyDash[] = {  // Expected members of test_wavy_dash.
			
 
				+  '~',
			
 
				+  0x301C,  // wave dash
			
 
				+  0x3030,  // wavy dash
			
 
				+};
			
 
				+
			
 
				+// The letters that test_punctuation_plus adds on top of "punctuation".
				+const char32 kTestPunctuationPlusExtras[] = {
				+  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
				+};
			
 
				+
			
 
				+// ====================================================================
			
 
				+// Tests
			
 
				+//
			
 
				+
			
 
				+// test_digit must hold for exactly the chars listed in kTestDigit.
				+TEST_F(CharPropertiesTest, TestDigit) {
				+  CollectChars(
				+      std::set<char32>(kTestDigit, kTestDigit + arraysize(kTestDigit)));
				+  ExpectCharPropertyEqualsCollectedSet("test_digit");
				+}
			
 
				+
			
 
				+// test_wavy_dash must hold for exactly the chars listed in kTestWavyDash.
				+TEST_F(CharPropertiesTest, TestWavyDash) {
				+  CollectChars(std::set<char32>(
				+      kTestWavyDash, kTestWavyDash + arraysize(kTestWavyDash)));
				+  ExpectCharPropertyEqualsCollectedSet("test_wavy_dash");
				+}
			
 
				+
			
 
				+// test_digit_or_wavy_dash must hold for exactly the union of both arrays.
				+TEST_F(CharPropertiesTest, TestDigitOrWavyDash) {
				+  std::set<char32> expected(kTestDigit, kTestDigit + arraysize(kTestDigit));
				+  expected.insert(kTestWavyDash, kTestWavyDash + arraysize(kTestWavyDash));
				+  CollectChars(expected);
				+  ExpectCharPropertyEqualsCollectedSet("test_digit_or_wavy_dash");
				+}
			
 
				+
			
 
				+// test_punctuation_plus must hold for exactly the extra letters plus
				+// everything the "punctuation" property holds for.
				+TEST_F(CharPropertiesTest, TestPunctuationPlus) {
				+  const int num_extras = arraysize(kTestPunctuationPlusExtras);
				+  CollectArray(kTestPunctuationPlusExtras, num_extras);
				+  CollectCharProperty("punctuation");
				+  ExpectCharPropertyEqualsCollectedSet("test_punctuation_plus");
				+}
			
 
				+
			
 
				+// ====================================================================
			
 
				+// Spot-check predicates in char_properties.cc
			
 
				+//
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, StartSentencePunc) {
				+  // U+00A1 inverted exclamation mark, U+00BF inverted question mark.
				+  const std::set<char32> expected = {0x00A1, 0x00BF};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("start_sentence_punc");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, EndSentencePunc) {
				+  // ASCII sentence terminators.
				+  const std::set<char32> expected = {'.', '!', '?'};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("end_sentence_punc");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, OpenExprPunc) {
				+  // ASCII opening brackets.
				+  const std::set<char32> expected = {'(', '['};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("open_expr_punc");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, CloseExprPunc) {
				+  // ASCII closing brackets.
				+  const std::set<char32> expected = {')', ']'};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("close_expr_punc");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, OpenQuote) {
				+  // The ASCII quotes must count as opening quotes.
				+  const std::set<char32> expected = {'\'', '"'};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("open_quote");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, CloseQuote) {
				+  // The ASCII quotes must count as closing quotes.
				+  const std::set<char32> expected = {'\'', '"'};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("close_quote");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, OpenBookquote) {
				+  // U+300A left double angle bracket.
				+  const std::set<char32> expected = {0x300A};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("open_bookquote");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, CloseBookquote) {
				+  // U+300B right double angle bracket.
				+  const std::set<char32> expected = {0x300B};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("close_bookquote");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, OpenPunc) {
				+  // Opening brackets and the ASCII quotes, collected in one call.
				+  CollectChars({'(', '[', '\'', '"'});
				+  ExpectCharPropertyContainsCollectedSet("open_punc");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, ClosePunc) {
				+  // Closing brackets and the ASCII quotes, collected in one call.
				+  CollectChars({')', ']', '\'', '"'});
				+  ExpectCharPropertyContainsCollectedSet("close_punc");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, LeadingSentencePunc) {
				+  // Opening brackets, ASCII quotes, and the inverted exclamation/question
				+  // marks (U+00A1, U+00BF), collected in one call.
				+  CollectChars({'(', '[', '\'', '"', 0x00A1, 0x00BF});
				+  ExpectCharPropertyContainsCollectedSet("leading_sentence_punc");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, TrailingSentencePunc) {
				+  // Closing brackets, ASCII quotes, and ASCII sentence terminators,
				+  // collected in one call.
				+  CollectChars({')', ']', '\'', '"', '.', '!', '?'});
				+  ExpectCharPropertyContainsCollectedSet("trailing_sentence_punc");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, NoncurrencyTokenPrefixSymbol) {
				+  // The number sign must be a non-currency token prefix.
				+  const std::set<char32> expected = {'#'};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("noncurrency_token_prefix_symbol");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, TokenSuffixSymbol) {
				+  // Percent sign, U+2122 trade mark sign, U+00A9 copyright sign,
				+  // U+00B0 degree sign.
				+  const std::set<char32> expected = {'%', 0x2122, 0x00A9, 0x00B0};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("token_suffix_symbol");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, TokenPrefixSymbol) {
				+  // Number sign plus the currency symbols dollar, U+00A5 yen and
				+  // U+20AC euro, collected in one call.
				+  CollectChars({'#', '$', 0x00A5, 0x20AC});
				+  ExpectCharPropertyContainsCollectedSet("token_prefix_symbol");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, SubscriptSymbol) {
				+  // U+2082 subscript two, U+2083 subscript three.
				+  const std::set<char32> expected = {0x2082, 0x2083};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("subscript_symbol");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, SuperscriptSymbol) {
				+  // U+00B2 superscript two, U+00B3 superscript three.
				+  const std::set<char32> expected = {0x00B2, 0x00B3};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("superscript_symbol");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, CurrencySymbol) {
				+  // Dollar sign, U+00A5 yen sign, U+20AC euro sign.
				+  const std::set<char32> expected = {'$', 0x00A5, 0x20AC};
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("currency_symbol");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, DirectionalFormattingCode) {
				+  const std::set<char32> expected = {
				+      0x200E,  // LRM
				+      0x200F,  // RLM
				+      0x202A,  // LRE
				+      0x202B,  // RLE
				+      0x202C,  // PDF
				+      0x202D,  // LRO
				+      0x202E,  // RLO
				+  };
				+  CollectChars(expected);
				+  ExpectCharPropertyContainsCollectedSet("directional_formatting_code");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, Punctuation) {
				+  // Everything in [0, 255] accepted by ispunct() must be "punctuation".
				+  AsciiPredicate *const is_punct = ispunct;
				+  CollectAsciiPredicate(is_punct);
				+  ExpectCharPropertyContainsCollectedSet("punctuation");
				+}
			
 
				+
			
 
				+TEST_F(CharPropertiesTest, Separator) {
				+  // Everything in [0, 255] accepted by isspace() must be a "separator".
				+  AsciiPredicate *const is_space = isspace;
				+  CollectAsciiPredicate(is_space);
				+  ExpectCharPropertyContainsCollectedSet("separator");
				+}
			
 
				+
			
 
				+}  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/document_filters.cc
+++ b/syntaxnet/syntaxnet/document_filters.cc
@@ -77,7 +77,8 @@ class DocumentSource : public OpKernel {
 
				     OP_REQUIRES_OK(context, context->GetAttr("batch_size", &batch_size_));
			
 
				     OP_REQUIRES(context, batch_size_ > 0,
			
 
				                 InvalidArgument("invalid batch_size provided"));
			
 
				-    corpus_.reset(new TextReader(*task_context_.GetInput(corpus_name)));
			
 
				+    corpus_.reset(
			
 
				+        new TextReader(*task_context_.GetInput(corpus_name), &task_context_));
			
 
				   }
			
 
				 
			
 
				   void Compute(OpKernelContext *context) override {
			
@@ -124,7 +125,8 @@ class DocumentSink : public OpKernel {
 
				     GetTaskContext(context, &task_context_);
			
 
				     string corpus_name;
			
 
				     OP_REQUIRES_OK(context, context->GetAttr("corpus_name", &corpus_name));
			
 
				-    writer_.reset(new TextWriter(*task_context_.GetInput(corpus_name)));
			
 
				+    writer_.reset(
			
 
				+        new TextWriter(*task_context_.GetInput(corpus_name), &task_context_));
			
 
				   }
			
 
				 
			
 
				   void Compute(OpKernelContext *context) override {
			
--- a/syntaxnet/syntaxnet/document_format.h
+++ b/syntaxnet/syntaxnet/document_format.h
@@ -38,6 +38,8 @@ class DocumentFormat : public RegisterableClass<DocumentFormat> {
 
				   DocumentFormat() {}
			
 
				   virtual ~DocumentFormat() {}
			
 
				 
			
 
				+  virtual void Setup(TaskContext *context) {}  // Hook taking the task context; does nothing by default.
			
 
				+
			
 
				   // Reads a record from the given input buffer with format specific logic.
			
 
				   // Returns false if no record could be read because we reached end of file.
			
 
				   virtual bool ReadRecord(tensorflow::io::InputBuffer *buffer,
			
--- a/syntaxnet/syntaxnet/lexicon_builder.cc
+++ b/syntaxnet/syntaxnet/lexicon_builder.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
				 #include "syntaxnet/affix.h"
			
 
				 #include "syntaxnet/dictionary.pb.h"
			
 
				 #include "syntaxnet/feature_extractor.h"
			
 
				+#include "syntaxnet/segmenter_utils.h"
			
 
				 #include "syntaxnet/sentence.pb.h"
			
 
				 #include "syntaxnet/sentence_batch.h"
			
 
				 #include "syntaxnet/term_frequency_map.h"
			
@@ -75,6 +76,7 @@ class LexiconBuilder : public OpKernel {
 
				     TermFrequencyMap tags;
			
 
				     TermFrequencyMap categories;
			
 
				     TermFrequencyMap labels;
			
 
				+    TermFrequencyMap chars;
			
 
				 
			
 
				     // Affix tables to be populated by the corpus.
			
 
				     AffixTable prefixes(AffixTable::PREFIX, max_prefix_length_);
			
@@ -87,7 +89,7 @@ class LexiconBuilder : public OpKernel {
 
				     int64 num_tokens = 0;
			
 
				     int64 num_documents = 0;
			
 
				     Sentence *document;
			
 
				-    TextReader corpus(*task_context_.GetInput(corpus_name_));
			
 
				+    TextReader corpus(*task_context_.GetInput(corpus_name_), &task_context_);
			
 
				     while ((document = corpus.Read()) != nullptr) {
			
 
				       // Gather token information.
			
 
				       for (int t = 0; t < document->token_size(); ++t) {
			
@@ -114,6 +116,14 @@ class LexiconBuilder : public OpKernel {
 
				         // Add mapping from tag to category.
			
 
				         tag_to_category.SetCategory(token.tag(), token.category());
			
 
				 
			
 
				+        // Add characters.
			
 
				+        vector<tensorflow::StringPiece> char_sp;
			
 
				+        SegmenterUtils::GetUTF8Chars(word, &char_sp);
			
 
				+        for (const auto &c : char_sp) {
			
 
				+          const string c_str = c.ToString();
			
 
				+          if (!c_str.empty() && !HasSpaces(c_str)) chars.Increment(c_str);
			
 
				+        }
			
 
				+
			
 
				         // Update the number of processed tokens.
			
 
				         ++num_tokens;
			
 
				       }
			
@@ -131,6 +141,7 @@ class LexiconBuilder : public OpKernel {
 
				     categories.Save(
			
 
				         TaskContext::InputFile(*task_context_.GetInput("category-map")));
			
 
				     labels.Save(TaskContext::InputFile(*task_context_.GetInput("label-map")));
			
 
				+    chars.Save(TaskContext::InputFile(*task_context_.GetInput("char-map")));
			
 
				 
			
 
				     // Write affixes to disk.
			
 
				     WriteAffixTable(prefixes, TaskContext::InputFile(
			
--- a/syntaxnet/syntaxnet/lexicon_builder_test.py
+++ b/syntaxnet/syntaxnet/lexicon_builder_test.py
@@ -69,6 +69,8 @@ TOKENIZED_DOCS = u'''बात गलत हो तो गुस्सा से
 
				 लेकिन अभिनेत्री के इस कदम से वहां रंग में भंग पड़ गया ।
			
 
				 '''
			
 
				 
			
 
				+CHARS = u'''अ इ आ क ग ज ट त द न प भ ब य म र ल व ह स ि ा ु ी े ै ो ् ड़ । ं'''
			
 
				+
			
 
				 COMMENTS = u'# Line with fake comments.'
			
 
				 
			
 
				 
			
@@ -93,7 +95,7 @@ class LexiconBuilderTest(test_util.TensorFlowTestCase):
 
				     self.AddInput('documents', self.corpus_file, corpus_format, context)
			
 
				     for name in ('word-map', 'lcword-map', 'tag-map',
			
 
				                  'category-map', 'label-map', 'prefix-table',
			
 
				-                 'suffix-table', 'tag-to-category'):
			
 
				+                 'suffix-table', 'tag-to-category', 'char-map'):
			
 
				       self.AddInput(name, os.path.join(FLAGS.test_tmpdir, name), '', context)
			
 
				     logging.info('Writing context to: %s', self.context_file)
			
 
				     with open(self.context_file, 'w') as f:
			
@@ -133,6 +135,26 @@ class LexiconBuilderTest(test_util.TensorFlowTestCase):
 
				       self.assertIn(tag, TAGS)
			
 
				       self.assertIn(category, CATEGORIES)
			
 
				 
			
 
				+  def LoadMap(self, map_name):
			
 
				+    loaded_map = {}
			
 
				+    with file(os.path.join(FLAGS.test_tmpdir, map_name), 'r') as f:
			
 
				+      for line in f:
			
 
				+        entries = line.strip().split(' ')
			
 
				+        if len(entries) == 2:
			
 
				+          loaded_map[entries[0]] = entries[1]
			
 
				+    return loaded_map
			
 
				+
			
 
				+  def ValidateCharMap(self):
			
 
				+    char_map = self.LoadMap('char-map')
			
 
				+    self.assertEqual(len(char_map), len(CHARS.split(' ')))
			
 
				+    for char in CHARS.split(' '):
			
 
				+      self.assertIn(char.encode('utf-8'), char_map)
			
 
				+
			
 
				+  def ValidateWordMap(self):
			
 
				+    word_map = self.LoadMap('word-map')
			
 
				+    for word in filter(None, TOKENIZED_DOCS.replace('\n', ' ').split(' ')):
			
 
				+      self.assertIn(word.encode('utf-8'), word_map)
			
 
				+
			
 
				   def BuildLexicon(self):
			
 
				     with self.test_session():
			
 
				       gen_parser_ops.lexicon_builder(task_context=self.context_file).run()
			
@@ -146,6 +168,8 @@ class LexiconBuilderTest(test_util.TensorFlowTestCase):
 
				     self.ValidateDocuments()
			
 
				     self.BuildLexicon()
			
 
				     self.ValidateTagToCategoryMap()
			
 
				+    self.ValidateCharMap()
			
 
				+    self.ValidateWordMap()
			
 
				 
			
 
				   def testCoNLLFormatExtraNewlinesAndComments(self):
			
 
				     self.WriteContext('conll-sentence')
			
--- a/syntaxnet/syntaxnet/morpher_transitions.cc
+++ b/syntaxnet/syntaxnet/morpher_transitions.cc
@@ -0,0 +1,298 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+// Morpher transition system.
			
 
				+//
			
 
				+// This transition system has one type of actions:
			
 
				+//  - The SHIFT action pushes the next input token to the stack and
				+//    advances to the next input token, assigning a morphology label to the
				+//    token that was shifted.
			
 
				+//
			
 
				+// The transition system operates with parser actions encoded as integers:
			
 
				+//  - A SHIFT action is encoded as the index of the assigned morphology
				+//    label, starting from 0.
			
 
				+
			
 
				+#include <string>
			
 
				+
			
 
				+#include "syntaxnet/morphology_label_set.h"
			
 
				+#include "syntaxnet/parser_features.h"
			
 
				+#include "syntaxnet/parser_state.h"
			
 
				+#include "syntaxnet/parser_transitions.h"
			
 
				+#include "syntaxnet/sentence_features.h"
			
 
				+#include "syntaxnet/shared_store.h"
			
 
				+#include "syntaxnet/task_context.h"
			
 
				+#include "syntaxnet/term_frequency_map.h"
			
 
				+#include "syntaxnet/utils.h"
			
 
				+#include "tensorflow/core/lib/strings/strcat.h"
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+// Transition state for the morpher.  For every token of the sentence it
				+// stores the index of the morphology label assigned so far and the index of
				+// the gold label; both start out as -1.  Indices refer to the
				+// MorphologyLabelSet given at construction.
				+class MorphologyTransitionState : public ParserTransitionState {
				+ public:
				+  // Creates an empty state that resolves labels through |label_set|, which
				+  // is not owned.
				+  explicit MorphologyTransitionState(const MorphologyLabelSet *label_set)
				+      : label_set_(label_set) {}
				+
				+  // Creates a state that shares |state|'s label set and copies its assigned
				+  // and gold tags.
				+  explicit MorphologyTransitionState(const MorphologyTransitionState *state)
				+      : MorphologyTransitionState(state->label_set_) {
				+    tag_ = state->tag_;
				+    gold_tag_ = state->gold_tag_;
				+  }
				+
				+  // Clones the transition state by returning a new object.
				+  ParserTransitionState *Clone() const override {
				+    return new MorphologyTransitionState(this);
				+  }
				+
				+  // Sizes both tag vectors to the sentence length and reads the gold tag of
				+  // each token.
				+  void Init(ParserState *state) override {
				+    const int num_tokens = state->sentence().token_size();
				+    tag_.resize(num_tokens, -1);
				+    gold_tag_.resize(num_tokens, -1);
				+    for (int pos = 0; pos < num_tokens; ++pos) {
				+      const Token &token = state->GetToken(pos);
				+
				+      // NOTE: we allow token to not have a TokenMorphology extension or for the
				+      // TokenMorphology to be absent from the label_set_ because this can
				+      // happen at test time.
				+      gold_tag_[pos] = label_set_->LookupExisting(
				+          token.GetExtension(TokenMorphology::morphology));
				+    }
				+  }
				+
				+  // Returns the tag assigned to a given token, or -1 for index -1.
				+  int Tag(int index) const {
				+    // Index -1 is accepted, as in GoldTag(): the return expression below
				+    // handles it explicitly.  The size is cast so that the comparison is a
				+    // signed one and is meaningful for -1.
				+    DCHECK_GE(index, -1);
				+    DCHECK_LT(index, static_cast<int>(tag_.size()));
				+    return index == -1 ? -1 : tag_[index];
				+  }
				+
				+  // Sets this tag on the token at index.
				+  void SetTag(int index, int tag) {
				+    DCHECK_GE(index, 0);
				+    DCHECK_LT(index, static_cast<int>(tag_.size()));
				+    tag_[index] = tag;
				+  }
				+
				+  // Returns the gold tag for a given token, or -1 for index -1.
				+  int GoldTag(int index) const {
				+    DCHECK_GE(index, -1);
				+    DCHECK_LT(index, static_cast<int>(gold_tag_.size()));
				+    return index == -1 ? -1 : gold_tag_[index];
				+  }
				+
				+  // Returns the proto corresponding to the tag, or an empty proto if the tag is
				+  // not found.
				+  const TokenMorphology &TagAsProto(int tag) const {
				+    if (tag >= 0 && tag < label_set_->Size()) {
				+      return label_set_->Lookup(tag);
				+    }
				+    return TokenMorphology::default_instance();
				+  }
				+
				+  // Writes the assigned morphology of every token into |sentence|.  |state|
				+  // and |rewrite_root_labels| are not used.
				+  void AddParseToDocument(const ParserState &state, bool rewrite_root_labels,
				+                          Sentence *sentence) const override {
				+    const int num_tags = static_cast<int>(tag_.size());
				+    for (int i = 0; i < num_tags; ++i) {
				+      Token *token = sentence->mutable_token(i);
				+      *token->MutableExtension(TokenMorphology::morphology) =
				+          TagAsProto(Tag(i));
				+    }
				+  }
				+
				+  // Whether a parsed token should be considered correct for evaluation.
				+  bool IsTokenCorrect(const ParserState &state, int index) const override {
				+    return GoldTag(index) == Tag(index);
				+  }
				+
				+  // Returns a human readable string representation of this state: the
				+  // stack from bottom to top as "word[morphology]" entries, followed by the
				+  // words of the remaining input, all separated by single spaces.
				+  string ToString(const ParserState &state) const override {
				+    string str;
				+    const int stack_size = state.StackSize();
				+    for (int i = stack_size; i > 0; --i) {
				+      // Stack(i - 1) is the token index stored at that stack position; use
				+      // it for both the word and its tag.
				+      const int token_index = state.Stack(i - 1);
				+      const string &word = state.GetToken(token_index).word();
				+      // Separate entries with a space, but do not start with one.
				+      if (i != stack_size) str.append(" ");
				+      tensorflow::strings::StrAppend(
				+          &str, word, "[", TagAsProto(Tag(token_index)).ShortDebugString(),
				+          "]");
				+    }
				+    for (int i = state.Next(); i < state.NumTokens(); ++i) {
				+      tensorflow::strings::StrAppend(&str, " ", state.GetToken(i).word());
				+    }
				+    return str;
				+  }
				+
				+ private:
				+  // Currently assigned morphological analysis for each token in this sentence.
				+  vector<int> tag_;
				+
				+  // Gold morphological analysis from the input document.
				+  vector<int> gold_tag_;
				+
				+  // Label set used for conversions between integer and proto
				+  // representations of morphological analyses. Not owned.
				+  const MorphologyLabelSet *label_set_ = nullptr;
				+
				+  TF_DISALLOW_COPY_AND_ASSIGN(MorphologyTransitionState);
				+};
			
 
				+
			
 
				+class MorphologyTransitionSystem : public ParserTransitionSystem {
			
 
				+ public:
			
 
				+  ~MorphologyTransitionSystem() override { SharedStore::Release(label_set_); }
			
 
				+
			
 
				+  // Determines tag map location.
			
 
				+  void Setup(TaskContext *context) override {
			
 
				+    context->GetInput("morph-label-set");
			
 
				+  }
			
 
				+
			
 
				+  // Reads tag map and tag to category map.
			
 
				+  void Init(TaskContext *context) override {
			
 
				+    const string fname =
			
 
				+        TaskContext::InputFile(*context->GetInput("morph-label-set"));
			
 
				+    label_set_ =
			
 
				+        SharedStoreUtils::GetWithDefaultName<MorphologyLabelSet>(fname);
			
 
				+  }
			
 
				+
			
 
				+  // The SHIFT action uses the same value as the corresponding action type.
			
 
				+  static ParserAction ShiftAction(int tag) { return tag; }
			
 
				+
			
 
				+  // The morpher transition system doesn't look at the dependency tree, so it
			
 
				+  // allows non-projective trees.
			
 
				+  bool AllowsNonProjective() const override { return true; }
			
 
				+
			
 
				+  // Returns the number of action types.
			
 
				+  int NumActionTypes() const override { return 1; }
			
 
				+
			
 
				+  // Returns the number of possible actions.
			
 
				+  int NumActions(int num_labels) const override { return label_set_->Size(); }
			
 
				+
			
 
				+  // The default action for a given state is assigning the most frequent tag.
			
 
				+  ParserAction GetDefaultAction(const ParserState &state) const override {
			
 
				+    return ShiftAction(0);
			
 
				+  }
			
 
				+
			
 
				+  // Returns the next gold action for a given state according to the
			
 
				+  // underlying annotated sentence.
			
 
				+  ParserAction GetNextGoldAction(const ParserState &state) const override {
			
 
				+    if (!state.EndOfInput()) {
			
 
				+      return ShiftAction(TransitionState(state).GoldTag(state.Next()));
			
 
				+    }
			
 
				+    return ShiftAction(0);
			
 
				+  }
			
 
				+
			
 
				+  // Checks if the action is allowed in a given parser state.
			
 
				+  bool IsAllowedAction(ParserAction action,
			
 
				+                       const ParserState &state) const override {
			
 
				+    return !state.EndOfInput();
			
 
				+  }
			
 
				+
			
 
				+  // Makes a shift by pushing the next input token on the stack and moving to
			
 
				+  // the next position.
			
 
				+  void PerformActionWithoutHistory(ParserAction action,
			
 
				+                                   ParserState *state) const override {
			
 
				+    DCHECK(!state->EndOfInput());
			
 
				+    if (!state->EndOfInput()) {
			
 
				+      MutableTransitionState(state)->SetTag(state->Next(), action);
			
 
				+      state->Push(state->Next());
			
 
				+      state->Advance();
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // We are in a final state when we reached the end of the input and the stack
			
 
				+  // is empty.
			
 
				+  bool IsFinalState(const ParserState &state) const override {
			
 
				+    return state.EndOfInput();
			
 
				+  }
			
 
				+
			
 
				+  // Renders an action as "SHIFT(<label>)", where <label> is the short debug
			
 
				+  // string of the label the action refers to. The state is not used.
			
 
				+  string ActionAsString(ParserAction action,
			
 
				+                        const ParserState &state) const override {
			
 
				+    const string label = label_set_->Lookup(action).ShortDebugString();
			
 
				+    return tensorflow::strings::StrCat("SHIFT(", label, ")");
			
 
				+  }
			
 
				+
			
 
				+  // No state is deterministic in this transition system: this always returns
			
 
				+  // false, whatever the state.
			
 
				+  bool IsDeterministicState(const ParserState &state) const override {
			
 
				+    return false;
			
 
				+  }
			
 
				+
			
 
				+  // Returns a new transition state to be used to enhance the parser state.
			
 
				+  // The object is allocated with new and constructed from label_set_; the
			
 
				+  // training_mode flag is not used.
			
 
				+  // NOTE(review): presumably the caller takes ownership of the returned
			
 
				+  // pointer -- confirm.
			
 
				+  ParserTransitionState *NewTransitionState(bool training_mode) const override {
			
 
				+    return new MorphologyTransitionState(label_set_);
			
 
				+  }
			
 
				+
			
 
				+  // Downcasts the const ParserTransitionState in ParserState to a const
			
 
				+  // MorphologyTransitionState. The static_cast is unchecked, so the state must
			
 
				+  // actually hold a MorphologyTransitionState.
			
 
				+  static const MorphologyTransitionState &TransitionState(
			
 
				+      const ParserState &state) {
			
 
				+    return *static_cast<const MorphologyTransitionState *>(
			
 
				+        state.transition_state());
			
 
				+  }
			
 
				+
			
 
				+  // Downcasts the ParserTransitionState in ParserState to an
			
 
				+  // MorphologyTransitionState. The static_cast is unchecked, so the state must
			
 
				+  // actually hold a MorphologyTransitionState.
			
 
				+  static MorphologyTransitionState *MutableTransitionState(ParserState *state) {
			
 
				+    return static_cast<MorphologyTransitionState *>(
			
 
				+        state->mutable_transition_state());
			
 
				+  }
			
 
				+
			
 
				+  // Input for the tag map. Not owned.
			
 
				+  TaskInput *input_label_set_ = nullptr;
			
 
				+
			
 
				+  // Tag map used for conversions between integer and string representations
			
 
				+  // of morphology labels. Owned through SharedStore.
			
 
				+  const MorphologyLabelSet *label_set_;
			
 
				+};
			
 
				+
			
 
				+REGISTER_TRANSITION_SYSTEM("morpher", MorphologyTransitionSystem);
			
 
				+
			
 
				+// Feature function for retrieving the tag assigned to a token by the tagger
			
 
				+// transition system.
			
 
				+class PredictedMorphTagFeatureFunction : public ParserIndexFeatureFunction {
			
 
				+ public:
			
 
				+  PredictedMorphTagFeatureFunction() {}
			
 
				+
			
 
				+  // Requests the "morph-label-set" input from the task context.
			
 
				+  void Setup(TaskContext *context) override {
			
 
				+    context->GetInput("morph-label-set", "recordio", "token-morphology");
			
 
				+  }
			
 
				+
			
 
				+  // Fetches the morphology label set through the SharedStore, keyed by its
			
 
				+  // file name, and installs a feature type built on top of it.
			
 
				+  void Init(TaskContext *context) override {
			
 
				+    const TaskInput *label_input = context->GetInput("morph-label-set");
			
 
				+    const string fname = TaskContext::InputFile(*label_input);
			
 
				+    label_set_ = SharedStore::Get<MorphologyLabelSet>(fname, fname);
			
 
				+    set_feature_type(new FullLabelFeatureType(name(), label_set_));
			
 
				+  }
			
 
				+
			
 
				+  // Reads the tag assigned at the focus index from the
			
 
				+  // MorphologyTransitionState held by the parser state. Returns -1 if the
			
 
				+  // focus is not within the sentence.
			
 
				+  FeatureValue Compute(const WorkspaceSet &workspaces, const ParserState &state,
			
 
				+                       int focus, const FeatureVector *result) const override {
			
 
				+    const bool in_sentence =
			
 
				+        focus >= 0 && focus < state.sentence().token_size();
			
 
				+    if (!in_sentence) return -1;
			
 
				+    const auto *morph_state = static_cast<const MorphologyTransitionState *>(
			
 
				+        state.transition_state());
			
 
				+    return morph_state->Tag(focus);
			
 
				+  }
			
 
				+
			
 
				+ private:
			
 
				+  // Label set used for conversions between integer and string representations
			
 
				+  // of morphology labels. Owned through SharedStore.
			
 
				+  const MorphologyLabelSet *label_set_;
			
 
				+
			
 
				+  TF_DISALLOW_COPY_AND_ASSIGN(PredictedMorphTagFeatureFunction);
			
 
				+};
			
 
				+
			
 
				+REGISTER_PARSER_IDX_FEATURE_FUNCTION("pred-morph-tag",
			
 
				+                                     PredictedMorphTagFeatureFunction);
			
 
				+
			
 
				+}  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/morphology_label_set.cc
+++ b/syntaxnet/syntaxnet/morphology_label_set.cc
@@ -0,0 +1,91 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+#include "syntaxnet/morphology_label_set.h"
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+const char MorphologyLabelSet::kSeparator[] = "\t";
			
 
				+
			
 
				+// Adds morph to the set unless an equivalent label (same StringForMatch key)
			
 
				+// is already present. Returns the label's index in either case.
			
 
				+int MorphologyLabelSet::Add(const TokenMorphology &morph) {
			
 
				+  const string key = StringForMatch(morph);
			
 
				+  const auto found = fast_lookup_.find(key);
			
 
				+  if (found != fast_lookup_.end()) return found->second;
			
 
				+
			
 
				+  // New label: its index is its position at the back of label_set_.
			
 
				+  const int index = label_set_.size();
			
 
				+  fast_lookup_[key] = index;
			
 
				+  label_set_.push_back(morph);
			
 
				+  return index;
			
 
				+}
			
 
				+
			
 
				+// Returns the index of an existing TokenMorphology, or -1 when no label with
			
 
				+// the same StringForMatch key has been added.
			
 
				+int MorphologyLabelSet::LookupExisting(const TokenMorphology &morph) const {
			
 
				+  const auto found = fast_lookup_.find(StringForMatch(morph));
			
 
				+  return found == fast_lookup_.end() ? -1 : found->second;
			
 
				+}
			
 
				+
			
 
				+// Return the TokenMorphology at position i.  The input i must satisfy
			
 
				+// 0 <= i < Size(); both bounds are enforced with CHECKs.
			
 
				+const TokenMorphology &MorphologyLabelSet::Lookup(int i) const {
			
 
				+  CHECK_GE(i, 0);
			
 
				+  CHECK_LT(i, label_set_.size());
			
 
				+  return label_set_[i];
			
 
				+}
			
 
				+
			
 
				+// Reads labels from the file at filename by opening a ProtoRecordReader on it
			
 
				+// and delegating to the reader-based overload.
			
 
				+void MorphologyLabelSet::Read(const string &filename) {
			
 
				+  ProtoRecordReader reader(filename);
			
 
				+  Read(&reader);
			
 
				+}
			
 
				+
			
 
				+// Adds every TokenMorphology record produced by the reader, in order, on top
			
 
				+// of whatever the set already holds. CHECK-fails if a record matches a label
			
 
				+// that is already present.
			
 
				+// NOTE(review): the loop ends on the first non-OK status, so a genuine read
			
 
				+// error looks the same as end of input here -- confirm that is intended.
			
 
				+void MorphologyLabelSet::Read(ProtoRecordReader *reader) {
			
 
				+  TokenMorphology morph;
			
 
				+  while (reader->Read(&morph).ok()) {
			
 
				+    CHECK_EQ(-1, LookupExisting(morph));
			
 
				+    Add(morph);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// Writes the labels to the file at filename by opening a ProtoRecordWriter on
			
 
				+// it and delegating to the writer-based overload.
			
 
				+void MorphologyLabelSet::Write(const string &filename) const {
			
 
				+  ProtoRecordWriter writer(filename);
			
 
				+  Write(&writer);
			
 
				+}
			
 
				+
			
 
				+// Writes every label to the writer in index order, one record per label.
			
 
				+void MorphologyLabelSet::Write(ProtoRecordWriter *writer) const {
			
 
				+  for (const TokenMorphology &morph : label_set_) {
			
 
				+    writer->Write(morph);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// Builds the canonical key used for label equality: one
			
 
				+// "name<kSeparator>value" entry per attribute, sorted so that the order of
			
 
				+// attributes in the proto does not matter, then joined with kSeparator.
			
 
				+string MorphologyLabelSet::StringForMatch(const TokenMorphology &morph) const {
			
 
				+  vector<string> pairs;
			
 
				+  for (const auto &attr : morph.attribute()) {
			
 
				+    pairs.push_back(
			
 
				+        tensorflow::strings::StrCat(attr.name(), kSeparator, attr.value()));
			
 
				+  }
			
 
				+  std::sort(pairs.begin(), pairs.end());
			
 
				+  return utils::Join(pairs, kSeparator);
			
 
				+}
			
 
				+
			
 
				+// Renders a feature value as a readable name: the label's attributes as
			
 
				+// sorted "name:value" entries joined with commas.
			
 
				+string FullLabelFeatureType::GetFeatureValueName(FeatureValue value) const {
			
 
				+  const TokenMorphology &label = label_set_->Lookup(value);
			
 
				+  vector<string> pairs;
			
 
				+  for (const auto &attr : label.attribute()) {
			
 
				+    pairs.push_back(tensorflow::strings::StrCat(attr.name(), ":", attr.value()));
			
 
				+  }
			
 
				+  std::sort(pairs.begin(), pairs.end());
			
 
				+  return utils::Join(pairs, ",");
			
 
				+}
			
 
				+
			
 
				+}  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/morphology_label_set.h
+++ b/syntaxnet/syntaxnet/morphology_label_set.h
@@ -0,0 +1,110 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+// A class to store the set of possible TokenMorphology objects.  This includes
			
 
				+// lookup, iteration and serialization.
			
 
				+
			
 
				+#ifndef SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
			
 
				+#define SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
			
 
				+
			
 
				+#include <unordered_map>
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "syntaxnet/proto_io.h"
			
 
				+#include "syntaxnet/sentence.pb.h"
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+class MorphologyLabelSet {
			
 
				+ public:
			
 
				+  // Initialize as an empty morphology.
			
 
				+  MorphologyLabelSet() {}
			
 
				+
			
 
				+  // Initializes by reading the given file, which has been saved by Write().
			
 
				+  // This makes using the shared store easier.
			
 
				+  explicit MorphologyLabelSet(const string &fname) { Read(fname); }
			
 
				+
			
 
				+  // Adds a TokenMorphology to the set if it is not present. In any case, return
			
 
				+  // its position in the list. Note: This is slow, and should not be called
			
 
				+  // outside of training or init.
			
 
				+  int Add(const TokenMorphology &morph);
			
 
				+
			
 
				+  // Look up an existing TokenMorphology. If it is not present, return -1.
			
 
				+  // Note: This is slow, and should not be called outside of training workflow
			
 
				+  // or init.
			
 
				+  int LookupExisting(const TokenMorphology &morph) const;
			
 
				+
			
 
				+  // Return the TokenMorphology at position i. The input i should be in the
			
 
				+  // range 0..size(). Note: this will be called at inference time and needs to
			
 
				+  // be kept fast.
			
 
				+  const TokenMorphology &Lookup(int i) const;
			
 
				+
			
 
				+  // Return the number of elements.
			
 
				+  int Size() const { return label_set_.size(); }
			
 
				+
			
 
				+  // Deserialization and serialization.
			
 
				+  void Read(const string &filename);
			
 
				+  void Write(const string &filename) const;
			
 
				+
			
 
				+ private:
			
 
				+  string StringForMatch(const TokenMorphology &morhp) const;
			
 
				+
			
 
				+  // Deserialization and serialization implementation.
			
 
				+  void Read(ProtoRecordReader *reader);
			
 
				+  void Write(ProtoRecordWriter *writer) const;
			
 
				+
			
 
				+  // List of all possible annotations.  This is a unique list, where equality is
			
 
				+  // defined as follows:
			
 
				+  //
			
 
				+  //   a == b iff the set of attribute pairs (attribute, value) is identical.
			
 
				+  vector<TokenMorphology> label_set_;
			
 
				+
			
 
				+  // Because protocol buffer equality is complicated, we implement our own
			
 
				+  // equality operator based on strings. This unordered_map allows us to do the
			
 
				+  // lookup more quickly.
			
 
				+  unordered_map<string, int> fast_lookup_;
			
 
				+
			
 
				+  // A separator string that should not occur in any of the attribute names.
			
 
				+  // This should never be serialized, so that it can be changed in the code if
			
 
				+  // we change attribute names and it occurs in the new names.
			
 
				+  static const char kSeparator[];
			
 
				+};
			
 
				+
			
 
				+// A feature type with one value for each complete morphological analysis
			
 
				+// (analogous to the fulltag analyzer).
			
 
				+class FullLabelFeatureType : public FeatureType {
			
 
				+ public:
			
 
				+  FullLabelFeatureType(const string &name, const MorphologyLabelSet *label_set)
			
 
				+      : FeatureType(name), label_set_(label_set) {}
			
 
				+
			
 
				+  ~FullLabelFeatureType() override {}
			
 
				+
			
 
				+  // Converts a feature value to a name.  We don't use StringForMatch, since the
			
 
				+  // goal of these are to be readable, even if they might occasionally be
			
 
				+  // non-unique.
			
 
				+  string GetFeatureValueName(FeatureValue value) const override;
			
 
				+
			
 
				+  // Returns the size of the feature values domain.
			
 
				+  FeatureValue GetDomainSize() const override { return label_set_->Size(); }
			
 
				+
			
 
				+ private:
			
 
				+  // Not owned.
			
 
				+  const MorphologyLabelSet *label_set_ = nullptr;
			
 
				+};
			
 
				+
			
 
				+}  // namespace syntaxnet
			
 
				+
			
 
				+#endif  // SYNTAXNET_MORPHOLOGY_LABEL_SET_H_
			
--- a/syntaxnet/syntaxnet/morphology_label_set_test.cc
+++ b/syntaxnet/syntaxnet/morphology_label_set_test.cc
@@ -0,0 +1,101 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+#include "syntaxnet/morphology_label_set.h"
			
 
				+#include "syntaxnet/sentence.pb.h"
			
 
				+#include <gmock/gmock.h>
			
 
				+
			
 
				+#include "tensorflow/core/lib/core/status.h"
			
 
				+#include "tensorflow/core/platform/env.h"
			
 
				+#include "tensorflow/core/platform/test.h"
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+class MorphologyLabelSetTest : public ::testing::Test {
			
 
				+ protected:
			
 
				+  MorphologyLabelSet label_set_;
			
 
				+};
			
 
				+
			
 
				+// Test that Add and LookupExisting work as expected.
			
 
				+TEST_F(MorphologyLabelSetTest, AddLookupExisting) {
			
 
				+  TokenMorphology si1, si2;  // singular, imperative
			
 
				+  TokenMorphology pi;        // plural, imperative
			
 
				+  TokenMorphology six;       // plural, imperative with extra value
			
 
				+  TextFormat::ParseFromString(R"(
			
 
				+      attribute {name: "Number" value: "Singular"}
			
 
				+      attribute {name: "POS" value: "IMP"})",
			
 
				+                                      &si1);
			
 
				+  TextFormat::ParseFromString(R"(
			
 
				+      attribute {name: "POS" value: "IMP"}
			
 
				+      attribute {name: "Number" value: "Singular"})",
			
 
				+                                      &si2);
			
 
				+  TextFormat::ParseFromString(R"(
			
 
				+      attribute {name: "Number" value: "Plural"}
			
 
				+      attribute {name: "POS" value: "IMP"})",
			
 
				+                                      &pi);
			
 
				+  TextFormat::ParseFromString(R"(
			
 
				+      attribute {name: "Number" value: "Plural"}
			
 
				+      attribute {name: "POS" value: "IMP"}
			
 
				+      attribute {name: "x" value: "x"})",
			
 
				+                                      &six);
			
 
				+
			
 
				+  // Check Lookup existing returns -1 for non-existing entries.
			
 
				+  EXPECT_EQ(-1, label_set_.LookupExisting(si1));
			
 
				+  EXPECT_EQ(-1, label_set_.LookupExisting(si2));
			
 
				+  EXPECT_EQ(0, label_set_.Size());
			
 
				+
			
 
				+  // Check that adding returns 0 (this is the only possibility given Size())
			
 
				+  EXPECT_EQ(0, label_set_.Add(si1));
			
 
				+  EXPECT_EQ(0, label_set_.Add(si1));  // calling Add twice adds only once
			
 
				+  EXPECT_EQ(1, label_set_.Size());
			
 
				+
			
 
				+  // Check that order of attributes does not matter.
			
 
				+  EXPECT_EQ(0, label_set_.LookupExisting(si2));
			
 
				+
			
 
				+  // Check that un-added entries still are not present.
			
 
				+  EXPECT_EQ(-1, label_set_.LookupExisting(pi));
			
 
				+  EXPECT_EQ(-1, label_set_.LookupExisting(six));
			
 
				+
			
 
				+  // Check that we can add them.
			
 
				+  EXPECT_EQ(1, label_set_.Add(pi));
			
 
				+  EXPECT_EQ(2, label_set_.Add(six));
			
 
				+  EXPECT_EQ(3, label_set_.Size());
			
 
				+}
			
 
				+
			
 
				+// Test write and deserializing constructor.
			
 
				+TEST_F(MorphologyLabelSetTest, Serialization) {
			
 
				+  TokenMorphology si;  // singular, imperative
			
 
				+  TokenMorphology pi;  // plural, imperative
			
 
				+  TextFormat::ParseFromString(R"(
			
 
				+      attribute {name: "Number" value: "Singular"}
			
 
				+      attribute {name: "POS" value: "IMP"})",
			
 
				+                                      &si);
			
 
				+  TextFormat::ParseFromString(R"(
			
 
				+      attribute {name: "Number" value: "Plural"}
			
 
				+      attribute {name: "POS" value: "IMP"})",
			
 
				+                                      &pi);
			
 
				+  EXPECT_EQ(0, label_set_.Add(si));
			
 
				+  EXPECT_EQ(1, label_set_.Add(pi));
			
 
				+
			
 
				+  // Serialize and deserialize.
			
 
				+  string fname = utils::JoinPath({tensorflow::testing::TmpDir(), "label-set"});
			
 
				+  label_set_.Write(fname);
			
 
				+  MorphologyLabelSet label_set2(fname);
			
 
				+  EXPECT_EQ(0, label_set2.LookupExisting(si));
			
 
				+  EXPECT_EQ(1, label_set2.LookupExisting(pi));
			
 
				+  EXPECT_EQ(2, label_set2.Size());
			
 
				+}
			
 
				+
			
 
				+}  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/parser_eval.py
+++ b/syntaxnet/syntaxnet/parser_eval.py
@@ -22,7 +22,6 @@ import time
 
				 
			
 
				 import tensorflow as tf
			
 
				 
			
 
				-from tensorflow.python.platform import gfile
			
 
				 from tensorflow.python.platform import tf_logging as logging
			
 
				 from syntaxnet import sentence_pb2
			
 
				 from syntaxnet import graph_builder
			
--- a/syntaxnet/syntaxnet/parser_features.cc
+++ b/syntaxnet/syntaxnet/parser_features.cc
@@ -166,6 +166,9 @@ REGISTER_PARSER_IDX_FEATURE_FUNCTION("label", LabelFeatureFunction);
 
				 typedef BasicParserSentenceFeatureFunction<Word> WordFeatureFunction;
			
 
				 REGISTER_PARSER_IDX_FEATURE_FUNCTION("word", WordFeatureFunction);
			
 
				 
			
 
				+typedef BasicParserSentenceFeatureFunction<Char> CharFeatureFunction;
			
 
				+REGISTER_PARSER_IDX_FEATURE_FUNCTION("char", CharFeatureFunction);
			
 
				+
			
 
				 typedef BasicParserSentenceFeatureFunction<Tag> TagFeatureFunction;
			
 
				 REGISTER_PARSER_IDX_FEATURE_FUNCTION("tag", TagFeatureFunction);
			
 
				 
			
@@ -175,6 +178,21 @@ REGISTER_PARSER_IDX_FEATURE_FUNCTION("digit", DigitFeatureFunction);
 
				 typedef BasicParserSentenceFeatureFunction<Hyphen> HyphenFeatureFunction;
			
 
				 REGISTER_PARSER_IDX_FEATURE_FUNCTION("hyphen", HyphenFeatureFunction);
			
 
				 
			
 
				+typedef BasicParserSentenceFeatureFunction<Capitalization>
			
 
				+    CapitalizationFeatureFunction;
			
 
				+REGISTER_PARSER_IDX_FEATURE_FUNCTION("capitalization",
			
 
				+                                     CapitalizationFeatureFunction);
			
 
				+
			
 
				+typedef BasicParserSentenceFeatureFunction<PunctuationAmount>
			
 
				+    PunctuationAmountFeatureFunction;
			
 
				+REGISTER_PARSER_IDX_FEATURE_FUNCTION("punctuation-amount",
			
 
				+                                     PunctuationAmountFeatureFunction);
			
 
				+
			
 
				+typedef BasicParserSentenceFeatureFunction<Quote>
			
 
				+    QuoteFeatureFunction;
			
 
				+REGISTER_PARSER_IDX_FEATURE_FUNCTION("quote",
			
 
				+                                     QuoteFeatureFunction);
			
 
				+
			
 
				 typedef BasicParserSentenceFeatureFunction<PrefixFeature> PrefixFeatureFunction;
			
 
				 REGISTER_PARSER_IDX_FEATURE_FUNCTION("prefix", PrefixFeatureFunction);
			
 
				 
			
--- a/syntaxnet/syntaxnet/proto_io.h
+++ b/syntaxnet/syntaxnet/proto_io.h
@@ -144,7 +144,7 @@ class StdIn : public tensorflow::RandomAccessFile {
 
				 // Reads sentence protos from a text file.
			
 
				 class TextReader {
			
 
				  public:
			
 
				-  explicit TextReader(const TaskInput &input) {
			
 
				+  explicit TextReader(const TaskInput &input, TaskContext *context) {
			
 
				     CHECK_EQ(input.record_format_size(), 1)
			
 
				         << "TextReader only supports inputs with one record format: "
			
 
				         << input.DebugString();
			
@@ -153,6 +153,7 @@ class TextReader {
 
				         << input.DebugString();
			
 
				     filename_ = TaskContext::InputFile(input);
			
 
				     format_.reset(DocumentFormat::Create(input.record_format(0)));
			
 
				+    format_->Setup(context);
			
 
				     Reset();
			
 
				   }
			
 
				 
			
@@ -202,7 +203,7 @@ class TextReader {
 
				 // Writes sentence protos to a text conll file.
			
 
				 class TextWriter {
			
 
				  public:
			
 
				-  explicit TextWriter(const TaskInput &input) {
			
 
				+  explicit TextWriter(const TaskInput &input, TaskContext *context) {
			
 
				     CHECK_EQ(input.record_format_size(), 1)
			
 
				         << "TextWriter only supports files with one record format: "
			
 
				         << input.DebugString();
			
@@ -211,6 +212,7 @@ class TextWriter {
 
				         << input.DebugString();
			
 
				     filename_ = TaskContext::InputFile(input);
			
 
				     format_.reset(DocumentFormat::Create(input.record_format(0)));
			
 
				+    format_->Setup(context);
			
 
				     if (filename_ != "-") {
			
 
				       TF_CHECK_OK(
			
 
				           tensorflow::Env::Default()->NewWritableFile(filename_, &file_));
			
--- a/syntaxnet/syntaxnet/segmenter_utils.cc
+++ b/syntaxnet/syntaxnet/segmenter_utils.cc
@@ -0,0 +1,85 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+#include "syntaxnet/segmenter_utils.h"
			
 
				+#include "util/utf8/unicodetext.h"
			
 
				+#include "util/utf8/unilib.h"
			
 
				+#include "util/utf8/unilib_utf8_utils.h"
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+// Code points treated as breaks between tokens. Most are space separators
			
 
				+// (code Zs from http://www.unicode.org/Public/UNIDATA/PropList.txt); the set
			
 
				+// also holds the line and paragraph separators and one private-use code point.
			
 
				+// NB: This list is not necessarily exhaustive.
			
 
				+const std::unordered_set<int> SegmenterUtils::kBreakChars({
			
 
				+  0x2028,  // line separator
			
 
				+  0x2029,  // paragraph separator
			
 
				+  0x0020,  // space
			
 
				+  0x00a0,  // no-break space
			
 
				+  0x1680,  // Ogham space mark
			
 
				+  0x180e,  // Mongolian vowel separator
			
 
				+  0x202f,  // narrow no-break space
			
 
				+  0x205f,  // medium mathematical space
			
 
				+  0x3000,  // ideographic space
			
 
				+  0xe5e5,  // Google addition
			
 
				+  // U+2000 (en quad) through U+200A (hair space).
			
 
				+  0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008,
			
 
				+  0x2009, 0x200a
			
 
				+});
			
 
				+
			
 
				+// Splits text into UTF-8 characters and appends one StringPiece per character
			
 
				+// to chars. The pieces point into text, so text must outlive them.
			
 
				+//
			
 
				+// The length of each character is taken from its first byte. If the text ends
			
 
				+// in the middle of a multi-byte sequence, the last piece is cut at the end of
			
 
				+// the text instead of extending past the string's storage.
			
 
				+void SegmenterUtils::GetUTF8Chars(const string &text,
			
 
				+                                  vector<tensorflow::StringPiece> *chars) {
			
 
				+  const char *start = text.c_str();
			
 
				+  const char *end = text.c_str() + text.size();
			
 
				+  while (start < end) {
			
 
				+    int char_length = UniLib::OneCharLen(start);
			
 
				+    // Never let a truncated trailing sequence run beyond the input.
			
 
				+    const int remaining = static_cast<int>(end - start);
			
 
				+    if (char_length > remaining) char_length = remaining;
			
 
				+    chars->emplace_back(start, char_length);
			
 
				+    start += char_length;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// Replaces the sentence's tokens with one token per character in chars and
			
 
				+// stores text as the sentence text. Each token gets the character as its
			
 
				+// word and the character's byte offsets within text as start/end.
			
 
				+void SegmenterUtils::SetCharsAsTokens(
			
 
				+    const string &text,
			
 
				+    const vector<tensorflow::StringPiece> &chars,
			
 
				+    Sentence *sentence) {
			
 
				+  sentence->clear_token();
			
 
				+  sentence->set_text(text);
			
 
				+  for (const tensorflow::StringPiece &piece : chars) {
			
 
				+    Token *token = sentence->add_token();
			
 
				+    token->set_word(piece.ToString());  // NOLINT
			
 
				+    int first_byte, last_byte;
			
 
				+    GetCharStartEndBytes(text, piece, &first_byte, &last_byte);
			
 
				+    token->set_start(first_byte);
			
 
				+    token->set_end(last_byte);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// Returns true if the token has a non-empty word, its [start, end] byte range
			
 
				+// (end inclusive) lies inside the sentence text, and neither its first byte
			
 
				+// nor the byte right after its last byte is a trail byte, i.e. the token does
			
 
				+// not begin or end in the middle of a character.
			
 
				+bool SegmenterUtils::IsValidSegment(const Sentence &sentence,
			
 
				+                                    const Token &token) {
			
 
				+  // Check that the token is not empty, both by string and by bytes.
			
 
				+  if (token.word().empty()) return false;
			
 
				+  if (token.start() > token.end()) return false;
			
 
				+
			
 
				+  // Check token boundaries inside of text.
			
 
				+  if (token.start() < 0) return false;
			
 
				+  if (token.end() >= sentence.text().size()) return false;
			
 
				+
			
 
				+  // Check that the token starts and ends on character boundaries, by bytes.
			
 
				+  // end() < size() holds here, so end() + 1 is at most size().
			
 
				+  // NOTE(review): for a token ending on the last byte this reads
			
 
				+  // text()[size()], which relies on std::string returning '\0' there.
			
 
				+  const char s = sentence.text()[token.start()];
			
 
				+  const char e = sentence.text()[token.end() + 1];
			
 
				+  if (UniLib::IsTrailByte(s)) return false;
			
 
				+  if (UniLib::IsTrailByte(e)) return false;
			
 
				+  return true;
			
 
				+}
			
 
				+
			
 
				+}  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/segmenter_utils.h
+++ b/syntaxnet/syntaxnet/segmenter_utils.h
@@ -0,0 +1,93 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+#ifndef SYNTAXNET_SEGMENTER_UTILS_H_
			
 
				+#define SYNTAXNET_SEGMENTER_UTILS_H_
			
 
				+
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				+#include <unordered_set>
			
 
				+
			
 
				+#include "syntaxnet/sentence.pb.h"
			
 
				+#include "tensorflow/core/lib/strings/strcat.h"
			
 
				+#include "util/utf8/unicodetext.h"
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+// A set of common convenience functions.
			
 
				+class SegmenterUtils {
			
 
				+ public:
			
 
				+  // Takes a text and convert it into a vector, where each element is a utf8
			
 
				+  // character.
			
 
				+  static void GetUTF8Chars(const string &text,
			
 
				+                           vector<tensorflow::StringPiece> *chars);
			
 
				+
			
 
				+  // Sets tokens in the sentence so that each token is a single character.
			
 
				+  // Assigns the start/end byte offsets.
			
 
				+  //
			
 
				+  // If the sentence is not empty, the current tokens will be cleared.
			
 
				+  static void SetCharsAsTokens(const string &text,
			
 
				+                               const vector<tensorflow::StringPiece> &chars,
			
 
				+                               Sentence *sentence);
			
 
				+
			
 
				+  // Returns true for UTF-8 characters that cannot be 'real' tokens. This is
			
 
				+  // defined as any whitespace, line break or paragraph break.
			
 
				+  static bool IsBreakChar(const string &word) {
			
 
				+    if (word == "\n" || word == "\t") return true;
			
 
				+    UnicodeText text;
			
 
				+    text.PointToUTF8(word.c_str(), word.length());
			
 
				+    CHECK_EQ(text.size(), 1);
			
 
				+    return kBreakChars.find(*text.begin()) != kBreakChars.end();
			
 
				+  }
			
 
				+
			
 
				+  // Returns the break level for the next token based on the current character.
			
 
				+  static Token::BreakLevel BreakLevel(const string &word) {
			
 
				+    UnicodeText text;
			
 
				+    text.PointToUTF8(word.c_str(), word.length());
			
 
				+    auto point = *text.begin();
			
 
				+    if (word == "\n" || point == kLineSeparator) {
			
 
				+      return Token::LINE_BREAK;
			
 
				+    } else if (point == kParagraphSeparator) {
			
 
				+      return Token::SENTENCE_BREAK;  // No PARAGRAPH_BREAK in sentence proto.
			
 
				+    } else if (word == "\t" || kBreakChars.find(point) != kBreakChars.end()) {
			
 
				+      return Token::SPACE_BREAK;
			
 
				+    }
			
 
				+    return Token::NO_BREAK;
			
 
				+  }
			
 
				+
			
 
				+  // Convenience function for computing start/end byte offsets of a character
			
 
				+  // StringPiece relative to original text.
			
 
				+  static void GetCharStartEndBytes(const string &text,
			
 
				+                                   tensorflow::StringPiece c,
			
 
				+                                   int *start,
			
 
				+                                   int *end) {
			
 
				+    *start = c.data() - text.data();
			
 
				+    *end = *start + c.size() - 1;
			
 
				+  }
			
 
				+
			
 
				+  // Returns true if this segment is a valid segment. Currently checks:
			
 
				+  // 1) It is non-empty
			
 
				+  // 2) It is valid UTF8
			
 
				+  static bool IsValidSegment(const Sentence &sentence, const Token &token);
			
 
				+
			
 
				+  // Set for utf8 break characters.
			
 
				+  static const std::unordered_set<int> kBreakChars;
			
 
				+  static const int kLineSeparator = 0x2028;
			
 
				+  static const int kParagraphSeparator = 0x2029;
			
 
				+};
			
 
				+
			
 
				+}  // namespace syntaxnet
			
 
				+
			
 
				+#endif  // SYNTAXNET_SEGMENTER_UTILS_H_
			
--- a/syntaxnet/syntaxnet/segmenter_utils_test.cc
+++ b/syntaxnet/syntaxnet/segmenter_utils_test.cc
@@ -0,0 +1,149 @@
 
				+/* Copyright 2016 Google Inc. All Rights Reserved.
			
 
				+
			
 
				+Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+you may not use this file except in compliance with the License.
			
 
				+You may obtain a copy of the License at
			
 
				+
			
 
				+    http://www.apache.org/licenses/LICENSE-2.0
			
 
				+
			
 
				+Unless required by applicable law or agreed to in writing, software
			
 
				+distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+See the License for the specific language governing permissions and
			
 
				+limitations under the License.
			
 
				+==============================================================================*/
			
 
				+
			
 
				+#include "syntaxnet/segmenter_utils.h"
			
 
				+
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				+
			
 
				+#include "syntaxnet/char_properties.h"
			
 
				+#include "syntaxnet/sentence.pb.h"
			
 
				+#include <gmock/gmock.h>
			
 
				+#include "tensorflow/core/lib/strings/strcat.h"
			
 
				+
			
 
				+namespace syntaxnet {
			
 
				+
			
 
				+// Builds a Korean test sentence with its text and five tokens; each token
				+// carries its word plus inclusive start/end byte offsets into the text.
				+static Sentence GetKoSentence() {
				+  Sentence sentence;
				+  sentence.set_text("서울시는 2012년부터");
				+
				+  // Appends one token spanning bytes [start, end] of the text.
				+  const auto add_token = [&sentence](const char *word, int start, int end) {
				+    Token *token = sentence.add_token();
				+    token->set_word(word);
				+    token->set_start(start);
				+    token->set_end(end);
				+  };
				+  add_token("서울시", 0, 8);
				+  add_token("는", 9, 11);
				+  add_token("2012", 13, 16);
				+  add_token("년", 17, 19);
				+  add_token("부터", 20, 25);
				+
				+  return sentence;
				+}
			
 
				+
			
 
				+// Appends to `starts` and `ends` the offsets of the first and last byte,
				+// within `text`, of every character in `chars` (in order).
				+static void GetStartEndBytes(const string &text,
				+                             const vector<tensorflow::StringPiece> &chars,
				+                             vector<int> *starts,
				+                             vector<int> *ends) {
				+  for (const tensorflow::StringPiece &character : chars) {
				+    int first_byte = 0;
				+    int last_byte = 0;
				+    SegmenterUtils::GetCharStartEndBytes(text, character, &first_byte,
				+                                         &last_byte);
				+    starts->push_back(first_byte);
				+    ends->push_back(last_byte);
				+  }
				+}
			
 
				+
			
 
				+// Checks that GetUTF8Chars splits the Korean test sentence into characters
				+// with the expected start and end byte offsets.
				+TEST(SegmenterUtilsTest, GetCharsTest) {
				+  const Sentence sentence = GetKoSentence();
				+  vector<tensorflow::StringPiece> chars;
				+  SegmenterUtils::GetUTF8Chars(sentence.text(), &chars);
				+
				+  // The sentence consists of twelve characters.
				+  CHECK_EQ(chars.size(), 12);
				+
				+  vector<int> starts;
				+  vector<int> ends;
				+  GetStartEndBytes(sentence.text(), chars, &starts, &ends);
				+
				+  // Expected offsets of the first and last byte of each character.
				+  const int kExpectedStarts[] = {0, 3, 6, 9, 12, 13, 14, 15, 16, 17, 20, 23};
				+  const int kExpectedEnds[] = {2, 5, 8, 11, 12, 13, 14, 15, 16, 19, 22, 25};
				+  for (int i = 0; i < 12; ++i) CHECK_EQ(starts[i], kExpectedStarts[i]);
				+  for (int i = 0; i < 12; ++i) CHECK_EQ(ends[i], kExpectedEnds[i]);
				+}
			
 
				+
			
 
				+// Checks that SetCharsAsTokens turns every character of the text into one
				+// token with the matching word and byte offsets, and that calling it again on
				+// the same sentence replaces the previous tokens instead of appending.
				+TEST(SegmenterUtilsTest, SetCharsAsTokensTest) {
				+  // Create test sentence.
				+  const Sentence sentence = GetKoSentence();
				+  vector<tensorflow::StringPiece> chars;
				+  SegmenterUtils segment_utils;
				+  segment_utils.GetUTF8Chars(sentence.text(), &chars);
				+
				+  vector<int> starts;
				+  vector<int> ends;
				+  GetStartEndBytes(sentence.text(), chars, &starts, &ends);
				+
				+  // Pass 0 fills an empty sentence; pass 1 re-runs on the already tokenized
				+  // sentence and must remove the old tokens.
				+  Sentence new_sentence;
				+  for (int pass = 0; pass < 2; ++pass) {
				+    segment_utils.SetCharsAsTokens(sentence.text(), chars, &new_sentence);
				+    CHECK_EQ(new_sentence.token_size(), chars.size());
				+
				+    // Iterate over the tokens of the *new* sentence: the source sentence only
				+    // has five word tokens, which would leave most characters unchecked.
				+    for (int t = 0; t < new_sentence.token_size(); ++t) {
				+      CHECK_EQ(new_sentence.token(t).word(), chars[t]);
				+      CHECK_EQ(new_sentence.token(t).start(), starts[t]);
				+      CHECK_EQ(new_sentence.token(t).end(), ends[t]);
				+    }
				+  }
				+}
			
 
				+
			
 
				+}  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/sentence.proto
+++ b/syntaxnet/syntaxnet/sentence.proto
@@ -59,3 +59,18 @@ message Token {
 
				 
			
 
				   extensions 1000 to max;
			
 
				 }
			
 
				+
			
 
				+// Morphological information about a token, attached to Token through the
				+// `morphology` extension declared below.
				+message TokenMorphology {
				+  extend Token {
				+    optional TokenMorphology morphology = 63949837;
				+  }
				+
				+  // One morphological attribute, expressed as a name/value pair.
				+  message Attribute {
				+    required string name = 1;
				+    required string value = 2;
				+  }
				+
				+  // The attribute set; designated to hold a single disambiguated analysis.
				+  repeated Attribute attribute = 3;
				+}
			
--- a/syntaxnet/syntaxnet/sentence_batch.cc
+++ b/syntaxnet/syntaxnet/sentence_batch.cc
@@ -24,7 +24,7 @@ limitations under the License.
 
				 namespace syntaxnet {
			
 
				 
			
 
				 void SentenceBatch::Init(TaskContext *context) {
			
 
				-  reader_.reset(new TextReader(*context->GetInput(input_name_)));
			
 
				+  reader_.reset(new TextReader(*context->GetInput(input_name_), context));
			
 
				   size_ = 0;
			
 
				 }
			
 
				 
			
--- a/syntaxnet/syntaxnet/sentence_features.cc
+++ b/syntaxnet/syntaxnet/sentence_features.cc
@@ -14,9 +14,11 @@ limitations under the License.
 
				 ==============================================================================*/
			
 
				 
			
 
				 #include "syntaxnet/sentence_features.h"
			
 
				-
			
 
				+#include "syntaxnet/char_properties.h"
			
 
				 #include "syntaxnet/registry.h"
			
 
				 #include "util/utf8/unicodetext.h"
			
 
				+#include "util/utf8/unilib.h"
			
 
				+#include "util/utf8/unilib_utf8_utils.h"
			
 
				 
			
 
				 namespace syntaxnet {
			
 
				 
			
@@ -55,6 +57,83 @@ string TermFrequencyMapFeature::WorkspaceName() const {
 
				                                              min_freq_, max_num_terms_);
			
 
				 }
			
 
				 
			
 
				+// Hands the term map back to the SharedStore if one was acquired.
				+TermFrequencyMapSetFeature::~TermFrequencyMapSetFeature() {
				+  if (term_map_ == nullptr) return;
				+  SharedStore::Release(term_map_);
				+  term_map_ = nullptr;
				+}
			
 
				+
			
 
				+// Requests the term-map input from the task context, with file format
				+// "text" and an empty record format.
				+void TermFrequencyMapSetFeature::Setup(TaskContext *context) {
				+  context->GetInput(input_name_, "text", "");
				+}
			
 
				+
			
 
				+// Reads the "min-freq" and "max-num-terms" parameters, resolves the input
				+// file and fetches the term map through the shared store. The base-class
				+// Init() runs last because it calls NumValues(), which needs the term map.
				+void TermFrequencyMapSetFeature::Init(TaskContext *context) {
				+  min_freq_ = GetIntParameter("min-freq", 0);
				+  max_num_terms_ = GetIntParameter("max-num-terms", 0);
				+
				+  auto *input = context->GetInput(input_name_);
				+  file_name_ = context->InputFile(*input);
				+  term_map_ = SharedStoreUtils::GetWithDefaultName<TermFrequencyMap>(
				+      file_name_, min_freq_, max_num_terms_);
				+
				+  TokenLookupSetFeature::Init(context);
				+}
			
 
				+
			
 
				+// Builds the workspace name from the input name and both term-map options,
				+// so features configured differently do not share a workspace.
				+string TermFrequencyMapSetFeature::WorkspaceName() const {
				+  string workspace_name = SharedStoreUtils::CreateDefaultName(
				+      "term-frequency-map-set", input_name_, min_freq_, max_num_terms_);
				+  return workspace_name;
				+}
			
 
				+
			
 
				+namespace {
				+
				+// Appends one StringPiece per UTF-8 character of `word` to `chars`. The
				+// pieces are built from the iterator's utf8_data(), so they presumably alias
				+// `word`'s buffer and should not outlive it.
				+void GetUTF8Chars(const string &word, vector<tensorflow::StringPiece> *chars) {
				+  UnicodeText unicode;
				+  unicode.PointToUTF8(word.c_str(), word.size());
				+  for (auto it = unicode.begin(); it != unicode.end(); ++it) {
				+    chars->emplace_back(it.utf8_data(), it.utf8_length());
				+  }
				+}
				+
				+// Returns the byte length of the first UTF-8 character of `utf8_str`, or 0
				+// when the string starts with the NUL terminator.
				+int UTF8FirstLetterNumBytes(const char *utf8_str) {
				+  return *utf8_str == '\0' ? 0 : UniLib::OneCharLen(utf8_str);
				+}
				+
				+}  // namespace
			
 
				+
			
 
				+// Fills `values` with the term-map indices of the character n-grams of the
				+// token's word, for every start position and every length from 1 up to
				+// max_char_ngram_length_. N-grams missing from the map are skipped, and an
				+// n-gram is never extended across a space.
				+void CharNgram::GetTokenIndices(const Token &token, vector<int> *values) const {
				+  values->clear();
				+
				+  // Split the word into characters, optionally wrapped in "^" and "$".
				+  vector<tensorflow::StringPiece> pieces;
				+  if (use_terminators_) pieces.push_back("^");
				+  GetUTF8Chars(token.word(), &pieces);
				+  if (use_terminators_) pieces.push_back("$");
				+
				+  const int num_pieces = pieces.size();
				+  for (int first = 0; first < num_pieces; ++first) {
				+    // Grown one character at a time; every prefix is looked up.
				+    string ngram;
				+    for (int pos = first;
				+         pos < num_pieces && pos - first < max_char_ngram_length_; ++pos) {
				+      const tensorflow::StringPiece piece = pieces[pos];
				+      if (piece == " ") break;  // Never add char ngrams containing spaces.
				+      tensorflow::strings::StrAppend(&ngram, piece);
				+      const int index = LookupIndex(ngram);
				+      if (index == -1) continue;  // Skip unknown values.
				+      values->push_back(index);
				+    }
				+  }
				+}
			
 
				+
			
 
				+// Fills `values` with the term-map index of every "name=value" attribute in
				+// the token's TokenMorphology extension. Attributes missing from the map are
				+// skipped.
				+void MorphologySet::GetTokenIndices(const Token &token,
				+                                    vector<int> *values) const {
				+  values->clear();
				+  const TokenMorphology &morphology =
				+      token.GetExtension(TokenMorphology::morphology);
				+  for (const TokenMorphology::Attribute &attribute : morphology.attribute()) {
				+    const string term =
				+        tensorflow::strings::StrCat(attribute.name(), "=", attribute.value());
				+    const int index = LookupIndex(term);
				+    if (index == -1) continue;  // Skip unknown values.
				+    values->push_back(index);
				+  }
				+}
			
 
				+
			
 
				 string Hyphen::GetFeatureValueName(FeatureValue value) const {
			
 
				   switch (value) {
			
 
				     case NO_HYPHEN:
			
@@ -70,6 +149,152 @@ FeatureValue Hyphen::ComputeValue(const Token &token) const {
 
				   return (word.find('-') < word.length() ? HAS_HYPHEN : NO_HYPHEN);
			
 
				 }
			
 
				 
			
 
				+// Reads the "utf8" parameter; UTF-8 mode is enabled only for the exact
				+// value "true".
				+void Capitalization::Setup(TaskContext *context) {
				+  const bool utf8_requested = GetParameter("utf8") == "true";
				+  utf8_ = utf8_requested;
				+}
			
 
				+
			
 
				+// Computes the capitalization category of every token (passing the token
				+// index as focus) and stores the results in a VectorIntWorkspace. Does
				+// nothing when the workspace set already holds that workspace.
				+void Capitalization::Preprocess(WorkspaceSet *workspaces,
				+                                Sentence *sentence) const {
				+  if (workspaces->Has<VectorIntWorkspace>(Workspace())) return;
				+
				+  const int num_tokens = sentence->token_size();
				+  VectorIntWorkspace *cache = new VectorIntWorkspace(num_tokens);
				+  for (int index = 0; index < num_tokens; ++index) {
				+    cache->set_element(index,
				+                       ComputeValueWithFocus(sentence->token(index), index));
				+  }
				+  workspaces->Set<VectorIntWorkspace>(Workspace(), cache);
				+}
			
 
				+
			
 
				+// Returns the symbolic name of a capitalization category, or "<INVALID>"
				+// for any value that is not one of the categories.
				+string Capitalization::GetFeatureValueName(FeatureValue value) const {
				+  switch (value) {
				+    case LOWERCASE: return "LOWERCASE";
				+    case UPPERCASE: return "UPPERCASE";
				+    case CAPITALIZED: return "CAPITALIZED";
				+    case CAPITALIZED_SENTENCE_INITIAL: return "CAPITALIZED_SENTENCE_INITIAL";
				+    case NON_ALPHABETIC: return "NON_ALPHABETIC";
				+    default: return "<INVALID>";
				+  }
				+}
			
 
				+
			
 
				+// Categorizes the capitalization of the token's word. Only ASCII letters are
				+// considered; requesting UTF-8 mode aborts because it is not implemented.
				+// `focus` is the token's index in the sentence and is used solely to tell
				+// sentence-initial capitalized words apart.
				+FeatureValue Capitalization::ComputeValueWithFocus(const Token &token,
				+                                                   int focus) const {
				+  // Record whether the word has any uppercase / lowercase ASCII letter.
				+  bool has_upper = false;
				+  bool has_lower = false;
				+  if (utf8_) {
				+    LOG(FATAL) << "Not implemented.";
				+  } else {
				+    for (const char c : token.word()) {
				+      if ('A' <= c && c <= 'Z') has_upper = true;
				+      if ('a' <= c && c <= 'z') has_lower = true;
				+    }
				+  }
				+
				+  // Mixed case: a normal capitalized word. It counts as sentence-initial
				+  // when it is the first token of the sentence (focus == 0).
				+  if (has_upper && has_lower) {
				+    return focus == 0 ? CAPITALIZED_SENTENCE_INITIAL : CAPITALIZED;
				+  }
				+
				+  // Single-case and letterless words.
				+  if (has_upper) return UPPERCASE;
				+  if (has_lower) return LOWERCASE;
				+  return NON_ALPHABETIC;
				+}
			
 
				+
			
 
				+// Returns the symbolic name of a punctuation-amount category, or "<INVALID>"
				+// for any value that is not one of the categories.
				+string PunctuationAmount::GetFeatureValueName(FeatureValue value) const {
				+  switch (value) {
				+    case NO_PUNCTUATION: return "NO_PUNCTUATION";
				+    case SOME_PUNCTUATION: return "SOME_PUNCTUATION";
				+    case ALL_PUNCTUATION: return "ALL_PUNCTUATION";
				+    default: return "<INVALID>";
				+  }
				+}
			
 
				+
			
 
				+// Classifies how much of the token's word is punctuation or symbols:
				+// NO_PUNCTUATION when no character is, ALL_PUNCTUATION when every character
				+// is, SOME_PUNCTUATION otherwise. Note that an empty word yields
				+// ALL_PUNCTUATION, because no character ever clears the "all" flag.
				+FeatureValue PunctuationAmount::ComputeValue(const Token &token) const {
				+  const string &word = token.word();
				+  bool has_punctuation = false;
				+  bool all_punctuation = true;
				+
				+  const char *start = word.c_str();
				+  const char *end = word.c_str() + word.size();
				+  while (start < end) {
				+    int char_length = UTF8FirstLetterNumBytes(start);
				+    // The helper reports 0 for a NUL byte. Treat an embedded NUL as a
				+    // one-byte character so the scan always advances.
				+    if (char_length <= 0) char_length = 1;
				+    // A truncated multi-byte sequence at the end of the word must not make
				+    // the classifier read past the word's bytes.
				+    const int remaining = end - start;
				+    if (char_length > remaining) char_length = remaining;
				+
				+    bool char_is_punct = is_punctuation_or_symbol(start, char_length);
				+    all_punctuation &= char_is_punct;
				+    has_punctuation |= char_is_punct;
				+    // A mix of both kinds settles the answer; stop scanning early.
				+    if (!all_punctuation && has_punctuation) return SOME_PUNCTUATION;
				+    start += char_length;
				+  }
				+  if (!all_punctuation) return NO_PUNCTUATION;
				+  return ALL_PUNCTUATION;
				+}
			
 
				+
			
 
				+// Returns the symbolic name of a quote category, or "<INVALID>" for any
				+// value that is not one of the categories.
				+string Quote::GetFeatureValueName(FeatureValue value) const {
				+  switch (value) {
				+    case NO_QUOTE: return "NO_QUOTE";
				+    case OPEN_QUOTE: return "OPEN_QUOTE";
				+    case CLOSE_QUOTE: return "CLOSE_QUOTE";
				+    case UNKNOWN_QUOTE: return "UNKNOWN_QUOTE";
				+    default: return "<INVALID>";
				+  }
				+}
			
 
				+
			
 
				+// Classifies the token's word as an opening quote, a closing quote, a quote
				+// of unknown direction (it is both an open and a close quote character), or
				+// no quote at all.
				+FeatureValue Quote::ComputeValue(const Token &token) const {
				+  const string &word = token.word();
				+
				+  // Penn Treebank open and close quotes are multi-character.
				+  if (word == "``") return OPEN_QUOTE;
				+  if (word == "''") return CLOSE_QUOTE;
				+
				+  // Any other quote must be exactly one character. Compare the byte length
				+  // of the first UTF-8 character with the length of the whole word instead
				+  // of requiring a single byte, so that multi-byte quote characters reach
				+  // the character-property checks as well.
				+  const int char_len = UTF8FirstLetterNumBytes(word.c_str());
				+  if (char_len > 0 && static_cast<int>(word.length()) == char_len) {
				+    bool is_open = is_open_quote(word.c_str(), char_len);
				+    bool is_close = is_close_quote(word.c_str(), char_len);
				+    if (is_open && !is_close) return OPEN_QUOTE;
				+    if (is_close && !is_open) return CLOSE_QUOTE;
				+    if (is_open && is_close) return UNKNOWN_QUOTE;
				+  }
				+  return NO_QUOTE;
				+}
			
 
				+
			
 
				+// Computes the quote category of every token and stores the results in a
				+// VectorIntWorkspace. Does nothing when the workspace set already holds that
				+// workspace.
				+void Quote::Preprocess(WorkspaceSet *workspaces, Sentence *sentence) const {
				+  if (workspaces->Has<VectorIntWorkspace>(Workspace())) return;
				+
				+  const int num_tokens = sentence->token_size();
				+  VectorIntWorkspace *cache = new VectorIntWorkspace(num_tokens);
				+
				+  // A quote such as " can be opening or closing; which one is unknown without
				+  // looking at the prior tokens. inside_quote is true iff an odd number of
				+  // such ambiguous quotes has been seen so far in the sentence (similar to
				+  // the behavior of some tokenizers).
				+  bool inside_quote = false;
				+  for (int index = 0; index < num_tokens; ++index) {
				+    int quote_type = ComputeValue(sentence->token(index));
				+    if (quote_type == UNKNOWN_QUOTE) {
				+      // Resolve from the running state, then flip the state.
				+      quote_type = inside_quote ? CLOSE_QUOTE : OPEN_QUOTE;
				+      inside_quote = !inside_quote;
				+    }
				+    cache->set_element(index, quote_type);
				+  }
				+  workspaces->Set<VectorIntWorkspace>(Workspace(), cache);
				+}
			
 
				+
			
 
				 string Digit::GetFeatureValueName(FeatureValue value) const {
			
 
				   switch (value) {
			
 
				     case NO_DIGIT:
			
@@ -130,8 +355,7 @@ static AffixTable *CreateAffixTable(const string &filename,
 
				 void AffixTableFeature::Setup(TaskContext *context) {
			
 
				   context->GetInput(input_name_, "recordio", "affix-table");
			
 
				   affix_length_ = GetIntParameter("length", 0);
			
 
				-  CHECK_GE(affix_length_, 0)
			
 
				-      << "Length must be specified for affix preprocessor.";
			
 
				+  CHECK_GE(affix_length_, 0) << "Length must be specified for affix feature.";
			
 
				   TokenLookupFeature::Setup(context);
			
 
				 }
			
 
				 
			
@@ -181,6 +405,7 @@ REGISTER_CLASS_REGISTRY("sentence+index feature function", SentenceFeature);
 
				 
			
 
				 // Register the features defined in the header.
			
 
				 REGISTER_SENTENCE_IDX_FEATURE("word", Word);
			
 
				+REGISTER_SENTENCE_IDX_FEATURE("char", Char);
			
 
				 REGISTER_SENTENCE_IDX_FEATURE("lcword", LowercaseWord);
			
 
				 REGISTER_SENTENCE_IDX_FEATURE("tag", Tag);
			
 
				 REGISTER_SENTENCE_IDX_FEATURE("offset", Offset);
			
@@ -188,5 +413,10 @@ REGISTER_SENTENCE_IDX_FEATURE("hyphen", Hyphen);
 
				 REGISTER_SENTENCE_IDX_FEATURE("digit", Digit);
			
 
				 REGISTER_SENTENCE_IDX_FEATURE("prefix", PrefixFeature);
			
 
				 REGISTER_SENTENCE_IDX_FEATURE("suffix", SuffixFeature);
			
 
				+REGISTER_SENTENCE_IDX_FEATURE("char-ngram", CharNgram);
			
 
				+REGISTER_SENTENCE_IDX_FEATURE("morphology-set", MorphologySet);
			
 
				+REGISTER_SENTENCE_IDX_FEATURE("capitalization", Capitalization);
			
 
				+REGISTER_SENTENCE_IDX_FEATURE("punctuation-amount", PunctuationAmount);
			
 
				+REGISTER_SENTENCE_IDX_FEATURE("quote", Quote);
			
 
				 
			
 
				 }  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/sentence_features.h
+++ b/syntaxnet/syntaxnet/sentence_features.h
@@ -23,6 +23,7 @@ limitations under the License.
 
				 #include "syntaxnet/affix.h"
			
 
				 #include "syntaxnet/feature_extractor.h"
			
 
				 #include "syntaxnet/feature_types.h"
			
 
				+#include "syntaxnet/segmenter_utils.h"
			
 
				 #include "syntaxnet/shared_store.h"
			
 
				 #include "syntaxnet/task_context.h"
			
 
				 #include "syntaxnet/workspace.h"
			
@@ -85,6 +86,88 @@ class TokenLookupFeature : public SentenceFeature {
 
				     return workspaces.Get<VectorIntWorkspace>(workspace_).element(focus);
			
 
				   }
			
 
				 
			
 
				+  int Workspace() const { return workspace_; }
			
 
				+
			
 
				+ private:
			
 
				+  int workspace_;
			
 
				+};
			
 
				+
			
 
				+// Base class for sentence features that map each token to a *set* of integer
				+// values. Preprocess() computes the set of every token once per sentence and
				+// caches it in a VectorVectorIntWorkspace; Evaluate() then emits every cached
				+// value of the focus token. On top of the base values, the feature type
				+// registers one extra value, NumValues(), named "<OUTSIDE>".
				+class TokenLookupSetFeature : public SentenceFeature {
				+ public:
				+  // Installs a resource-based feature type with the extra "<OUTSIDE>" value.
				+  void Init(TaskContext *context) override {
				+    set_feature_type(new ResourceBasedFeatureType<TokenLookupSetFeature>(
				+        name(), this, {{NumValues(), "<OUTSIDE>"}}));
				+  }
				+
				+  // Number of unique values.
				+  virtual int64 NumValues() const = 0;
				+
				+  // Looks up the value set of the token at `index` (relative to the start of
				+  // the sentence) and writes it to `values`.
				+  virtual void LookupToken(const WorkspaceSet &workspaces,
				+                           const Sentence &sentence, int index,
				+                           vector<int> *values) const = 0;
				+
				+  // Returns a string representation of the given feature value.
				+  virtual string GetFeatureValueName(int value) const = 0;
				+
				+  // Name of the shared workspace.
				+  virtual string WorkspaceName() const = 0;
				+
				+  // Requests a VectorVectorIntWorkspace under WorkspaceName() and remembers
				+  // its index.
				+  void RequestWorkspaces(WorkspaceRegistry *registry) override {
				+    workspace_ = registry->Request<VectorVectorIntWorkspace>(WorkspaceName());
				+  }
				+
				+  // Default preprocessing: looks up a value set for each token in the
				+  // Sentence, unless the workspace set already holds the cache.
				+  void Preprocess(WorkspaceSet *workspaces, Sentence *sentence) const override {
				+    if (workspaces->Has<VectorVectorIntWorkspace>(workspace_)) return;
				+
				+    const int num_tokens = sentence->token_size();
				+    VectorVectorIntWorkspace *cache = new VectorVectorIntWorkspace(num_tokens);
				+    for (int index = 0; index < num_tokens; ++index) {
				+      LookupToken(*workspaces, *sentence, index,
				+                  cache->mutable_elements(index));
				+    }
				+    workspaces->Set<VectorVectorIntWorkspace>(workspace_, cache);
				+  }
				+
				+  // Returns the pre-computed value set of the token at `focus`. Assumes the
				+  // cache is populated; `focus` must be a valid token index.
				+  const vector<int> &GetCachedValueSet(const WorkspaceSet &workspaces,
				+                                       const Sentence &sentence,
				+                                       int focus) const {
				+    CHECK_GE(focus, 0);
				+    CHECK_LT(focus, sentence.token_size());
				+    return workspaces.Get<VectorVectorIntWorkspace>(workspace_).elements(focus);
				+  }
				+
				+  // Adds every precomputed value of the focus token to `result`. Adds
				+  // nothing when `focus` lies outside the sentence.
				+  void Evaluate(const WorkspaceSet &workspaces, const Sentence &sentence,
				+                int focus, FeatureVector *result) const override {
				+    if (focus < 0 || focus >= sentence.token_size()) return;
				+    const vector<int> &cached = GetCachedValueSet(workspaces, sentence, focus);
				+    for (const int value : cached) {
				+      result->add(this->feature_type(), value);
				+    }
				+  }
				+
				+  // Returns a precomputed value, or NumValues() for features outside the
				+  // sentence.
				+  // NOTE(review): this reads a VectorIntWorkspace at workspace_, although
				+  // RequestWorkspaces() obtained that index for a VectorVectorIntWorkspace.
				+  // Confirm this method is never reached for set features.
				+  FeatureValue Compute(const WorkspaceSet &workspaces, const Sentence &sentence,
				+                       int focus, const FeatureVector *result) const override {
				+    const bool outside = focus < 0 || focus >= sentence.token_size();
				+    if (outside) return NumValues();
				+    return workspaces.Get<VectorIntWorkspace>(workspace_).element(focus);
				+  }
				+
				  private:
				   int workspace_;
				 };
			
@@ -134,6 +217,83 @@ class TermFrequencyMapFeature : public TokenLookupFeature {
 
				   int max_num_terms_;
			
 
				 };
			
 
				 
			
 
				+// Specialization of the TokenLookupSetFeature class that uses a
				+// TermFrequencyMap to perform the mapping. Init() reads two parameters:
				+// "min-freq" (discard terms with less than this min frequency) and
				+// "max-num-terms" (only read in at most this many terms).
				+class TermFrequencyMapSetFeature : public TokenLookupSetFeature {
				+ public:
				+  // Only records the name of the input holding the term map. The map itself
				+  // is loaded in Init(), once the options are known.
				+  explicit TermFrequencyMapSetFeature(const string &input_name)
				+      : input_name_(input_name), min_freq_(0), max_num_terms_(0) {}
				+
				+  // Releases shared resources.
				+  ~TermFrequencyMapSetFeature() override;
				+
				+  // Implemented by subclasses: fills `values` with the term-map indices
				+  // derived from `token`.
				+  virtual void GetTokenIndices(const Token &token,
				+                               vector<int> *values) const = 0;
				+
				+  // Requests the resource inputs.
				+  void Setup(TaskContext *context) override;
				+
				+  // Obtains the term map through the shared store. At this point the options
				+  // are known, so the full name can be computed.
				+  void Init(TaskContext *context) override;
				+
				+  // Number of unique values: the size of the term map.
				+  int64 NumValues() const override { return term_map_->Size(); }
				+
				+  // Special value for strings not in the map.
				+  FeatureValue UnknownValue() const { return term_map_->Size(); }
				+
				+  // Gets pointer to the underlying map.
				+  const TermFrequencyMap *term_map() const { return term_map_; }
				+
				+  // Returns the index of `term` in the map, or -1 when the term is not in
				+  // the map. Used inside GetTokenIndices() specializations for convenience.
				+  int LookupIndex(const string &term) const {
				+    return term_map_->LookupIndex(term, -1);
				+  }
				+
				+  // Computes the value set of the token at `index` (relative to the start of
				+  // the sentence) by delegating to GetTokenIndices(). `workspaces` is unused.
				+  void LookupToken(const WorkspaceSet &workspaces, const Sentence &sentence,
				+                   int index, vector<int> *values) const override {
				+    GetTokenIndices(sentence.token(index), values);
				+  }
				+
				+  // Uses the TermFrequencyMap to look up the string associated with a value.
				+  // Out-of-range values are logged and reported as "<INVALID>".
				+  string GetFeatureValueName(int value) const override {
				+    if (value == UnknownValue()) return "<UNKNOWN>";
				+    const bool in_range = value >= 0 && value < NumValues();
				+    if (!in_range) {
				+      LOG(ERROR) << "Invalid feature value: " << value;
				+      return "<INVALID>";
				+    }
				+    return term_map_->GetTerm(value);
				+  }
				+
				+  // Name of the shared workspace.
				+  string WorkspaceName() const override;
				+
				+ private:
				+  // Shortcut pointer to the shared map. Not owned; handed back through the
				+  // shared store in the destructor.
				+  const TermFrequencyMap *term_map_ = nullptr;
				+
				+  // Name of the input for the term map.
				+  string input_name_;
				+
				+  // Filename of the underlying resource.
				+  string file_name_;
				+
				+  // Minimum frequency for term map.
				+  int min_freq_;
				+
				+  // Maximum number of terms for term map.
				+  int max_num_terms_;
				+};
			
 
				+
			
 
				 class Word : public TermFrequencyMapFeature {
			
 
				  public:
			
 
				   Word() : TermFrequencyMapFeature("word-map") {}
			
@@ -144,6 +304,36 @@ class Word : public TermFrequencyMapFeature {
 
				   }
			
 
				 };
			
 
				 
			
 
				+// Feature that looks up the token's word in the "char-map" term map. Two
				+// values beyond the map are used: Size() for break characters and Size() + 1
				+// for other words that are not in the map.
				+class Char : public TermFrequencyMapFeature {
				+ public:
				+  Char() : TermFrequencyMapFeature("char-map") {}
				+
				+  // Break characters take precedence over the term-map lookup.
				+  FeatureValue ComputeValue(const Token &token) const override {
				+    const string &form = token.word();
				+    return SegmenterUtils::IsBreakChar(form)
				+               ? BreakCharValue()
				+               : term_map().LookupIndex(form, UnknownValue());
				+  }
				+
				+  // Special value for breaks.
				+  FeatureValue BreakCharValue() const { return term_map().Size(); }
				+
				+  // Special value for non-break strings not in the map.
				+  FeatureValue UnknownValue() const { return term_map().Size() + 1; }
				+
				+  // Number of unique values: the map plus the two special values.
				+  int64 NumValues() const override { return term_map().Size() + 2; }
				+
				+  // Names the two special values and the terms of the map. Any other value
				+  // is logged and reported as "<INVALID>".
				+  string GetFeatureValueName(FeatureValue value) const override {
				+    if (value == BreakCharValue()) return "<BREAK_CHAR>";
				+    if (value == UnknownValue()) return "<UNKNOWN>";
				+    const bool in_map = value >= 0 && value < term_map().Size();
				+    if (!in_map) {
				+      LOG(ERROR) << "Invalid feature value: " << value;
				+      return "<INVALID>";
				+    }
				+    return term_map().GetTerm(value);
				+  }
				+};
			
 
				+
			
 
				 class LowercaseWord : public TermFrequencyMapFeature {
			
 
				  public:
			
 
				   LowercaseWord() : TermFrequencyMapFeature("lc-word-map") {}
			
@@ -172,6 +362,47 @@ class Label : public TermFrequencyMapFeature {
 
				   }
			
 
				 };
			
 
				 
			
 
				+// Set feature over the character n-grams of a token's word, looked up in the
				+// "char-ngram-map" term map.
				+class CharNgram : public TermFrequencyMapSetFeature {
				+ public:
				+  CharNgram() : TermFrequencyMapSetFeature("char-ngram-map") {}
				+  ~CharNgram() override {}
				+
				+  // Besides the base-class setup, reads the n-gram options from the task
				+  // context: "lexicon_max_char_ngram_length" (default 3) and
				+  // "lexicon_char_ngram_include_terminators" (default false).
				+  void Setup(TaskContext *context) override {
				+    TermFrequencyMapSetFeature::Setup(context);
				+    max_char_ngram_length_ = context->Get("lexicon_max_char_ngram_length", 3);
				+    use_terminators_ =
				+        context->Get("lexicon_char_ngram_include_terminators", false);
				+  }
				+
				+  // Fills `values` with the term-map indices of the word's char n-grams.
				+  void GetTokenIndices(const Token &token, vector<int> *values) const override;
				+
				+ private:
				+  // Size parameter (n) for the ngrams.
				+  int max_char_ngram_length_ = 3;
				+
				+  // Whether to pad the word with ^ and $ before extracting ngrams.
				+  bool use_terminators_ = false;
				+};
			
 
				+
			
 
				+// Set feature over the morphological attributes of a token, looked up in
				+// the "morphology-map" term map.
				+class MorphologySet : public TermFrequencyMapSetFeature {
				+ public:
				+  MorphologySet() : TermFrequencyMapSetFeature("morphology-map") {}
				+  ~MorphologySet() override {}
				+
				+  // Requests the resource inputs; nothing beyond the base-class setup.
				+  void Setup(TaskContext *context) override {
				+    TermFrequencyMapSetFeature::Setup(context);
				+  }
				+
				+  // Number of unique values.
				+  // NOTE(review): this is one less than the size of the term map, whereas
				+  // the base class reports the full size; confirm the "- 1" is intended.
				+  int64 NumValues() const override {
				+    return term_map()->Size() - 1;
				+  }
				+
				+  // Fills `values` with the term-map indices of the token's attributes.
				+  void GetTokenIndices(const Token &token, vector<int> *values) const override;
				+};
			
 
				+
			
 
				 class LexicalCategoryFeature : public TokenLookupFeature {
			
 
				  public:
			
 
				   LexicalCategoryFeature(const string &name, int cardinality)
			
@@ -180,7 +411,7 @@ class LexicalCategoryFeature : public TokenLookupFeature {
 
				 
			
 
				   FeatureValue NumValues() const override { return cardinality_; }
			
 
				 
			
 
				-  // Returns the identifier for the workspace for this preprocessor.
			
 
				+  // Returns the identifier for the workspace for this feature.
			
 
				   string WorkspaceName() const override {
			
 
				     return tensorflow::strings::StrCat(name_, ":", cardinality_);
			
 
				   }
			
@@ -193,7 +424,7 @@ class LexicalCategoryFeature : public TokenLookupFeature {
 
				   const int cardinality_;
			
 
				 };
			
 
				 
			
 
				-// Preprocessor that computes whether a word has a hyphen or not.
			
 
				+// Feature that computes whether a word has a hyphen or not.
			
 
				 class Hyphen : public LexicalCategoryFeature {
			
 
				  public:
			
 
				   // Enumeration of values.
			
@@ -213,7 +444,100 @@ class Hyphen : public LexicalCategoryFeature {
 
				   FeatureValue ComputeValue(const Token &token) const override;
			
 
				 };
			
 
				 
			
 
				-// Preprocessor that computes whether a word has a hyphen or not.
			
 
				+// Feature that categorizes the capitalization of the word. If the option
			
 
				+// utf8=true is specified, lowercase and uppercase checks are done with UTF8
			
 
				+// compliant functions.
			
 
				+class Capitalization : public LexicalCategoryFeature {
			
 
				+ public:
			
 
				+  // Enumeration of values.
			
 
				+  enum Category {
			
 
				+    LOWERCASE = 0,                     // normal word
			
 
				+    UPPERCASE = 1,                     // all-caps
			
 
				+    CAPITALIZED = 2,                   // has one cap and one non-cap
			
 
				+    CAPITALIZED_SENTENCE_INITIAL = 3,  // same as above but sentence-initial
			
 
				+    NON_ALPHABETIC = 4,                // contains no alphabetic characters
			
 
				+    CARDINALITY = 5,
			
 
				+  };
			
 
				+
			
 
				+  // Default constructor.
			
 
				+  Capitalization() : LexicalCategoryFeature("capitalization", CARDINALITY) {}
			
 
				+
			
 
				+  // Sets one of the options for the capitalization.
			
 
				+  void Setup(TaskContext *context) override;
			
 
				+
			
 
				+  // Capitalization needs special preprocessing because token category can
			
 
				+  // depend on whether the token is at the start of the sentence.
			
 
				+  void Preprocess(WorkspaceSet *workspaces, Sentence *sentence) const override;
			
 
				+
			
 
				+  // Returns a string representation of the enum value.
			
 
				+  string GetFeatureValueName(FeatureValue value) const override;
			
 
				+
			
 
				+  // Unsupported here: logs a fatal error. Use ComputeValueWithFocus instead.
			
 
				+  FeatureValue ComputeValue(const Token &token) const override {
			
 
				+    LOG(FATAL) << "Capitalization should use ComputeValueWithFocus.";
			
 
				+    return 0;
			
 
				+  }
			
 
				+
			
 
				+  // Returns the category value for the token.
			
 
				+  FeatureValue ComputeValueWithFocus(const Token &token, int focus) const;
			
 
				+
			
 
				+ private:
			
 
				+  // Whether to use UTF8 compliant functions to check capitalization.
			
 
				+  bool utf8_ = false;
			
 
				+};
			
 
				+
			
 
				+// A feature for computing whether the focus token contains any punctuation
			
 
				+// for ternary features.
			
 
				+class PunctuationAmount : public LexicalCategoryFeature {
			
 
				+ public:
			
 
				+  // Enumeration of values.
			
 
				+  enum Category {
			
 
				+    NO_PUNCTUATION = 0,
			
 
				+    SOME_PUNCTUATION = 1,
			
 
				+    ALL_PUNCTUATION = 2,
			
 
				+    CARDINALITY = 3,
			
 
				+  };
			
 
				+
			
 
				+  // Default constructor.
			
 
				+  PunctuationAmount()
			
 
				+      : LexicalCategoryFeature("punctuation-amount", CARDINALITY) {}
			
 
				+
			
 
				+  // Returns a string representation of the enum value.
			
 
				+  string GetFeatureValueName(FeatureValue value) const override;
			
 
				+
			
 
				+  // Returns the category value for the token.
			
 
				+  FeatureValue ComputeValue(const Token &token) const override;
			
 
				+};
			
 
				+
			
 
				+// A feature that returns whether the word is an open or
			
 
				+// close quotation mark, based on its relative position to other quotation marks
			
 
				+// in the sentence.
			
 
				+class Quote : public LexicalCategoryFeature {
			
 
				+ public:
			
 
				+  // Enumeration of values.
			
 
				+  enum Category {
			
 
				+    NO_QUOTE = 0,
			
 
				+    OPEN_QUOTE = 1,
			
 
				+    CLOSE_QUOTE = 2,
			
 
				+    UNKNOWN_QUOTE = 3,
			
 
				+    CARDINALITY = 4,
			
 
				+  };
			
 
				+
			
 
				+  // Default constructor.
			
 
				+  Quote() : LexicalCategoryFeature("quote", CARDINALITY) {}
			
 
				+
			
 
				+  // Returns a string representation of the enum value.
			
 
				+  string GetFeatureValueName(FeatureValue value) const override;
			
 
				+
			
 
				+  // Returns the category value for the token.
			
 
				+  FeatureValue ComputeValue(const Token &token) const override;
			
 
				+
			
 
				+  // Override preprocess to compute open and close quotes from prior context of
			
 
				+  // the sentence.
			
 
				+  void Preprocess(WorkspaceSet *workspaces, Sentence *instance) const override;
			
 
				+};
			
 
				+
			
 
				+// Feature that computes whether a word has digits or not.
			
 
				 class Digit : public LexicalCategoryFeature {
			
 
				  public:
			
 
				   // Enumeration of values.
			
@@ -234,9 +558,9 @@ class Digit : public LexicalCategoryFeature {
 
				   FeatureValue ComputeValue(const Token &token) const override;
			
 
				 };
			
 
				 
			
 
				-// TokenLookupPreprocessor object to compute prefixes and suffixes of words. The
			
 
				+// TokenLookupFeature object to compute prefixes and suffixes of words. The
			
 
				 // AffixTable is stored in the SharedStore. This is very similar to the
			
 
				-// implementation of TermFrequencyMapPreprocessor, but using an AffixTable to
			
 
				+// implementation of TermFrequencyMapFeature, but using an AffixTable to
			
 
				 // perform the lookups. There are only two specializations, for prefixes and
			
 
				 // suffixes.
			
 
				 class AffixTableFeature : public TokenLookupFeature {
			
--- a/syntaxnet/syntaxnet/sentence_features_test.cc
+++ b/syntaxnet/syntaxnet/sentence_features_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 
				 #include "syntaxnet/utils.h"
			
 
				 #include "syntaxnet/workspace.h"
			
 
				 #include <gmock/gmock.h>
			
 
				+#include "tensorflow/core/platform/test.h"
			
 
				 
			
 
				 using testing::UnorderedElementsAreArray;
			
 
				 
			
@@ -83,6 +84,27 @@ class SentenceFeaturesTest : public ::testing::Test {
 
				     return values;
			
 
				   }
			
 
				 
			
 
				+  // Adds an input to the task context.
			
 
				+  void AddInputToContext(const string &name, const string &file_pattern,
			
 
				+                         const string &file_format,
			
 
				+                         const string &record_format) {
			
 
				+    TaskInput *input = context_.GetInput(name);
			
 
				+    TaskInput::Part *part = input->add_part();
			
 
				+    part->set_file_pattern(file_pattern);
			
 
				+    part->set_file_format(file_format);
			
 
				+    part->set_record_format(record_format);
			
 
				+  }
			
 
				+
			
 
				+  // Checks that a vector workspace is equal to a target vector.
			
 
				+  void CheckVectorWorkspace(const VectorIntWorkspace &workspace,
			
 
				+                            vector<int> target) {
			
 
				+    vector<int> src;
			
 
				+    for (int i = 0; i < workspace.size(); ++i) {
			
 
				+      src.push_back(workspace.element(i));
			
 
				+    }
			
 
				+    EXPECT_THAT(src, testing::ContainerEq(target));
			
 
				+  }
			
 
				+
			
 
				   Sentence sentence_;
			
 
				   WorkspaceSet workspaces_;
			
 
				 
			
@@ -99,13 +121,18 @@ class CommonSentenceFeaturesTest : public SentenceFeaturesTest {
 
				       : SentenceFeaturesTest(
			
 
				             "text: 'I saw a man with a telescope.' "
			
 
				             "token { word: 'I' start: 0 end: 0 tag: 'PRP' category: 'PRON'"
			
 
				-            " head: 1 label: 'nsubj' break_level: NO_BREAK } "
			
 
				+            "  head: 1 label: 'nsubj' break_level: NO_BREAK } "
			
 
				             "token { word: 'saw' start: 2 end: 4 tag: 'VBD' category: 'VERB'"
			
 
				-            " label: 'ROOT' break_level: SPACE_BREAK } "
			
 
				+            "  label: 'ROOT' break_level: SPACE_BREAK } "
			
 
				             "token { word: 'a' start: 6 end: 6 tag: 'DT' category: 'DET'"
			
 
				-            " head: 3 label: 'det' break_level: SPACE_BREAK } "
			
 
				+            "  head: 3 label: 'det' break_level: SPACE_BREAK } "
			
 
				             "token { word: 'man' start: 8 end: 10 tag: 'NN' category: 'NOUN'"
			
 
				-            " head: 1 label: 'dobj' break_level: SPACE_BREAK } "
			
 
				+            "  head: 1 label: 'dobj' break_level: SPACE_BREAK"
			
 
				+            "  [syntaxnet.TokenMorphology.morphology] { "
			
 
				+            "    attribute { name:'morph' value:'Sg' } "
			
 
				+            "    attribute { name:'morph' value:'Masc' } "
			
 
				+            "  } "
			
 
				+            "} "
			
 
				             "token { word: 'with' start: 12 end: 15 tag: 'IN' category: 'ADP'"
			
 
				             " head: 1 label: 'prep' break_level: SPACE_BREAK } "
			
 
				             "token { word: 'a' start: 17 end: 17 tag: 'DT' category: 'DET'"
			
@@ -152,4 +179,96 @@ TEST_F(CommonSentenceFeaturesTest, OffsetPlusTag) {
 
				   EXPECT_EQ("<OUTSIDE>", ExtractFeature(9));
			
 
				 }
			
 
				 
			
 
				+TEST_F(CommonSentenceFeaturesTest, CharNgramFeature) {
			
 
				+  TermFrequencyMap char_ngram_map;
			
 
				+  char_ngram_map.Increment("a");
			
 
				+  char_ngram_map.Increment("aw");
			
 
				+  char_ngram_map.Increment("sa");
			
 
				+  creators_.Add(
			
 
				+      "char-ngram-map", "text", "",
			
 
				+      [&char_ngram_map](const string &path) { char_ngram_map.Save(path); });
			
 
				+
			
 
				+  // Test that CharNgram works as expected.
			
 
				+  PrepareFeature("char-ngram");
			
 
				+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(-1), ","));
			
 
				+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(0), ","));
			
 
				+  EXPECT_EQ("sa,a,aw", utils::Join(ExtractMultiFeature(1), ","));
			
 
				+  EXPECT_EQ("a", utils::Join(ExtractMultiFeature(2), ","));
			
 
				+  EXPECT_EQ("a", utils::Join(ExtractMultiFeature(3), ","));
			
 
				+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(8), ","));
			
 
				+}
			
 
				+
			
 
				+TEST_F(CommonSentenceFeaturesTest, MorphologySetFeature) {
			
 
				+  TermFrequencyMap morphology_map;
			
 
				+  morphology_map.Increment("morph=Sg");
			
 
				+  morphology_map.Increment("morph=Sg");
			
 
				+  morphology_map.Increment("morph=Masc");
			
 
				+  morphology_map.Increment("morph=Masc");
			
 
				+  morphology_map.Increment("morph=Pl");
			
 
				+  creators_.Add(
			
 
				+      "morphology-map", "text", "",
			
 
				+      [&morphology_map](const string &path) { morphology_map.Save(path); });
			
 
				+
			
 
				+  // Test that MorphologySet works as expected.
			
 
				+  PrepareFeature("morphology-set");
			
 
				+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(-1), ","));
			
 
				+  EXPECT_EQ("", utils::Join(ExtractMultiFeature(0), ","));
			
 
				+  EXPECT_EQ("morph=Sg,morph=Masc", utils::Join(ExtractMultiFeature(3), ","));
			
 
				+}
			
 
				+
			
 
				+TEST_F(CommonSentenceFeaturesTest, CapitalizationProcessesCorrectly) {
			
 
				+  Capitalization feature;
			
 
				+  feature.RequestWorkspaces(&registry_);
			
 
				+  workspaces_.Reset(registry_);
			
 
				+  feature.Preprocess(&workspaces_, &sentence_);
			
 
				+
			
 
				+  // Check the workspace contains what we expect.
			
 
				+  EXPECT_TRUE(workspaces_.Has<VectorIntWorkspace>(feature.Workspace()));
			
 
				+  const VectorIntWorkspace &workspace =
			
 
				+      workspaces_.Get<VectorIntWorkspace>(feature.Workspace());
			
 
				+  constexpr int UPPERCASE = Capitalization::UPPERCASE;
			
 
				+  constexpr int LOWERCASE = Capitalization::LOWERCASE;
			
 
				+  constexpr int NON_ALPHABETIC = Capitalization::NON_ALPHABETIC;
			
 
				+  CheckVectorWorkspace(workspace,
			
 
				+                       {UPPERCASE, LOWERCASE, LOWERCASE, LOWERCASE, LOWERCASE,
			
 
				+                        LOWERCASE, LOWERCASE, NON_ALPHABETIC});
			
 
				+}
			
 
				+
			
 
				+class CharFeatureTest : public SentenceFeaturesTest {
			
 
				+ protected:
			
 
				+  CharFeatureTest()
			
 
				+      : SentenceFeaturesTest(
			
 
				+          "text: '一 个 测 试 员  ' "
			
 
				+          "token { word: '一' start: 0 end: 2 } "
			
 
				+          "token { word: '个' start: 3 end: 5 } "
			
 
				+          "token { word: '测' start: 6 end: 8 } "
			
 
				+          "token { word: '试' start: 9 end: 11 } "
			
 
				+          "token { word: '员' start: 12 end: 14 } "
			
 
				+          "token { word: ' ' start: 15 end: 15 } "
			
 
				+          "token { word: '\t' start: 16 end: 16 } ") {}
			
 
				+};
			
 
				+
			
 
				+TEST_F(CharFeatureTest, CharFeature) {
			
 
				+  TermFrequencyMap char_map;
			
 
				+  char_map.Increment("一");
			
 
				+  char_map.Increment("个");
			
 
				+  char_map.Increment("试");
			
 
				+  char_map.Increment("员");
			
 
				+  creators_.Add(
			
 
				+      "char-map", "text", "",
			
 
				+      [&char_map](const string &path) { char_map.Save(path); });
			
 
				+
			
 
				+  // Test that Char works as expected.
			
 
				+  PrepareFeature("char");
			
 
				+  EXPECT_EQ("<OUTSIDE>", ExtractFeature(-1));
			
 
				+  EXPECT_EQ("一", ExtractFeature(0));
			
 
				+  EXPECT_EQ("个", ExtractFeature(1));
			
 
				+  EXPECT_EQ("<UNKNOWN>", ExtractFeature(2));  // "测" is not in the char map.
			
 
				+  EXPECT_EQ("试", ExtractFeature(3));
			
 
				+  EXPECT_EQ("员", ExtractFeature(4));
			
 
				+  EXPECT_EQ("<BREAK_CHAR>", ExtractFeature(5));
			
 
				+  EXPECT_EQ("<BREAK_CHAR>", ExtractFeature(6));
			
 
				+  EXPECT_EQ("<OUTSIDE>", ExtractFeature(7));
			
 
				+}
			
 
				+
			
 
				 }  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/tagger_transitions.cc
+++ b/syntaxnet/syntaxnet/tagger_transitions.cc
@@ -25,8 +25,10 @@ limitations under the License.
 
				 
			
 
				 #include <string>
			
 
				 
			
 
				+#include "syntaxnet/parser_features.h"
			
 
				 #include "syntaxnet/parser_state.h"
			
 
				 #include "syntaxnet/parser_transitions.h"
			
 
				+#include "syntaxnet/sentence_features.h"
			
 
				 #include "syntaxnet/shared_store.h"
			
 
				 #include "syntaxnet/task_context.h"
			
 
				 #include "syntaxnet/term_frequency_map.h"
			
@@ -98,7 +100,9 @@ class TaggerTransitionState : public ParserTransitionState {
 
				     for (size_t i = 0; i < tag_.size(); ++i) {
			
 
				       Token *token = sentence->mutable_token(i);
			
 
				       token->set_tag(TagAsString(Tag(i)));
			
 
				-      token->set_category(tag_to_category_->GetCategory(token->tag()));
			
 
				+      if (tag_to_category_) {
			
 
				+        token->set_category(tag_to_category_->GetCategory(token->tag()));
			
 
				+      }
			
 
				     }
			
 
				   }
			
 
				 
			
@@ -146,6 +150,7 @@ class TaggerTransitionSystem : public ParserTransitionSystem {
 
				   // Determines tag map location.
			
 
				   void Setup(TaskContext *context) override {
			
 
				     input_tag_map_ = context->GetInput("tag-map", "text", "");
			
 
				+    join_category_to_pos_ = context->GetBoolParameter("join_category_to_pos");
			
 
				     input_tag_to_category_ = context->GetInput("tag-to-category", "text", "");
			
 
				   }
			
 
				 
			
@@ -154,15 +159,21 @@ class TaggerTransitionSystem : public ParserTransitionSystem {
 
				     const string tag_map_path = TaskContext::InputFile(*input_tag_map_);
			
 
				     tag_map_ = SharedStoreUtils::GetWithDefaultName<TermFrequencyMap>(
			
 
				         tag_map_path, 0, 0);
			
 
				-    const string tag_to_category_path =
			
 
				-        TaskContext::InputFile(*input_tag_to_category_);
			
 
				-    tag_to_category_ = SharedStoreUtils::GetWithDefaultName<TagToCategoryMap>(
			
 
				-        tag_to_category_path);
			
 
				+    if (!join_category_to_pos_) {
			
 
				+      const string tag_to_category_path =
			
 
				+          TaskContext::InputFile(*input_tag_to_category_);
			
 
				+      tag_to_category_ = SharedStoreUtils::GetWithDefaultName<TagToCategoryMap>(
			
 
				+          tag_to_category_path);
			
 
				+    }
			
 
				   }
			
 
				 
			
 
				   // The SHIFT action uses the same value as the corresponding action type.
			
 
				   static ParserAction ShiftAction(int tag) { return tag; }
			
 
				 
			
 
				+  // The tagger transition system doesn't look at the dependency tree, so it
			
 
				+  // allows non-projective trees.
			
 
				+  bool AllowsNonProjective() const override { return true; }
			
 
				+
			
 
				   // Returns the number of action types.
			
 
				   int NumActionTypes() const override { return 1; }
			
 
				 
			
@@ -251,8 +262,32 @@ class TaggerTransitionSystem : public ParserTransitionSystem {
 
				 
			
 
				   // Tag to category map. Owned through SharedStore.
			
 
				   const TagToCategoryMap *tag_to_category_ = nullptr;
			
 
				+
			
 
				+  bool join_category_to_pos_ = false;
			
 
				 };
			
 
				 
			
 
				 REGISTER_TRANSITION_SYSTEM("tagger", TaggerTransitionSystem);
			
 
				 
			
 
				+// Feature function for retrieving the tag assigned to a token by the tagger
			
 
				+// transition system.
			
 
				+class PredictedTagFeatureFunction
			
 
				+    : public BasicParserSentenceFeatureFunction<Tag> {
			
 
				+ public:
			
 
				+  PredictedTagFeatureFunction() {}
			
 
				+
			
 
				+  // Gets the TaggerTransitionState from the parser state and reads the assigned
			
 
				+  // tag at the focus index. Returns -1 if the focus is not within the sentence.
			
 
				+  FeatureValue Compute(const WorkspaceSet &workspaces, const ParserState &state,
			
 
				+                       int focus, const FeatureVector *result) const override {
			
 
				+    if (focus < 0 || focus >= state.sentence().token_size()) return -1;
			
 
				+    return static_cast<const TaggerTransitionState *>(state.transition_state())
			
 
				+        ->Tag(focus);
			
 
				+  }
			
 
				+
			
 
				+ private:
			
 
				+  TF_DISALLOW_COPY_AND_ASSIGN(PredictedTagFeatureFunction);
			
 
				+};
			
 
				+
			
 
				+REGISTER_PARSER_IDX_FEATURE_FUNCTION("pred-tag", PredictedTagFeatureFunction);
			
 
				+
			
 
				 }  // namespace syntaxnet
			
--- a/syntaxnet/syntaxnet/testdata/context.pbtxt
+++ b/syntaxnet/syntaxnet/testdata/context.pbtxt
@@ -61,6 +61,12 @@ input {
 
				   }
			
 
				 }
			
 
				 input {
			
 
				+  name: 'char-map'
			
 
				+  Part {
			
 
				+    file_pattern: 'OUTPATH/char-map'
			
 
				+  }
			
 
				+}
			
 
				+input {
			
 
				   name: 'prefix-table'
			
 
				   Part {
			
 
				     file_pattern: 'OUTPATH/prefix-table'
			
--- a/syntaxnet/syntaxnet/text_formats.cc
+++ b/syntaxnet/syntaxnet/text_formats.cc
@@ -63,6 +63,11 @@ class CoNLLSyntaxFormat : public DocumentFormat {
 
				  public:
			
 
				   CoNLLSyntaxFormat() {}
			
 
				 
			
 
				+  void Setup(TaskContext *context) override {
			
 
				+    join_category_to_pos_ = context->GetBoolParameter("join_category_to_pos");
			
 
				+    add_pos_as_attribute_ = context->GetBoolParameter("add_pos_as_attribute");
			
 
				+  }
			
 
				+
			
 
				   // Reads up to the first empty line and returns false end of file is reached.
			
 
				   bool ReadRecord(tensorflow::io::InputBuffer *buffer,
			
 
				                   string *record) override {
			
@@ -121,6 +126,7 @@ class CoNLLSyntaxFormat : public DocumentFormat {
 
				       const string &word = fields[1];
			
 
				       const string &cpostag = fields[3];
			
 
				       const string &tag = fields[4];
			
 
				+      const string &attributes = fields[5];
			
 
				       const int head = utils::ParseUsing<int>(fields[6], 0, utils::ParseInt32);
			
 
				       const string &label = fields[7];
			
 
				 
			
@@ -139,6 +145,9 @@ class CoNLLSyntaxFormat : public DocumentFormat {
 
				       if (!tag.empty()) token->set_tag(tag);
			
 
				       if (!cpostag.empty()) token->set_category(cpostag);
			
 
				       if (!label.empty()) token->set_label(label);
			
 
				+      if (!attributes.empty()) AddMorphAttributes(attributes, token);
			
 
				+      if (join_category_to_pos_) JoinCategoryToPos(token);
			
 
				+      if (add_pos_as_attribute_) AddPosAsAttribute(token);
			
 
				     }
			
 
				 
			
 
				     if (sentence->token_size() > 0) {
			
@@ -158,16 +167,18 @@ class CoNLLSyntaxFormat : public DocumentFormat {
 
				     *key = sentence.docid();
			
 
				     vector<string> lines;
			
 
				     for (int i = 0; i < sentence.token_size(); ++i) {
			
 
				+      Token token = sentence.token(i);
			
 
				+      if (join_category_to_pos_) SplitCategoryFromPos(&token);
			
 
				+      if (add_pos_as_attribute_) RemovePosFromAttributes(&token);
			
 
				       vector<string> fields(10);
			
 
				       fields[0] = tensorflow::strings::Printf("%d", i + 1);
			
 
				-      fields[1] = sentence.token(i).word();
			
 
				+      fields[1] = token.word();
			
 
				       fields[2] = "_";
			
 
				-      fields[3] = sentence.token(i).category();
			
 
				-      fields[4] = sentence.token(i).tag();
			
 
				-      fields[5] = "_";
			
 
				-      fields[6] =
			
 
				-          tensorflow::strings::Printf("%d", sentence.token(i).head() + 1);
			
 
				-      fields[7] = sentence.token(i).label();
			
 
				+      fields[3] = token.category();
			
 
				+      fields[4] = token.tag();
			
 
				+      fields[5] = GetMorphAttributes(token);
			
 
				+      fields[6] = tensorflow::strings::Printf("%d", token.head() + 1);
			
 
				+      fields[7] = token.label();
			
 
				       fields[8] = "_";
			
 
				       fields[9] = "_";
			
 
				       lines.push_back(utils::Join(fields, "\t"));
			
@@ -176,6 +187,95 @@ class CoNLLSyntaxFormat : public DocumentFormat {
 
				   }
			
 
				 
			
 
				  private:
			
 
				+  // Creates a TokenMorphology object out of a list of attribute values of the
			
 
				+  // form: a1=v1|a2=v2|... or v1|v2|...
			
 
				+  void AddMorphAttributes(const string &attributes, Token *token) {
			
 
				+    TokenMorphology *morph =
			
 
				+        token->MutableExtension(TokenMorphology::morphology);
			
 
				+    vector<string> att_vals = utils::Split(attributes, '|');
			
 
				+    for (int i = 0; i < att_vals.size(); ++i) {
			
 
				+      vector<string> att_val = utils::Split(att_vals[i], '=');
			
 
				+      CHECK_LE(att_val.size(), 2)
			
 
				+          << "Error parsing morphology features "
			
 
				+          << "column, must be of format "
			
 
				+          << "a1=v1|a2=v2|... or v1|v2|... <field>: " << attributes;
			
 
				+
			
 
				+      // Format is either:
			
 
				+      //   1) a1=v1|a2=v2..., e.g., Czech CoNLL data, or,
			
 
				+      //   2) v1|v2|..., e.g., German CoNLL data.
			
 
				+      const pair<string, string> name_value =
			
 
				+          att_val.size() == 2 ? std::make_pair(att_val[0], att_val[1])
			
 
				+                              : std::make_pair(att_val[0], "on");
			
 
				+
			
 
				+      // We currently don't expect an empty attribute value, but might have an
			
 
				+      // empty attribute name due to data input errors.
			
 
				+      if (name_value.second.empty()) {
			
 
				+        LOG(WARNING) << "Invalid attributes string: " << attributes
			
 
				+                     << " for token: " << token->ShortDebugString();
			
 
				+        continue;
			
 
				+      }
			
 
				+      if (!name_value.first.empty()) {
			
 
				+        TokenMorphology::Attribute *attribute = morph->add_attribute();
			
 
				+        attribute->set_name(name_value.first);
			
 
				+        attribute->set_value(name_value.second);
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Creates a list of attribute values of the form a1=v1|a2=v2|... or v1|v2|...
			
 
				+  // from a TokenMorphology object.
			
 
				+  string GetMorphAttributes(const Token &token) {
			
 
				+    const TokenMorphology &morph =
			
 
				+        token.GetExtension(TokenMorphology::morphology);
			
 
				+    if (morph.attribute_size() == 0) return "_";
			
 
				+    string attributes;
			
 
				+    for (const TokenMorphology::Attribute &attribute : morph.attribute()) {
			
 
				+      if (!attributes.empty()) tensorflow::strings::StrAppend(&attributes, "|");
			
 
				+      tensorflow::strings::StrAppend(&attributes, attribute.name());
			
 
				+      if (attribute.value() != "on") {
			
 
				+        tensorflow::strings::StrAppend(&attributes, "=", attribute.value());
			
 
				+      }
			
 
				+    }
			
 
				+    return attributes;
			
 
				+  }
			
 
				+
			
 
				+  void JoinCategoryToPos(Token *token) {
			
 
				+    token->set_tag(
			
 
				+        tensorflow::strings::StrCat(token->category(), "++", token->tag()));
			
 
				+    token->clear_category();
			
 
				+  }
			
 
				+
			
 
				+  void SplitCategoryFromPos(Token *token) {
			
 
				+    const string &tag = token->tag();
			
 
				+    const size_t pos = tag.find("++");
			
 
				+    if (pos != string::npos) {
			
 
				+      token->set_category(tag.substr(0, pos));
			
 
				+      token->set_tag(tag.substr(pos + 2));
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  void AddPosAsAttribute(Token *token) {
			
 
				+    if (!token->tag().empty()) {
			
 
				+      TokenMorphology *morph =
			
 
				+          token->MutableExtension(TokenMorphology::morphology);
			
 
				+      TokenMorphology::Attribute *attribute = morph->add_attribute();
			
 
				+      attribute->set_name("fPOS");
			
 
				+      attribute->set_value(token->tag());
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Drops the trailing "fPOS" morphology attribute from the token, if any.
			
 
				+  // Assumes the "fPOS" attribute, if present, is the last one.
			
 
				+  void RemovePosFromAttributes(Token *token) {
			
 
				+    TokenMorphology *morph =
			
 
				+        token->MutableExtension(TokenMorphology::morphology);
			
 
				+    // A token may carry no attributes at all (e.g. AddPosAsAttribute adds
			
 
				+    // nothing for an empty tag), so guard before inspecting the last one.
			
 
				+    const int last = morph->attribute_size() - 1;
			
 
				+    if (last >= 0 && morph->attribute(last).name() == "fPOS") {
			
 
				+      morph->mutable_attribute()->RemoveLast();
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  bool join_category_to_pos_ = false;
			
 
				+  bool add_pos_as_attribute_ = false;
			
 
				+
			
 
				   TF_DISALLOW_COPY_AND_ASSIGN(CoNLLSyntaxFormat);
			
 
				 };
			
 
				 
			
--- a/syntaxnet/syntaxnet/utils.h
+++ b/syntaxnet/syntaxnet/utils.h
@@ -62,7 +62,7 @@ string Join(const std::vector<T> &s, const char *sep) {
 
				   return result;
			
 
				 }
			
 
				 
			
 
				-string JoinPath(std::initializer_list<StringPiece> paths);
			
 
				+string JoinPath(std::initializer_list<tensorflow::StringPiece> paths);
			
 
				 
			
 
				 size_t RemoveLeadingWhitespace(tensorflow::StringPiece *text);
			
 
				 
			
@@ -165,6 +165,64 @@ class PunctuationUtil {
 
				 
			
 
				 void NormalizeDigits(string *form);
			
 
				 
			
 
				+// Helper type to mark missing c-tor argument types
			
 
				+// for Type's c-tor in LazyStaticPtr<Type, ...>.
			
 
				+struct NoArg {};
			
 
				+
			
 
				+template <typename Type, typename Arg1 = NoArg, typename Arg2 = NoArg,
			
 
				+          typename Arg3 = NoArg>
			
 
				+class LazyStaticPtr {
			
 
				+ public:
			
 
				+  typedef Type element_type;  // per smart pointer convention
			
 
				+
			
 
				+  // Pretend to be a pointer to Type (never NULL due to on-demand creation):
			
 
				+  Type &operator*() const { return *get(); }
			
 
				+  Type *operator->() const { return get(); }
			
 
				+
			
 
				+  // Named accessor/initializer:
			
 
				+  Type *get() const {
			
 
				+    if (!ptr_) Initialize(this);
			
 
				+    return ptr_;
			
 
				+  }
			
 
				+
			
 
				+ public:
			
 
				+  // All the data is public and LazyStaticPtr has no constructors so that we can
			
 
				+  // initialize LazyStaticPtr objects with the "= { arg_value, ... }" syntax.
			
 
				+  // Clients of LazyStaticPtr must not access the data members directly.
			
 
				+
			
 
				+  // Arguments for Type's c-tor
			
 
				+  // (unused NoArg-typed arguments consume either no space, or 1 byte to
			
 
				+  //  ensure address uniqueness):
			
 
				+  Arg1 arg1_;
			
 
				+  Arg2 arg2_;
			
 
				+  Arg3 arg3_;
			
 
				+
			
 
				+  // The object we create and show.
			
 
				+  mutable Type *ptr_;
			
 
				+
			
 
				+ private:
			
 
				+  template <typename A1, typename A2, typename A3>
			
 
				+  static Type *Factory(const A1 &a1, const A2 &a2, const A3 &a3) {
			
 
				+    return new Type(a1, a2, a3);
			
 
				+  }
			
 
				+
			
 
				+  template <typename A1, typename A2>
			
 
				+  static Type *Factory(const A1 &a1, const A2 &a2, NoArg a3) {
			
 
				+    return new Type(a1, a2);
			
 
				+  }
			
 
				+
			
 
				+  template <typename A1>
			
 
				+  static Type *Factory(const A1 &a1, NoArg a2, NoArg a3) {
			
 
				+    return new Type(a1);
			
 
				+  }
			
 
				+
			
 
				+  static Type *Factory(NoArg a1, NoArg a2, NoArg a3) { return new Type(); }
			
 
				+
			
 
				+  static void Initialize(const LazyStaticPtr *lsp) {
			
 
				+    lsp->ptr_ = Factory(lsp->arg1_, lsp->arg2_, lsp->arg3_);
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				 }  // namespace utils
			
 
				 }  // namespace syntaxnet
			
 
				 
			
--- a/syntaxnet/syntaxnet/workspace.h
+++ b/syntaxnet/syntaxnet/workspace.h
@@ -185,6 +185,8 @@ class VectorIntWorkspace : public Workspace {
 
				   // Sets the i'th element.
			
 
				   void set_element(int i, int value) { elements_[i] = value; }
			
 
				 
			
 
				+  int size() const { return elements_.size(); }
			
 
				+
			
 
				  private:
			
 
				   // The enclosed vector.
			
 
				   vector<int> elements_;
			
--- a/syntaxnet/util/utf8/unicodetext.h
+++ b/syntaxnet/util/utf8/unicodetext.h
@@ -462,6 +462,12 @@ inline string UnicodeTextToUTF8(const UnicodeText& t) {
 
				   return string(t.utf8_data(), t.utf8_length());
			
 
				 }
			
 
				 
			
 
				+// This template function declaration is used in defining arraysize.
			
 
				+// Note that the function doesn't need an implementation, as we only
			
 
				+// use its type.
			
 
				+template <typename T, size_t N>
			
 
				+char (&ArraySizeHelper(T (&array)[N]))[N];
			
 
				+#define arraysize(array) (sizeof(ArraySizeHelper(array)))
			
 
				 
			
 
				 // For debugging.  Return a string of integers, written in uppercase
			
 
				 // hex (%X), corresponding to the codepoints within the text. Each
			
--- a/syntaxnet/util/utf8/unicodetext_unittest.cc
+++ b/syntaxnet/util/utf8/unicodetext_unittest.cc
@@ -25,10 +25,6 @@
 
				 
			
 
				 namespace {
			
 
				 
			
 
				-template <typename T, size_t N>
			
 
				-char (&ArraySizeHelper(T (&array)[N]))[N];
			
 
				-#define arraysize(array) (sizeof(ArraySizeHelper(array)))
			
 
				-
			
 
				 class UnicodeTextTest : public testing::Test {
			
 
				  protected:
			
 
				   UnicodeTextTest() : empty_text_() {
			
--- a/syntaxnet/util/utf8/unilib_utf8_utils.h
+++ b/syntaxnet/util/utf8/unilib_utf8_utils.h
@@ -21,6 +21,7 @@
 
				 // They are also exported from unilib.h for legacy reasons.
			
 
				 
			
 
				 #include "syntaxnet/base.h"
			
 
				+#include "third_party/utf/utf.h"
			
 
				 
			
 
				 namespace UniLib {
			
 
				 
			
@@ -32,6 +33,19 @@ inline bool IsValidCodepoint(char32 c) {
 
				     || (c >= 0xE000 && c <= 0x10FFFF);
			
 
				 }
			
 
				 
			
 
				+// Returns true if 'str' is the start of a structurally valid UTF-8
			
 
				+// sequence and is not a surrogate codepoint. Returns false if str.empty()
			
 
				+// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function
			
 
				+// will access n bytes of str (1 to 4), where n is UniLib::OneCharLen(str[0]).
			
 
				+inline bool IsUTF8ValidCodepoint(StringPiece str) {
			
 
				+  char32 c;
			
 
				+  int consumed;
			
 
				+  // It's OK if str.length() > consumed.
			
 
				+  return !str.empty()
			
 
				+      && isvalidcharntorune(str.data(), str.size(), &c, &consumed)
			
 
				+      && IsValidCodepoint(c);
			
 
				+}
			
 
				+
			
 
				 // Returns the length (number of bytes) of the Unicode code point
			
 
				 // starting at src, based on inspecting just that one byte. This
			
 
				 // requires that src point to a well-formed UTF-8 string; the result