
Scripts and documentation for syntaxnet support for new languages (#308)

* Adding scripts for tokenization and parsing in new languages.
calberti 9 years ago
parent
commit
ddf8a3af32

+ 4 - 0
syntaxnet/README.md

@@ -3,6 +3,10 @@
 *A TensorFlow implementation of the models described in [Andor et al. (2016)]
 (http://arxiv.org/pdf/1603.06042v1.pdf).*
 
+**Update**: Parsey models are now [available](universal.md) for 40 languages
+trained on Universal Dependencies datasets, with support for text segmentation
+and morphological analysis.
+
 At Google, we spend a lot of time thinking about how computer systems can read
 and understand human language in order to process it in intelligent ways. We are
 excited to share the fruits of our research with the broader community by

+ 64 - 0
syntaxnet/syntaxnet/models/parsey_universal/context-tokenize-zh.pbtxt

@@ -0,0 +1,64 @@
+Parameter {
+  name: "brain_tokenizer_zh_embedding_dims"
+  value: "32;32"
+}
+Parameter {
+  name: "brain_tokenizer_zh_embedding_names"
+  value: "chars;words"
+}
+Parameter {
+  name: "brain_tokenizer_zh_features"
+  value: "input.char "
+         "input(1).char "
+         "input(2).char "
+         "input(3).char "
+         "input(-1).char "
+         "input(-2).char "
+         "input(-3).char "
+         "stack.char "
+         "stack.offset(1).char "
+         "stack.offset(-1).char "
+         "stack(1).char "
+         "stack(1).offset(1).char "
+         "stack(1).offset(-1).char "
+         "stack(2).char; "
+         "last-word(1,min-freq=2) "
+         "last-word(2,min-freq=2) "
+         "last-word(3,min-freq=2)"
+}
+Parameter {
+  name: "brain_tokenizer_zh_transition_system"
+  value: "binary-segment-transitions"
+}
+input {
+  name: "word-map"
+  Part {
+    file_pattern: "last-word-map"
+  }
+}
+input {
+  name: "char-map"
+  Part {
+    file_pattern: "char-map"
+  }
+}
+input {
+  name: "label-map"
+  Part {
+    file_pattern: "label-map"
+  }
+}
+input {
+  name: 'stdin-untoken'
+  record_format: 'untokenized-text'
+  Part {
+    file_pattern: '-'
+  }
+}
+input {
+  name: 'stdout-conll'
+  record_format: 'conll-sentence'
+  Part {
+    file_pattern: '-'
+  }
+}

+ 362 - 0
syntaxnet/syntaxnet/models/parsey_universal/context.pbtxt

@@ -0,0 +1,362 @@
+Parameter {
+  name: "brain_tokenizer_embedding_dims"
+  value: "16;16;16"
+}
+Parameter {
+  name: "brain_tokenizer_embedding_names"
+  value: "chars;digits;puncts"
+}
+Parameter {
+  name: "brain_tokenizer_features"
+  value:  "input.char "
+          "input(-1).char "
+          "input(1).char; "
+          "input.digit "
+          "input(-1).digit "
+          "input(1).digit; "
+          "input.punctuation-amount "
+          "input(-1).punctuation-amount "
+          "input(1).punctuation-amount "
+}
+Parameter {
+  name: "brain_tokenizer_transition_system"
+  value: "binary-segment-transitions"
+}
+Parameter {
+  name: "brain_morpher_embedding_dims"
+  value: "2;16;8;16;16;16;16;16;64"
+}
+Parameter {
+  name: "brain_morpher_embedding_names"
+  value: "capitalization;char_ngram;other;prefix2;prefix3;suffix2;suffix3;tags;words"
+}
+Parameter {
+  name: "brain_morpher_features"
+  value: "input.capitalization "
+         "input(1).capitalization "
+         "input(2).capitalization "
+         "input(3).capitalization "
+         "input(-1).capitalization "
+         "input(-2).capitalization "
+         "input(-3).capitalization "
+         "input(-4).capitalization; "
+         "input.token.char-ngram "
+         "input(1).token.char-ngram "
+         "input(2).token.char-ngram "
+         "input(3).token.char-ngram "
+         "input(-1).token.char-ngram "
+         "input(-2).token.char-ngram "
+         "input(-3).token.char-ngram "
+         "input(-4).token.char-ngram; "
+         "input.digit "
+         "input.hyphen "
+         "input.token.punctuation-amount "
+         "input.token.quote; "
+         "input.token.prefix(length=2) "
+         "input(1).token.prefix(length=2) "
+         "input(2).token.prefix(length=2) "
+         "input(3).token.prefix(length=2) "
+         "input(-1).token.prefix(length=2) "
+         "input(-2).token.prefix(length=2) "
+         "input(-3).token.prefix(length=2) "
+         "input(-4).token.prefix(length=2); "
+         "input.token.prefix(length=3) "
+         "input(1).token.prefix(length=3) "
+         "input(2).token.prefix(length=3) "
+         "input(3).token.prefix(length=3) "
+         "input(-1).token.prefix(length=3) "
+         "input(-2).token.prefix(length=3) "
+         "input(-3).token.prefix(length=3) "
+         "input(-4).token.prefix(length=3); "
+         "input.token.suffix(length=2) "
+         "input(1).token.suffix(length=2) "
+         "input(2).token.suffix(length=2) "
+         "input(3).token.suffix(length=2) "
+         "input(-1).token.suffix(length=2) "
+         "input(-2).token.suffix(length=2) "
+         "input(-3).token.suffix(length=2) "
+         "input(-4).token.suffix(length=2); "
+         "input.token.suffix(length=3) "
+         "input(1).token.suffix(length=3) "
+         "input(2).token.suffix(length=3) "
+         "input(3).token.suffix(length=3) "
+         "input(-1).token.suffix(length=3) "
+         "input(-2).token.suffix(length=3) "
+         "input(-3).token.suffix(length=3) "
+         "input(-4).token.suffix(length=3); "
+         "input(-1).pred-morph-tag "
+         "input(-2).pred-morph-tag "
+         "input(-3).pred-morph-tag "
+         "input(-4).pred-morph-tag; "
+         "input.token.word "
+         "input(1).token.word "
+         "input(2).token.word "
+         "input(3).token.word "
+         "input(-1).token.word "
+         "input(-2).token.word "
+         "input(-3).token.word "
+         "input(-4).token.word"
+}
+Parameter {
+  name: "brain_morpher_transition_system"
+  value: "morpher"
+}
+Parameter {
+  name: "brain_tagger_embedding_dims"
+  value: "2;16;8;16;16;16;16;16;64"
+}
+Parameter {
+  name: "brain_tagger_embedding_names"
+  value: "capitalization;char_ngram;other;prefix2;prefix3;suffix2;suffix3;tags;words"
+}
+Parameter {
+  name: "brain_tagger_features"
+  value: "input.capitalization "
+         "input(1).capitalization "
+         "input(2).capitalization "
+         "input(3).capitalization "
+         "input(-1).capitalization "
+         "input(-2).capitalization "
+         "input(-3).capitalization "
+         "input(-4).capitalization; "
+         "input.token.char-ngram "
+         "input(1).token.char-ngram "
+         "input(2).token.char-ngram "
+         "input(3).token.char-ngram "
+         "input(-1).token.char-ngram "
+         "input(-2).token.char-ngram "
+         "input(-3).token.char-ngram "
+         "input(-4).token.char-ngram; "
+         "input.digit "
+         "input.hyphen "
+         "input.token.punctuation-amount "
+         "input.token.quote; "
+         "input.token.prefix(length=2) "
+         "input(1).token.prefix(length=2) "
+         "input(2).token.prefix(length=2) "
+         "input(3).token.prefix(length=2) "
+         "input(-1).token.prefix(length=2) "
+         "input(-2).token.prefix(length=2) "
+         "input(-3).token.prefix(length=2) "
+         "input(-4).token.prefix(length=2); "
+         "input.token.prefix(length=3) "
+         "input(1).token.prefix(length=3) "
+         "input(2).token.prefix(length=3) "
+         "input(3).token.prefix(length=3) "
+         "input(-1).token.prefix(length=3) "
+         "input(-2).token.prefix(length=3) "
+         "input(-3).token.prefix(length=3) "
+         "input(-4).token.prefix(length=3); "
+         "input.token.suffix(length=2) "
+         "input(1).token.suffix(length=2) "
+         "input(2).token.suffix(length=2) "
+         "input(3).token.suffix(length=2) "
+         "input(-1).token.suffix(length=2) "
+         "input(-2).token.suffix(length=2) "
+         "input(-3).token.suffix(length=2) "
+         "input(-4).token.suffix(length=2); "
+         "input.token.suffix(length=3) "
+         "input(1).token.suffix(length=3) "
+         "input(2).token.suffix(length=3) "
+         "input(3).token.suffix(length=3) "
+         "input(-1).token.suffix(length=3) "
+         "input(-2).token.suffix(length=3) "
+         "input(-3).token.suffix(length=3) "
+         "input(-4).token.suffix(length=3); "
+         "input(-1).pred-tag "
+         "input(-2).pred-tag "
+         "input(-3).pred-tag "
+         "input(-4).pred-tag; "
+         "input.token.word "
+         "input(1).token.word "
+         "input(2).token.word "
+         "input(3).token.word "
+         "input(-1).token.word "
+         "input(-2).token.word "
+         "input(-3).token.word "
+         "input(-4).token.word"
+}
+Parameter {
+  name: "brain_tagger_transition_system"
+  value: "tagger"
+}
+Parameter {
+  name: "brain_parser_embedding_dims"
+  value: "32;32;32;64"
+}
+Parameter {
+  name: "brain_parser_embedding_names"
+  value: "labels;morphology;tags;words"
+}
+Parameter {
+  name: "brain_parser_features"
+  value: "stack.child(1).label "
+         "stack.child(1).sibling(-1).label "
+         "stack.child(-1).label "
+         "stack.child(-1).sibling(1).label "
+         "stack.child(2).label "
+         "stack.child(-2).label "
+         "stack(1).child(1).label "
+         "stack(1).child(1).sibling(-1).label "
+         "stack(1).child(-1).label "
+         "stack(1).child(-1).sibling(1).label "
+         "stack(1).child(2).label "
+         "stack(1).child(-2).label; "
+         "input.token.morphology-set "
+         "input(1).token.morphology-set "
+         "input(2).token.morphology-set "
+         "input(3).token.morphology-set "
+         "stack.token.morphology-set "
+         "stack.child(1).token.morphology-set "
+         "stack.child(1).sibling(-1).token.morphology-set "
+         "stack.child(-1).token.morphology-set "
+         "stack.child(-1).sibling(1).token.morphology-set "
+         "stack.child(2).token.morphology-set "
+         "stack.child(-2).token.morphology-set "
+         "stack(1).token.morphology-set "
+         "stack(1).child(1).token.morphology-set "
+         "stack(1).child(1).sibling(-1).token.morphology-set "
+         "stack(1).child(-1).token.morphology-set "
+         "stack(1).child(-1).sibling(1).token.morphology-set "
+         "stack(1).child(2).token.morphology-set "
+         "stack(1).child(-2).token.morphology-set "
+         "stack(2).token.morphology-set "
+         "stack(3).token.morphology-set; "
+         "input.token.tag "
+         "input(1).token.tag "
+         "input(2).token.tag "
+         "input(3).token.tag "
+         "stack.token.tag "
+         "stack.child(1).token.tag "
+         "stack.child(1).sibling(-1).token.tag "
+         "stack.child(-1).token.tag "
+         "stack.child(-1).sibling(1).token.tag "
+         "stack.child(2).token.tag "
+         "stack.child(-2).token.tag "
+         "stack(1).token.tag "
+         "stack(1).child(1).token.tag "
+         "stack(1).child(1).sibling(-1).token.tag "
+         "stack(1).child(-1).token.tag "
+         "stack(1).child(-1).sibling(1).token.tag "
+         "stack(1).child(2).token.tag "
+         "stack(1).child(-2).token.tag "
+         "stack(2).token.tag "
+         "stack(3).token.tag; "
+         "input.token.word "
+         "input(1).token.word "
+         "input(2).token.word "
+         "input(3).token.word "
+         "stack.token.word "
+         "stack.child(1).token.word "
+         "stack.child(1).sibling(-1).token.word "
+         "stack.child(-1).token.word "
+         "stack.child(-1).sibling(1).token.word "
+         "stack.child(2).token.word "
+         "stack.child(-2).token.word "
+         "stack(1).token.word "
+         "stack(1).child(1).token.word "
+         "stack(1).child(1).sibling(-1).token.word "
+         "stack(1).child(-1).token.word "
+         "stack(1).child(-1).sibling(1).token.word "
+         "stack(1).child(2).token.word "
+         "stack(1).child(-2).token.word "
+         "stack(2).token.word "
+         "stack(3).token.word "
+}
+Parameter {
+  name: "brain_parser_transition_system"
+  value: "arc-standard"
+}
+Parameter {
+  name: "join_category_to_pos"
+  value: "true"
+}
+input {
+  name: "word-map"
+  Part {
+    file_pattern: "word-map"
+  }
+}
+input {
+  name: "char-map"
+  Part {
+    file_pattern: "char-map"
+  }
+}
+input {
+  name: "tag-map"
+  Part {
+    file_pattern: "tag-map"
+  }
+}
+
+input {
+  name: "tag-to-category"
+  Part {
+    file_pattern: "tag-to-category"
+  }
+}
+input {
+  name: "label-map"
+  Part {
+    file_pattern: "label-map"
+  }
+}
+input {
+  name: "char-ngram-map"
+  Part {
+    file_pattern: "char-ngram-map"
+  }
+}
+input {
+  name: "prefix-table"
+  Part {
+    file_pattern: "prefix-table"
+  }
+}
+input {
+  name: "suffix-table"
+  Part {
+    file_pattern: "suffix-table"
+  }
+}
+input {
+  name: "morph-label-set"
+  Part {
+    file_pattern: "morph-label-set"
+  }
+}
+input {
+  name: "morphology-map"
+  Part {
+    file_pattern: "morphology-map"
+  }
+}
+input {
+  name: 'stdin'
+  record_format: 'tokenized-text'
+  Part {
+    file_pattern: '-'
+  }
+}
+input {
+  name: 'stdin-conll'
+  record_format: 'conll-sentence'
+  Part {
+    file_pattern: '-'
+  }
+}
+input {
+  name: 'stdin-untoken'
+  record_format: 'untokenized-text'
+  Part {
+    file_pattern: '-'
+  }
+}
+input {
+  name: 'stdout-conll'
+  record_format: 'conll-sentence'
+  Part {
+    file_pattern: '-'
+  }
+}

+ 68 - 0
syntaxnet/syntaxnet/models/parsey_universal/parse.sh

@@ -0,0 +1,68 @@
+# A script that runs a morphological analyzer, a part-of-speech tagger and a
+# dependency parser on a text file, with one sentence per line.
+#
+# Example usage:
+#  bazel build syntaxnet:parser_eval
+#  cat sentences.txt |
+#    syntaxnet/models/parsey_universal/parse.sh \
+#    $MODEL_DIRECTORY > output.conll
+#
+# To run on a conll formatted file, add the --conll command line argument:
+#  cat sentences.conll |
+#    syntaxnet/models/parsey_universal/parse.sh \
+#    --conll $MODEL_DIRECTORY > output.conll
+#
+# Models can be downloaded from
+#  http://download.tensorflow.org/models/parsey_universal/<language>.zip
+# for the languages listed at
+#  https://github.com/tensorflow/models/blob/master/syntaxnet/universal.md
+#
+
+PARSER_EVAL=bazel-bin/syntaxnet/parser_eval
+CONTEXT=syntaxnet/models/parsey_universal/context.pbtxt
+if [[ "$1" == "--conll" ]]; then
+  INPUT_FORMAT=stdin-conll
+  shift
+else
+  INPUT_FORMAT=stdin
+fi
+MODEL_DIR=$1
+
+$PARSER_EVAL \
+  --input=$INPUT_FORMAT \
+  --output=stdout-conll \
+  --hidden_layer_sizes=64 \
+  --arg_prefix=brain_morpher \
+  --graph_builder=structured \
+  --task_context=$CONTEXT \
+  --resource_dir=$MODEL_DIR \
+  --model_path=$MODEL_DIR/morpher-params \
+  --slim_model \
+  --batch_size=1024 \
+  --alsologtostderr \
+  | \
+  $PARSER_EVAL \
+  --input=stdin-conll \
+  --output=stdout-conll \
+  --hidden_layer_sizes=64 \
+  --arg_prefix=brain_tagger \
+  --graph_builder=structured \
+  --task_context=$CONTEXT \
+  --resource_dir=$MODEL_DIR \
+  --model_path=$MODEL_DIR/tagger-params \
+  --slim_model \
+  --batch_size=1024 \
+  --alsologtostderr \
+  | \
+  $PARSER_EVAL \
+  --input=stdin-conll \
+  --output=stdout-conll \
+  --hidden_layer_sizes=512,512 \
+  --arg_prefix=brain_parser \
+  --graph_builder=structured \
+  --task_context=$CONTEXT \
+  --resource_dir=$MODEL_DIR \
+  --model_path=$MODEL_DIR/parser-params \
+  --slim_model \
+  --batch_size=1024 \
+  --alsologtostderr

+ 31 - 0
syntaxnet/syntaxnet/models/parsey_universal/tokenize.sh

@@ -0,0 +1,31 @@
+# A script that runs a tokenizer on a text file with one sentence per line.
+#
+# Example usage:
+#  bazel build syntaxnet:parser_eval
+#  cat untokenized-sentences.txt |
+#    syntaxnet/models/parsey_universal/tokenize.sh \
+#    $MODEL_DIRECTORY > output.conll
+#
+# Models can be downloaded from
+#  http://download.tensorflow.org/models/parsey_universal/<language>.zip
+# for the languages listed at
+#  https://github.com/tensorflow/models/blob/master/syntaxnet/universal.md
+#
+
+PARSER_EVAL=bazel-bin/syntaxnet/parser_eval
+CONTEXT=syntaxnet/models/parsey_universal/context.pbtxt
+INPUT_FORMAT=stdin-untoken
+MODEL_DIR=$1
+
+$PARSER_EVAL \
+  --input=$INPUT_FORMAT \
+  --output=stdin-untoken \
+  --hidden_layer_sizes=128,128 \
+  --arg_prefix=brain_tokenizer \
+  --graph_builder=greedy \
+  --task_context=$CONTEXT \
+  --resource_dir=$MODEL_DIR \
+  --model_path=$MODEL_DIR/tokenizer-params \
+  --batch_size=32 \
+  --alsologtostderr \
+  --slim_model

+ 30 - 0
syntaxnet/syntaxnet/models/parsey_universal/tokenize_zh.sh

@@ -0,0 +1,30 @@
+# A script that runs a traditional Chinese tokenizer on a text file with one
+# sentence per line.
+#
+# Example usage:
+#  bazel build syntaxnet:parser_eval
+#  cat untokenized-sentences.txt |
+#    syntaxnet/models/parsey_universal/tokenize_zh.sh \
+#    $MODEL_DIRECTORY > output.conll
+#
+# The traditional Chinese model can be downloaded from
+#  http://download.tensorflow.org/models/parsey_universal/Chinese.zip
+#
+
+PARSER_EVAL=bazel-bin/syntaxnet/parser_eval
+CONTEXT=syntaxnet/models/parsey_universal/context-tokenize-zh.pbtxt
+INPUT_FORMAT=stdin-untoken
+MODEL_DIR=$1
+
+$PARSER_EVAL \
+  --input=$INPUT_FORMAT \
+  --output=stdin-untoken \
+  --hidden_layer_sizes=256,256 \
+  --arg_prefix=brain_tokenizer_zh \
+  --graph_builder=structured \
+  --task_context=$CONTEXT \
+  --resource_dir=$MODEL_DIR \
+  --model_path=$MODEL_DIR/tokenizer-params \
+  --batch_size=1024 \
+  --alsologtostderr \
+  --slim_model

+ 104 - 0
syntaxnet/universal.md

@@ -0,0 +1,104 @@
+# Parsey Universal.
+
+A collection of pretrained syntactic models is now available for download at
+`http://download.tensorflow.org/models/parsey_universal/<language>.zip`.
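
For example, fetching a single model could look like the following minimal sketch; the language name `French` is just one entry from the table below, and the assumption that the archive unpacks into a directory of the same name may not hold for every model.

```shell
# Download and unpack one Parsey Universal model (language name is an example).
LANGUAGE=French
wget http://download.tensorflow.org/models/parsey_universal/${LANGUAGE}.zip
unzip ${LANGUAGE}.zip
MODEL_DIRECTORY=$PWD/${LANGUAGE}   # assumed layout: the zip extracts into ./French
```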
+
+After downloading and unzipping a model, you can run it similarly to
+Parsey McParseface with:
+
+```shell
+  MODEL_DIRECTORY=/where/you/unzipped/the/model/files
+  cat sentences.txt | syntaxnet/models/parsey_universal/parse.sh \
+    $MODEL_DIRECTORY > output.conll
+```
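
The parser writes CoNLL-formatted tokens to `output.conll`, one token per line with tab-separated columns. As a rough way to skim the result, one could project a few of the standard columns (a sketch; column positions assume the usual CoNLL layout):

```shell
# Print id, word form, coarse POS, head index, and dependency label per token.
awk -F'\t' 'NF > 1 {print $1, $2, $4, $7, $8}' output.conll | head -n 20
```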
+
+These models are trained on
+[Universal Dependencies](http://universaldependencies.org/) datasets v1.3.
+The following table shows their accuracy on Universal
+Dependencies test sets for different types of annotations.
+
+Language | No. tokens | POS | fPOS | Morph | UAS | LAS
+-------- | :--: | :--: | :--: | :--: | :--: | :--:
+Ancient_Greek-PROIEL | 18502 | 97.14% | 96.97% | 89.77% | 78.74% | 73.15%
+Ancient_Greek | 25251 | 93.22% | 84.22% | 90.01% | 68.98% | 62.07%
+Arabic | 28268 | 95.65% | 91.03% | 91.23% | 81.49% | 75.82%
+Basque | 24374 | 94.88% | - | 87.82% | 78.00% | 73.36%
+Bulgarian | 15734 | 97.71% | 95.14% | 94.61% | 89.35% | 85.01%
+Catalan | 59503 | 98.06% | 98.06% | 97.56% | 90.47% | 87.64%
+Chinese | 12012 | 91.32% | 90.89% | 98.76% | 76.71% | 71.24%
+Croatian | 4125 | 94.67% | - | 86.69% | 80.65% | 74.06%
+Czech-CAC | 10862 | 98.11% | 92.43% | 91.43% | 87.28% | 83.44%
+Czech-CLTT | 4105 | 95.79% | 87.36% | 86.33% | 77.34% | 73.40%
+Czech | 173920 | 98.12% | 93.76% | 93.13% | 89.47% | 85.93%
+Danish | 5884 | 95.28% | - | 95.24% | 79.84% | 76.34%
+Dutch-LassySmall | 4562 | 95.62% | - | 95.44% | 81.63% | 78.08%
+Dutch | 5843 | 89.89% | 86.03% | 89.12% | 77.70% | 71.21%
+English-LinES | 8481 | 95.34% | 93.11% | - | 81.50% | 77.37%
+English | 25096 | 90.48% | 89.71% | 91.30% | 84.79% | 80.38%
+Estonian | 23670 | 95.92% | 96.76% | 92.73% | 83.10% | 78.83%
+Finnish-FTB | 16286 | 93.50% | 91.15% | 92.44% | 84.97% | 80.48%
+Finnish | 9140 | 94.78% | 95.84% | 92.42% | 83.65% | 79.60%
+French | 7018 | 96.27% | - | 96.05% | 84.68% | 81.05%
+Galician | 29746 | 96.81% | 96.14% | - | 84.48% | 81.35%
+German | 16268 | 91.79% | - | - | 79.73% | 74.07%
+Gothic | 5158 | 95.58% | 96.03% | 87.32% | 79.33% | 71.69%
+Greek | 5668 | 97.48% | 97.48% | 92.70% | 83.68% | 79.99%
+Hebrew | 12125 | 95.04% | 95.04% | 92.05% | 84.61% | 78.71%
+Hindi | 35430 | 96.45% | 95.77% | 90.98% | 93.04% | 89.32%
+Hungarian | 4235 | 94.00% | - | 75.68% | 78.75% | 71.83%
+Indonesian | 11780 | 92.62% | - | - | 80.03% | 72.99%
+Irish | 3821 | 91.34% | 89.95% | 77.07% | 74.51% | 66.29%
+Italian | 10952 | 97.31% | 97.18% | 97.27% | 89.81% | 87.13%
+Kazakh | 587 | 75.47% | 75.13% | - | 58.09% | 43.95%
+Latin-ITTB | 6548 | 97.98% | 92.68% | 93.52% | 84.22% | 81.17%
+Latin-PROIEL | 14906 | 96.50% | 96.08% | 88.39% | 77.60% | 70.98%
+Latin | 4832 | 88.04% | 74.07% | 76.03% | 56.00% | 45.80%
+Latvian | 3985 | 80.95% | 66.60% | 73.60% | 58.92% | 51.47%
+Norwegian | 30034 | 97.44% | - | 95.58% | 88.61% | 86.22%
+Old_Church_Slavonic | 5079 | 96.50% | 96.28% | 89.43% | 84.86% | 78.85%
+Persian | 16022 | 96.20% | 95.72% | 95.90% | 84.42% | 80.28%
+Polish | 7185 | 95.05% | 85.83% | 86.12% | 88.30% | 82.71%
+Portuguese-BR | 29438 | 97.07% | 97.07% | 99.91% | 87.91% | 85.44%
+Portuguese | 6262 | 96.81% | 90.67% | 94.22% | 85.12% | 81.28%
+Romanian | 18375 | 95.26% | 91.66% | 91.98% | 83.64% | 75.36%
+Russian-SynTagRus | 107737 | 98.27% | - | 94.91% | 91.68% | 87.44%
+Russian | 9573 | 95.27% | 95.02% | 87.75% | 81.75% | 77.71%
+Slovenian-SST | 2951 | 90.00% | 84.48% | 84.38% | 65.06% | 56.96%
+Slovenian | 14063 | 96.22% | 90.46% | 90.35% | 87.71% | 84.60%
+Spanish-AnCora | 53594 | 98.28% | 98.28% | 97.82% | 89.26% | 86.50%
+Spanish | 7953 | 95.27% | - | 95.74% | 85.06% | 81.53%
+Swedish-LinES | 8228 | 96.00% | 93.77% | - | 81.38% | 77.21%
+Swedish | 20377 | 96.27% | 94.13% | 94.14% | 83.84% | 80.28%
+Tamil | 1989 | 79.29% | 71.79% | 75.97% | 64.45% | 55.35%
+Turkish | 8616 | 93.63% | 92.62% | 86.79% | 82.00% | 71.37%
+**Average** | - | 94.27% | 92.93% | 90.38% | 81.12% | 75.85%
+
+These results are obtained using gold text segmentation. Accuracies are measured
+over all tokens, including punctuation. `POS` and `fPOS` are coarse and fine
+part-of-speech tagging accuracies, respectively. `Morph` is the full-token accuracy
+of the predicted morphological attributes. `UAS` and `LAS` are unlabeled and labeled
+attachment scores.
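
For a concrete reading of `UAS`/`LAS`, the sketch below compares a predicted file against a gold file token by token, assuming identical tokenization; `gold.conll` and `output.conll` are placeholder file names, with head and label assumed to sit in the 7th and 8th tab-separated columns.

```shell
# Count how often the predicted head (and head+label) matches the gold annotation.
paste gold.conll output.conll | awk -F'\t' '
  NF >= 18 {                      # token lines only; blank separators have few fields
    total++
    if ($7 == $17) {              # unlabeled attachment: heads agree
      uas++
      if ($8 == $18) las++        # labeled attachment: heads and labels agree
    }
  }
  END { printf "UAS: %.2f%%  LAS: %.2f%%\n", 100 * uas / total, 100 * las / total }'
```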
+
+Many of these models also support text segmentation, with:
+
+```shell
+  MODEL_DIRECTORY=/where/you/unzipped/the/model/files
+  cat sentences.txt | syntaxnet/models/parsey_universal/tokenize.sh \
+    $MODEL_DIRECTORY > output.conll
+```
+
+Text segmentation is currently available for:
+`Bulgarian`, `Czech`, `German`, `Greek`, `English`, `Spanish`, `Estonian`,
+`Basque`, `Persian`, `Finnish`, `Finnish-FTB`, `French`, `Galician`,
+`Ancient_Greek`, `Ancient_Greek-PROIEL`, `Hebrew`, `Hindi`, `Croatian`,
+`Hungarian`, `Indonesian`, `Italian`, `Latin`, `Latin-PROIEL`, `Dutch`,
+`Norwegian`, `Polish`, `Portuguese`, `Slovenian`, `Swedish`, `Tamil`.
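
Since the segmenter writes CoNLL to stdout and `parse.sh` accepts CoNLL input via its `--conll` flag, the two steps can in principle be chained; the pipeline below is an untested sketch that reuses the same model directory for both stages.

```shell
cat untokenized-sentences.txt |
  syntaxnet/models/parsey_universal/tokenize.sh $MODEL_DIRECTORY |
  syntaxnet/models/parsey_universal/parse.sh --conll $MODEL_DIRECTORY > output.conll
```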
+
+For `Chinese` (traditional) we use a larger text segmentation
+model, which can be run with:
+
+```shell
+  MODEL_DIRECTORY=/where/you/unzipped/the/model/files
+  cat sentences.txt | syntaxnet/models/parsey_universal/tokenize_zh.sh \
+    $MODEL_DIRECTORY > output.conll
+```