#!/usr/bin/env bash
# A script that runs a tokenizer on a text file with one sentence per line.
#
# Example usage:
#   bazel build syntaxnet:parser_eval
#   cat untokenized-sentences.txt |
#     syntaxnet/models/parsey_universal/tokenize.sh \
#     $MODEL_DIRECTORY > output.conll
#
# Models can be downloaded from
#   http://download.tensorflow.org/models/parsey_universal/<language>.zip
# for the languages listed at
#   https://github.com/tensorflow/models/blob/master/syntaxnet/universal.md
#
  14. PARSER_EVAL=bazel-bin/syntaxnet/parser_eval
  15. CONTEXT=syntaxnet/models/parsey_universal/context.pbtxt
  16. INPUT_FORMAT=stdin-untoken
  17. MODEL_DIR=$1
  18. $PARSER_EVAL \
  19. --input=$INPUT_FORMAT \
  20. --output=stdin-untoken \
  21. --hidden_layer_sizes=128,128 \
  22. --arg_prefix=brain_tokenizer \
  23. --graph_builder=greedy \
  24. --task_context=$CONTEXT \
  25. --resource_dir=$MODEL_DIR \
  26. --model_path=$MODEL_DIR/tokenizer-params \
  27. --batch_size=32 \
  28. --alsologtostderr \
  29. --slim_model