// sentence.proto
  // Protocol buffer specification for document analysis.
  syntax = "proto2";

  package syntaxnet;
  // A Sentence contains the raw text contents of a sentence, as well as an
  // analysis.
  message Sentence {
    // Identifier for document.
    optional string docid = 1;

    // Raw text contents of the sentence.
    optional string text = 2;

    // Tokenization of the sentence.
    repeated Token token = 3;

    // Field numbers from 1000 upward are left open so that other definitions
    // can attach extension fields to Sentence.
    extensions 1000 to max;
  }
  // A document token marks a span of bytes in the document text as a token
  // or word.
  message Token {
    // Token word form.
    // Declared `required`: a Token without it is reported as uninitialized by
    // proto2 runtimes. The label is kept as-is because relaxing it changes the
    // contract that existing readers rely on.
    required string word = 1;

    // Start position of token in text.
    // NOTE(review): presumably a byte offset, like `end` -- confirm.
    required int32 start = 2;

    // End position of token in text. Gives index of last byte, not one past
    // the last byte (i.e. the span is inclusive at both ends). If token came
    // from lexer, excludes any trailing HTML tags.
    required int32 end = 3;

    // Head of this token in the dependency tree: the id of the token which has
    // an arc going to this one. If it is the root token of a sentence, then it
    // is set to -1.
    // An unset head reads back as -1 through the default and costs nothing on
    // the wire; an explicitly written negative int32 is encoded as a 10-byte
    // varint. The type cannot be switched to sint32 without breaking wire
    // compatibility.
    optional int32 head = 4 [default = -1];

    // Part-of-speech tag for token.
    optional string tag = 5;

    // Coarse-grained word category for token.
    optional string category = 6;

    // Label for dependency relation between this token and its head.
    optional string label = 7;

    // Break level for tokens that indicates how it was separated from the
    // previous token in the text.
    enum BreakLevel {
      // No separation between tokens.
      NO_BREAK = 0;
      // Tokens separated by space.
      SPACE_BREAK = 1;
      // Tokens separated by line break.
      LINE_BREAK = 2;
      // Tokens separated by sentence break.
      SENTENCE_BREAK = 3;
    }

    // How this token was separated from the previous one. When the field is
    // unset, readers see SPACE_BREAK (not the zero value NO_BREAK).
    optional BreakLevel break_level = 8 [default = SPACE_BREAK];

    // Field numbers from 1000 upward are left open for extensions of Token.
    extensions 1000 to max;
  }
  // Stores information about the morphology of a token.
  message TokenMorphology {
    // Attaches a TokenMorphology to Token as an extension field. Because the
    // extension is declared inside this message, its name is scoped to it
    // (TokenMorphology.morphology within package syntaxnet).
    extend Token {
      optional TokenMorphology morphology = 63949837;
    }

    // Morphology is represented by a set of attribute values.
    // Each Attribute is one name/value pair; both parts are required.
    message Attribute {
      required string name = 1;
      required string value = 2;
    }

    // This attribute field is designated to hold a single disambiguated
    // analysis.
    // NOTE(review): field numbers 1 and 2 are not used in this message as
    // shown; confirm they were never assigned before reusing them.
    repeated Attribute attribute = 3;
  }