// Protocol buffer specification for document analysis.

syntax = "proto2";

package syntaxnet;

// A Sentence contains the raw text contents of a sentence, as well as an
// analysis of that text, stored as a sequence of Token messages.
message Sentence {
  // Identifier for the document.
  optional string docid = 1;

  // Raw text contents of the sentence.
  optional string text = 2;

  // Tokenization of the sentence, one entry per token.
  repeated Token token = 3;

  // Field numbers from 1000 upward are left open so that other definitions
  // can attach additional fields to Sentence via `extend Sentence { ... }`.
  extensions 1000 to max;
}

// A document token marks a span of bytes in the document text as a token
// or word, and carries the per-token analysis (tag, category, dependency
// head and label, break level).
message Token {
  // Token word form.
  required string word = 1;

  // Start position of token in text. Together with `end` this delimits the
  // span covered by the token.
  required int32 start = 2;

  // End position of token in text. Gives index of last byte, not one past
  // the last byte (i.e. the bound is inclusive). If token came from lexer,
  // excludes any trailing HTML tags.
  required int32 end = 3;

  // Head of this token in the dependency tree: the id of the token which has an
  // arc going to this one. If it is the root token of a sentence, then it is
  // set to -1. Because the default is also -1, a token whose head was never
  // set reads back with the same value as a root token; check field presence
  // if the two cases need to be told apart.
  // NOTE(review): the "id" is presumably the index of the head in
  // Sentence.token -- confirm against the code that populates this field.
  optional int32 head = 4 [default = -1];

  // Part-of-speech tag for token.
  optional string tag = 5;

  // Coarse-grained word category for token.
  optional string category = 6;

  // Label for dependency relation between this token and its head.
  optional string label = 7;

  // Break level for tokens that indicates how it was separated from the
  // previous token in the text.
  enum BreakLevel {
    NO_BREAK = 0;         // No separation between tokens.
    SPACE_BREAK = 1;      // Tokens separated by space.
    LINE_BREAK = 2;       // Tokens separated by line break.
    SENTENCE_BREAK = 3;   // Tokens separated by sentence break.
  }

  // How this token was separated from the previous one. Reads as SPACE_BREAK
  // when the field has not been set.
  optional BreakLevel break_level = 8 [default = SPACE_BREAK];

  // Field numbers from 1000 upward are left open so that other definitions
  // can attach additional fields to Token via `extend Token { ... }`.
  extensions 1000 to max;
}

// Stores information about the morphology of a token. The message is attached
// to a Token through the `morphology` extension declared below rather than
// through a regular Token field.
message TokenMorphology {
  // Registers `morphology` as an optional extension field on Token. The field
  // number lies inside the extension range that Token declares
  // (`extensions 1000 to max`).
  extend Token {
    optional TokenMorphology morphology = 63949837;
  }

  // Morphology is represented by a set of attribute values. Each attribute is
  // a name/value pair, and both parts must be set.
  message Attribute {
    // Name of the morphological attribute.
    required string name = 1;

    // Value assigned to the attribute named by `name`.
    required string value = 2;
  }

  // This attribute field is designated to hold a single disambiguated analysis.
  repeated Attribute attribute = 3;
}