// K-best part-of-speech and dependency annotations for tokens.

syntax = "proto2";

import "syntaxnet/sentence.proto";

package syntaxnet;

// A list of alternative (k-best) syntax analyses, grouped by sentences.
message KBestSyntaxAnalyses {
  // Declares this message as an extension field (number 60366242) of the
  // Sentence message, so that k-best analyses can be attached to a Sentence.
  // Sentence is not defined in this file; presumably it comes from the
  // imported syntaxnet/sentence.proto.
  extend Sentence {
    optional KBestSyntaxAnalyses extension = 60366242;
  }

  // Alternative analyses for each sentence. Sentences are listed in the
  // order visited by a SentenceIterator.
  repeated KBestSyntaxAnalysesForSentence sentence = 1;

  // Alternative analyses for each token.
  // NOTE(review): the ordering of the entries is not specified here;
  // presumably they follow token order -- confirm.
  repeated KBestSyntaxAnalysesForToken token = 2;
}

// A list of alternative (k-best) analyses for a sentence spanning from a start
// token index to an end token index. The alternative analyses are ordered by
// decreasing model score from best to worst. The first analysis is the 1-best
// analysis, which is typically also stored in the document tokens.
message KBestSyntaxAnalysesForSentence {
  // Index of the first token of the sentence. Reads as -1 when unset.
  optional int32 start = 1 [default = -1];

  // Index of the last token of the sentence. Reads as -1 when unset.
  // NOTE(review): "last token" suggests the index is inclusive -- confirm.
  optional int32 end = 2 [default = -1];

  // K-best analyses for the tokens in this sentence.
  //
  // All of the analyses in the list have the same "type"; e.g., k-best
  // taggings, k-best {tagging+parse}s, etc. The type of analysis stored in
  // this list can change depending on where we are in the document processing
  // pipeline; e.g., it may initially hold taggings and later switch to parses.
  //
  // The first token_analysis is the 1-best analysis, which is typically also
  // stored in the document. Note: some post-processors will update the
  // document's syntax trees, but will leave these analyses unchanged.
  repeated AlternativeTokenAnalysis token_analysis = 3;
}

// A list of scored alternative (k-best) analyses for a particular token. These
// are all distinct from each other and ordered by decreasing model score. The
// first is the 1-best analysis, which may or may not match the document tokens
// depending on how the k-best analyses are selected.
message KBestSyntaxAnalysesForToken {
  // The alternative analyses of one token: every entry in this repeated field
  // refers to the same token. Because only one token is described, each
  // entry contains a single element in its repeated fields (such as head,
  // tag, category and label).
  //
  // This is the only field of the message and it uses field number 3;
  // numbers 1 and 2 are not declared here.
  // NOTE(review): if 1 and 2 were used by since-removed fields, they should
  // be marked `reserved` -- confirm against history.
  repeated AlternativeTokenAnalysis token_analysis = 3;
}

// An alternative analysis of tokens in the document. The repeated fields
// are indexed relative to the beginning of a sentence. Fields not
// represented in the alternative analysis are assumed to be unchanged.
// Currently only alternatives for tags, categories and (labeled) dependency
// heads are supported.
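// Each repeated field should either be empty (length=0) or have one entry per
// token covered by the analysis: the number of tokens in the sentence, or a
// single entry when the analysis describes one token (as in
// KBestSyntaxAnalysesForToken).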
message AlternativeTokenAnalysis {
  // The four repeated fields below are parallel lists: entry i of each
  // non-empty list describes the same token.

  // Head of each token in the dependency tree: the id of the token which has
  // an arc going to this one. If it is the root token of a sentence, then it
  // is set to -1.
  // NOTE(review): whether head ids are sentence-relative or document-relative
  // is not stated here -- confirm against the consumer.
  repeated int32 head = 1;

  // Part-of-speech tag for each token.
  repeated string tag = 2;

  // Coarse-grained word category for each token.
  repeated string category = 3;

  // Label for the dependency relation between each token and its head.
  repeated string label = 4;

  // The score of this analysis as a whole (a single value, not per token).
  // Bigger values typically indicate better quality, but there are no
  // guarantees and there is also no pre-defined range.
  optional double score = 5;
}