| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
// Protocol buffer specification for document analysis.
//
// Uses proto2 syntax: the messages in this file rely on proto2-only features
// (required fields, explicit defaults and extension ranges).
syntax = "proto2";

package syntaxnet;
// A Sentence contains the raw text contents of a sentence, as well as an
// analysis of that text in the form of a token sequence.
message Sentence {
  // Identifier for the document.
  optional string docid = 1;

  // Raw text contents of the sentence.
  optional string text = 2;

  // Tokenization of the sentence, one entry per token.
  repeated Token token = 3;

  // Field numbers from 1000 upwards are set aside for extensions declared
  // outside this message.
  extensions 1000 to max;
}
// A document token marks a span of bytes in the document text as a token
// or word.
//
// NOTE(review): word, start and end are declared `required`; that label is
// kept as-is because relaxing it changes the contract for existing readers.
message Token {
  // Token word form.
  required string word = 1;

  // Start position of token in text.
  required int32 start = 2;

  // End position of token in text. Gives index of last byte, not one past
  // the last byte. If token came from lexer, excludes any trailing HTML tags.
  required int32 end = 3;

  // Head of this token in the dependency tree: the id of the token which has
  // an arc going to this one. If it is the root token of a sentence, then it
  // is set to -1, which is also the default when the field is unset.
  optional int32 head = 4 [default = -1];

  // Part-of-speech tag for token.
  optional string tag = 5;

  // Coarse-grained word category for token.
  optional string category = 6;

  // Label for dependency relation between this token and its head.
  optional string label = 7;

  // Break level for tokens that indicates how it was separated from the
  // previous token in the text.
  enum BreakLevel {
    // No separation between tokens.
    NO_BREAK = 0;
    // Tokens separated by space.
    SPACE_BREAK = 1;
    // Tokens separated by line break.
    LINE_BREAK = 2;
    // Tokens separated by sentence break.
    SENTENCE_BREAK = 3;
  }

  // How this token was separated from the previous one. Defaults to
  // SPACE_BREAK when unset.
  optional BreakLevel break_level = 8 [default = SPACE_BREAK];

  // Field numbers from 1000 upwards are set aside for extensions declared
  // outside this message.
  extensions 1000 to max;
}
// Stores information about the morphology of a token.
message TokenMorphology {
  // Attaches a TokenMorphology to Token as an optional extension field.
  extend Token {
    optional TokenMorphology morphology = 63949837;
  }

  // Morphology is represented by a set of attribute values. Each attribute is
  // a name/value pair, and both parts must be set.
  message Attribute {
    // Name of the attribute.
    required string name = 1;
    // Value of the attribute.
    required string value = 2;
  }

  // This attribute field is designated to hold a single disambiguated
  // analysis.
  // NOTE(review): numbering starts at 3; no field numbered 1 or 2 is declared
  // in the visible definition. Confirm before assigning those numbers.
  repeated Attribute attribute = 3;
}
|