parser_state.h 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. // Parser state for the transition-based dependency parser.
  13. #ifndef SYNTAXNET_PARSER_STATE_H_
  14. #define SYNTAXNET_PARSER_STATE_H_
  15. #include <string>
  16. #include <vector>
  17. #include "syntaxnet/utils.h"
  18. #include "syntaxnet/kbest_syntax.pb.h"
  19. #include "syntaxnet/parser_transitions.h"
  20. #include "syntaxnet/sentence.pb.h"
  21. namespace syntaxnet {
  22. class TermFrequencyMap;
  23. // A ParserState object represents the state of the parser during the parsing of
  24. // a sentence. The state consists of a pointer to the next input token and a
  25. // stack of partially processed tokens. The parser state can be changed by
  26. // applying a sequence of transitions. Some transitions also add relations
  27. // to the dependency tree of the sentence. The parser state also records the
  28. // (partial) parse tree for the sentence by recording the head of each token and
  29. // the label of this relation. The state is used for both training and parsing.
  30. class ParserState {
  31. public:
  32. // String representation of the root label.
  33. static const char kRootLabel[];
  34. // Default value for the root label in case it's not in the label map.
  35. static const int kDefaultRootLabel = -1;
  36. // Initializes the parser state for a sentence, using an additional transition
  37. // state for preprocessing and/or additional information specific to the
  38. // transition system. The transition state is allowed to be null, in which
  39. // case no additional work is performed. The parser state takes ownership of
  40. // the transition state. A label map is used for transforming between integer
  41. // and string representations of the labels.
  42. ParserState(Sentence *sentence,
  43. ParserTransitionState *transition_state,
  44. const TermFrequencyMap *label_map);
  45. // Deletes the parser state.
  46. ~ParserState();
  47. // Clones the parser state.
  48. ParserState *Clone() const;
  49. // Returns the root label.
  50. int RootLabel() const;
  51. // Returns the index of the next input token.
  52. int Next() const;
  53. // Returns the number of tokens in the sentence.
  54. int NumTokens() const { return num_tokens_; }
  55. // Returns the token index relative to the next input token. If no such token
  56. // exists, returns -2.
  57. int Input(int offset) const;
  58. // Advances to the next input token.
  59. void Advance();
  60. // Returns true if all tokens have been processed.
  61. bool EndOfInput() const;
  62. // Pushes an element to the stack.
  63. void Push(int index);
  64. // Pops the top element from stack and returns it.
  65. int Pop();
  66. // Returns the element from the top of the stack.
  67. int Top() const;
  68. // Returns the element at a certain position in the stack. Stack(0) is the top
  69. // stack element. If no such position exists, returns -2.
  70. int Stack(int position) const;
  71. // Returns the number of elements on the stack.
  72. int StackSize() const;
  73. // Returns true if the stack is empty.
  74. bool StackEmpty() const;
  75. // Returns the head index for a given token.
  76. int Head(int index) const;
  77. // Returns the label of the relation to head for a given token.
  78. int Label(int index) const;
  79. // Returns the parent of a given token 'n' levels up in the tree.
  80. int Parent(int index, int n) const;
  81. // Returns the leftmost child of a given token 'n' levels down in the tree. If
  82. // no such child exists, returns -2.
  83. int LeftmostChild(int index, int n) const;
  84. // Returns the rightmost child of a given token 'n' levels down in the tree.
  85. // If no such child exists, returns -2.
  86. int RightmostChild(int index, int n) const;
  87. // Returns the n-th left sibling of a given token. If no such sibling exists,
  88. // returns -2.
  89. int LeftSibling(int index, int n) const;
  90. // Returns the n-th right sibling of a given token. If no such sibling exists,
  91. // returns -2.
  92. int RightSibling(int index, int n) const;
  93. // Adds an arc to the partial dependency tree of the state.
  94. void AddArc(int index, int head, int label);
  95. // Returns the gold head index for a given token, based on the underlying
  96. // annotated sentence.
  97. int GoldHead(int index) const;
  98. // Returns the gold label for a given token, based on the underlying annotated
  99. // sentence.
  100. int GoldLabel(int index) const;
  101. // Get a reference to the underlying token at index. Returns an empty default
  102. // Token if accessing the root.
  103. const Token &GetToken(int index) const {
  104. if (index == -1) return kRootToken;
  105. return sentence().token(index);
  106. }
  107. // Annotates a document with the dependency relations built during parsing for
  108. // one of its sentences. If rewrite_root_labels is true, then all tokens with
  109. // no heads will be assigned the default root label "ROOT".
  110. void AddParseToDocument(Sentence *document, bool rewrite_root_labels) const;
  111. // As above, but uses the default of rewrite_root_labels = true.
  112. void AddParseToDocument(Sentence *document) const {
  113. AddParseToDocument(document, true);
  114. }
  115. // Whether a parsed token should be considered correct for evaluation.
  116. bool IsTokenCorrect(int index) const;
  117. // Returns the string representation of a dependency label, or an empty string
  118. // if the label is invalid.
  119. string LabelAsString(int label) const;
  120. // Returns a string representation of the parser state.
  121. string ToString() const;
  122. // Returns the underlying sentence instance.
  123. const Sentence &sentence() const { return *sentence_; }
  124. Sentence *mutable_sentence() const { return sentence_; }
  125. // Returns the transition system-specific state.
  126. const ParserTransitionState *transition_state() const {
  127. return transition_state_;
  128. }
  129. ParserTransitionState *mutable_transition_state() {
  130. return transition_state_;
  131. }
  132. // Gets/sets the flag which says that the state was obtained though gold
  133. // transitions only.
  134. bool is_gold() const { return is_gold_; }
  135. void set_is_gold(bool is_gold) { is_gold_ = is_gold; }
  136. private:
  137. // Empty constructor used for the cloning operation.
  138. ParserState() {}
  139. // Default value for the root token.
  140. const Token kRootToken;
  141. // Sentence to parse. Not owned.
  142. Sentence *sentence_ = nullptr;
  143. // Number of tokens in the sentence to parse.
  144. int num_tokens_;
  145. // Which alternative token analysis is used for tag/category/head/label
  146. // information. -1 means use default.
  147. int alternative_ = -1;
  148. // Transition system-specific state. Owned.
  149. ParserTransitionState *transition_state_ = nullptr;
  150. // Label map used for conversions between integer and string representations
  151. // of the dependency labels. Not owned.
  152. const TermFrequencyMap *label_map_ = nullptr;
  153. // Root label.
  154. int root_label_;
  155. // Index of the next input token.
  156. int next_;
  157. // Parse stack of partially processed tokens.
  158. vector<int> stack_;
  159. // List of head positions for the (partial) dependency tree.
  160. vector<int> head_;
  161. // List of dependency relation labels describing the (partial) dependency
  162. // tree.
  163. vector<int> label_;
  164. // Score of the parser state.
  165. double score_ = 0.0;
  166. // True if this is the gold standard sequence (used for structured learning).
  167. bool is_gold_ = false;
  168. TF_DISALLOW_COPY_AND_ASSIGN(ParserState);
  169. };
  170. } // namespace syntaxnet
  171. #endif // SYNTAXNET_PARSER_STATE_H_