binary_segment_state.h 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. /* Copyright 2016 Google Inc. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #ifndef SYNTAXNET_BINARY_SEGMENT_STATE_H_
  13. #define SYNTAXNET_BINARY_SEGMENT_STATE_H_
  14. #include "syntaxnet/parser_state.h"
  15. #include "syntaxnet/parser_transitions.h"
  16. namespace syntaxnet {
  17. class Sentence;
  18. // Parser state for binary segmentation transition system. The input of the
  19. // system is a sequence of utf8 characters that are to be segmented into tokens.
  20. // The system contains two types of transitions/actions:
  21. // -START: the token at input is the first character of a new word.
  22. // -MERGE: the token at input is to be merged with its previous token.
  23. //
  24. // A BinarySegmentState is used to store segmentation histories that can be used
  25. // as features. In addition, it also provides the functionality to add
  26. // segmentation results to the document. The function assumes that sentences in
  27. // a document are processed in left-to-right order. See also the comments of
  28. // the FinishDocument function for explanation.
  29. //
  30. // Note on spaces:
  31. // Spaces, or more generally break-characters, should never be any part of a
  32. // word, and the START/MERGE of spaces would be ignored. In addition, if a space
  33. // starts a new word, then the actual first char of that word is the first
  34. // non-space token following the space.
  35. // Some examples:
  36. // -chars: ' ' A B
  37. // -tags: S M M
  38. // -result: 'AB'
  39. //
  40. // -chars: A ' ' B
  41. // -tags: S M M
  42. // -result: 'AB'
  43. //
  44. // -chars: A ' ' B
  45. // -tags: S S M
  46. // -result: 'AB'
  47. //
  48. // -chars: A B ' '
  49. // -tags: S S M
  50. // -result: 'A', 'B'
  51. class BinarySegmentState : public ParserTransitionState {
  52. public:
  53. ParserTransitionState *Clone() const override;
  54. void Init(ParserState *state) override {}
  55. // Returns the number of start tokens that have already been identified. In
  56. // other words, number of start tokens between the first token of the sentence
  57. // and state.Input(), with state.Input() excluded.
  58. static int NumStarts(const ParserState &state) {
  59. return state.StackSize();
  60. }
  61. // Returns the index of the k-th most recent start token.
  62. static int LastStart(int k, const ParserState &state) {
  63. DCHECK_GE(k, 0);
  64. DCHECK_LT(k, NumStarts(state));
  65. return state.Stack(k);
  66. }
  67. // Adds the token at given index as a new start token.
  68. static void AddStart(int index, ParserState *state) {
  69. state->Push(index);
  70. }
  71. // Adds segmentation results to the given sentence.
  72. void AddParseToDocument(const ParserState &state,
  73. bool rewrite_root_labels,
  74. Sentence *sentence) const override;
  75. // Whether a parsed token should be considered correct for evaluation.
  76. bool IsTokenCorrect(const ParserState &state, int index) const override {
  77. return true;
  78. }
  79. // Returns a human readable string representation of this state.
  80. string ToString(const ParserState &state) const override;
  81. };
  82. } // namespace syntaxnet
  83. #endif // SYNTAXNET_BINARY_SEGMENT_STATE_H_