radu
/
TensorFlow-Models


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
							/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef SYNTAXNET_BINARY_SEGMENT_STATE_H_
#define SYNTAXNET_BINARY_SEGMENT_STATE_H_

#include "syntaxnet/parser_state.h"
#include "syntaxnet/parser_transitions.h"

namespace syntaxnet {

class Sentence;

// Parser state for binary segmentation transition system. The input of the
// system is a sequence of utf8 characters that are to be segmented into tokens.
// The system contains two types of transitions/actions:
//  -START: the token at input is the first character of a new word.
//  -MERGE: the token at input is to be merged with its previous token.
//
// A BinarySegmentState is used to store segmentation histories that can be used
// as features. In addition, it also provides the functionality to add
// segmentation results to the document. The function assumes that sentences in
// a document are processed in left-to-right order. See also the comments of
// the FinishDocument function for explanation.
//
// Note on spaces:
// Spaces, or more generally break-characters, should never be any part of a
// word, and the START/MERGE of spaces would be ignored. In addition, if a space
// starts a new word, then the actual first char of that word is the first
// non-space token following the space.
// Some examples:
//  -chars:  ' ' A B
//  -tags:    S  M M
//  -result: 'AB'
//
//  -chars:  A ' ' B
//  -tags:   S  M  M
//  -result: 'AB'
//
//  -chars:  A ' ' B
//  -tags:   S  S  M
//  -result: 'AB'
//
//  -chars:  A  B  ' '
//  -tags:   S  S  M
//  -result: 'A', 'B'
class BinarySegmentState : public ParserTransitionState {
 public:
  // Overrides ParserTransitionState::Clone(); the definition lives outside
  // this header.
  ParserTransitionState *Clone() const override;

  // Intentionally empty: this override performs no work on the given state.
  void Init(ParserState *state) override {}

  // Counts the start tokens identified so far, i.e. the start tokens lying
  // between the first token of the sentence and state.Input(), not counting
  // state.Input() itself. Start tokens are recorded on the parser stack (see
  // AddStart), so the count is simply the current stack size.
  static int NumStarts(const ParserState &state) {
    const int start_count = state.StackSize();
    return start_count;
  }

  // Looks up the index of the k-th most recent start token by reading entry k
  // of the parser stack. Callers must pass 0 <= k < NumStarts(state); both
  // bounds are verified with DCHECKs.
  static int LastStart(int k, const ParserState &state) {
    DCHECK_GE(k, 0);
    DCHECK_LT(k, NumStarts(state));
    const int start_index = state.Stack(k);
    return start_index;
  }

  // Registers the token at the given index as a new start token by pushing
  // that index onto the parser stack.
  static void AddStart(int index, ParserState *state) {
    state->Push(index);
  }

  // Writes the segmentation results into the given sentence. Declared here,
  // defined outside this header.
  void AddParseToDocument(const ParserState &state,
                          bool rewrite_root_labels,
                          Sentence *sentence) const override;

  // Reports whether the parsed token at the given index should be considered
  // correct for evaluation. This state answers true unconditionally.
  bool IsTokenCorrect(const ParserState &state, int index) const override {
    return true;
  }

  // Produces a human readable string representation of this state. Declared
  // here, defined outside this header.
  string ToString(const ParserState &state) const override;
};

}  // namespace syntaxnet

#endif  // SYNTAXNET_BINARY_SEGMENT_STATE_H_