radu
/
TensorFlow-Models


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
							/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "syntaxnet/segmenter_utils.h"
#include "util/utf8/unicodetext.h"
#include "util/utf8/unilib.h"
#include "util/utf8/unilib_utf8_utils.h"

namespace syntaxnet {

// Separators, code Zs from http://www.unicode.org/Public/UNIDATA/PropList.txt
// NB: This list is not necessarily exhaustive.
const std::unordered_set<int> SegmenterUtils::kBreakChars({
  0x2028,  // line separator
  0x2029,  // paragraph separator
  0x0020,  // space
  0x00a0,  // no-break space
  0x1680,  // Ogham space mark
  0x180e,  // Mongolian vowel separator
  0x202f,  // narrow no-break space
  0x205f,  // medium mathematical space
  0x3000,  // ideographic space
  0xe5e5,  // Google addition
  0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008,
  0x2009, 0x200a
});

void SegmenterUtils::GetUTF8Chars(const string &text,
                                  vector<tensorflow::StringPiece> *chars) {
  const char *start = text.c_str();
  const char *end = text.c_str() + text.size();
  while (start < end) {
    int char_length = UniLib::OneCharLen(start);
    chars->emplace_back(start, char_length);
    start += char_length;
  }
}

void SegmenterUtils::SetCharsAsTokens(
    const string &text,
    const vector<tensorflow::StringPiece> &chars,
    Sentence *sentence) {
  sentence->clear_token();
  sentence->set_text(text);
  for (int i = 0; i < chars.size(); ++i) {
    Token *tok = sentence->add_token();
    tok->set_word(chars[i].ToString());  // NOLINT
    int start_byte, end_byte;
    GetCharStartEndBytes(text, chars[i], &start_byte, &end_byte);
    tok->set_start(start_byte);
    tok->set_end(end_byte);
  }
}

bool SegmenterUtils::IsValidSegment(const Sentence &sentence,
                                    const Token &token) {
  // Check that the token is not empty, both by string and by bytes.
  if (token.word().empty()) return false;
  if (token.start() > token.end()) return false;

  // Check token boudaries inside of text.
  if (token.start() < 0) return false;
  if (token.end() >= sentence.text().size()) return false;

  // Check that token string is valid UTF8, by bytes.
  const char s = sentence.text()[token.start()];
  const char e = sentence.text()[token.end() + 1];
  if (UniLib::IsTrailByte(s)) return false;
  if (UniLib::IsTrailByte(e)) return false;
  return true;
}

}  // namespace syntaxnet