# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron tokenizers."""

from abc import ABC
from abc import abstractmethod

from .bert_tokenization import FullTokenizer as FullBertTokenizer
from .gpt2_tokenization import GPT2Tokenizer


def build_tokenizer(args):
    """Initialize tokenizer."""
    if args.rank == 0:
        print('> building {} tokenizer ...'.format(args.tokenizer_type),
              flush=True)

    # Select and instantiate the tokenizer.
    assert args.vocab_file is not None
    if args.tokenizer_type == 'BertWordPieceLowerCase':
        tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                            lower_case=True,
                                            vocab_extra_ids=args.vocab_extra_ids)
    elif args.tokenizer_type == 'BertWordPieceCase':
        tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                            lower_case=False,
                                            vocab_extra_ids=args.vocab_extra_ids)
    elif args.tokenizer_type == 'GPT2BPETokenizer':
        assert args.merge_file is not None
        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
    else:
        raise NotImplementedError('{} tokenizer is not '
                                  'implemented.'.format(args.tokenizer_type))

    # Add vocab size.
    args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size,
                                                      args)

    return tokenizer
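
# Illustrative usage sketch (not part of the original module): build_tokenizer
# expects an argparse-style namespace. The attribute names below are the ones
# read in this file; the file paths are hypothetical placeholders.
#
#     from types import SimpleNamespace
#     args = SimpleNamespace(rank=0,
#                            tokenizer_type='GPT2BPETokenizer',
#                            vocab_file='gpt2-vocab.json',
#                            merge_file='gpt2-merges.txt',
#                            vocab_extra_ids=0,
#                            make_vocab_size_divisible_by=128,
#                            tensor_model_parallel_size=1)
#     tokenizer = build_tokenizer(args)  # also sets args.padded_vocab_size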


def _vocab_size_with_padding(orig_vocab_size, args):
    """Pad the vocab size so that it is divisible by the model parallel size
    and still has a GPU-friendly size."""

    after = orig_vocab_size
    multiple = args.make_vocab_size_divisible_by * \
        args.tensor_model_parallel_size
    while (after % multiple) != 0:
        after += 1
    if args.rank == 0:
        print(' > padded vocab (size: {}) with {} dummy tokens '
              '(new size: {})'.format(
                  orig_vocab_size, after - orig_vocab_size, after), flush=True)
    return after
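
# Worked example (illustrative numbers): with make_vocab_size_divisible_by=128
# and tensor_model_parallel_size=8, the padding multiple is 128 * 8 = 1024, so
# an original vocab size of 50257 is padded up to 51200 (50 * 1024), i.e. 943
# dummy tokens are added.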


class AbstractTokenizer(ABC):
    """Abstract class for tokenizer."""

    def __init__(self, name):
        self.name = name
        super().__init__()

    @property
    @abstractmethod
    def vocab_size(self):
        pass

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token."""
        pass

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        pass

    @abstractmethod
    def tokenize(self, text):
        pass

    def detokenize(self, token_ids):
        raise NotImplementedError('detokenizer is not implemented for {} '
                                  'tokenizer'.format(self.name))

    @property
    def cls(self):
        raise NotImplementedError('CLS is not provided for {} '
                                  'tokenizer'.format(self.name))

    @property
    def sep(self):
        raise NotImplementedError('SEP is not provided for {} '
                                  'tokenizer'.format(self.name))

    @property
    def pad(self):
        raise NotImplementedError('PAD is not provided for {} '
                                  'tokenizer'.format(self.name))

    @property
    def eod(self):
        raise NotImplementedError('EOD is not provided for {} '
                                  'tokenizer'.format(self.name))

    @property
    def mask(self):
        raise NotImplementedError('MASK is not provided for {} '
                                  'tokenizer'.format(self.name))


class _BertWordPieceTokenizer(AbstractTokenizer):
    """Original BERT wordpiece tokenizer."""

    def __init__(self, vocab_file, lower_case=True, vocab_extra_ids=0):
        if lower_case:
            name = 'BERT Lower Case'
        else:
            name = 'BERT Upper Case'
        super().__init__(name)
        self.tokenizer = FullBertTokenizer(vocab_file, do_lower_case=lower_case)
        self.cls_id = self.tokenizer.vocab['[CLS]']
        self.sep_id = self.tokenizer.vocab['[SEP]']
        self.pad_id = self.tokenizer.vocab['[PAD]']
        self.mask_id = self.tokenizer.vocab['[MASK]']
        self._additional_special_tokens = []

        # (dsachan) Add BOS and EOS tokens
        SPECIAL_TOKENS = {'eos_token': '[EOS]',
                          'bos_token': '[BOS]'}
        self._bos_token = '[BOS]'
        self.add_token(self._bos_token)
        self._bos_token_id = self.vocab.get(self._bos_token)

        self._eos_token = '[EOS]'
        self.add_token(self._eos_token)
        self._eos_token_id = self.vocab.get(self._eos_token)

        # (dsachan) Add additional special tokens
        # These can be used as sentinel tokens in T5 model inputs
        additional_special_tokens = []
        additional_special_tokens.extend(
            ["<extra_id_{}>".format(i) for i in range(vocab_extra_ids)])
        self.add_additional_special_tokens(additional_special_tokens)

    def add_token(self, token):
        if token not in self.vocab:
            self.inv_vocab[self.vocab_size] = token
            # self.vocab_size comes from len(vocab)
            # and it will increase as we add elements
            self.vocab[token] = self.vocab_size

    def add_additional_special_tokens(self, tokens_list):
        setattr(self, "additional_special_tokens", tokens_list)
        for value in tokens_list:
            self.add_token(value)

    @property
    def vocab_size(self):
        return self.tokenizer.vocab_size()

    @property
    def vocab(self):
        return self.tokenizer.vocab

    @property
    def inv_vocab(self):
        return self.tokenizer.inv_vocab

    def tokenize(self, text):
        text_tokens = self.tokenizer.tokenize(text)
        return self.tokenizer.convert_tokens_to_ids(text_tokens)

    def decode(self, ids):
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        return self.tokenizer.convert_tokens_to_string(tokens)

    def decode_token_ids(self, token_ids):
        tokens = self.tokenizer.convert_ids_to_tokens(token_ids)
        exclude_list = ['[PAD]', '[CLS]']
        non_pads = [t for t in tokens if t not in exclude_list]

        result = ""
        for s in non_pads:
            if s.startswith("##"):
                result += s[2:]
            else:
                result += " " + s

        return result
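
    # Illustrative example of decode_token_ids: for ids mapping to
    # ['[CLS]', 'token', '##izer', '[PAD]'], the '[CLS]' and '[PAD]' entries
    # are dropped and the WordPiece continuation '##izer' is merged onto the
    # previous piece, giving ' tokenizer' (note the leading space).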

    @property
    def cls(self):
        return self.cls_id

    @property
    def sep(self):
        return self.sep_id

    @property
    def pad(self):
        return self.pad_id

    @property
    def mask(self):
        return self.mask_id

    @property
    def bos_token(self):
        """ Beginning of sentence token """
        return self._bos_token

    @property
    def eos_token(self):
        """ End of sentence token """
        return self._eos_token

    @property
    def additional_special_tokens(self):
        """ All the additional special tokens you may want to use (list of strings)."""
        return self._additional_special_tokens

    @property
    def bos_token_id(self):
        """ Id of the beginning of sentence token in the vocabulary."""
        return self._bos_token_id

    @property
    def eos_token_id(self):
        """ Id of the end of sentence token in the vocabulary."""
        return self._eos_token_id

    @property
    def additional_special_tokens_ids(self):
        """ Ids of all the additional special tokens in the vocabulary (list of integers)."""
        return [self.vocab.get(token) for token in self._additional_special_tokens]

    @additional_special_tokens.setter
    def additional_special_tokens(self, value):
        self._additional_special_tokens = value
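
# Illustrative usage sketch (hypothetical vocab path, not part of the original
# module):
#
#     tokenizer = _BertWordPieceTokenizer('bert-vocab.txt', lower_case=True)
#     ids = [tokenizer.cls] + tokenizer.tokenize('Megatron tokenizers') + [tokenizer.sep]
#     text = tokenizer.decode_token_ids(ids)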


class _GPT2BPETokenizer(AbstractTokenizer):
    """Original GPT2 BPE tokenizer."""

    def __init__(self, vocab_file, merge_file):
        name = 'GPT2 BPE'
        super().__init__(name)

        self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
                                       special_tokens=[], max_len=None)
        self.eod_id = self.tokenizer.encoder['<|endoftext|>']

    @property
    def vocab_size(self):
        return len(self.tokenizer.encoder)

    @property
    def vocab(self):
        return self.tokenizer.encoder

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id
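
# Illustrative usage sketch (hypothetical file paths, not part of the original
# module): the GPT-2 BPE tokenizer round-trips text through tokenize/detokenize
# and exposes the '<|endoftext|>' id via the eod property.
#
#     tokenizer = _GPT2BPETokenizer('gpt2-vocab.json', 'gpt2-merges.txt')
#     ids = tokenizer.tokenize('Megatron tokenizers') + [tokenizer.eod]
#     text = tokenizer.detokenize(ids[:-1])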