trainGPTTokenizer.py

# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
# -*- coding: utf-8 -*-
import argparse

from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--infile', default=None, type=str,
                        help='path to the training txt file')
    parser.add_argument('--bpe_path', default=None, type=str,
                        help='output directory for the trained GPTBPE model')
    parser.add_argument('--load_pretrained', action='store_true',
                        help='load a pretrained GPTBPE model before training')
    parser.add_argument('--pretrained_gpt_dir', default=None, type=str,
                        help='path to pretrained GPT vocab and merge files, default None')
    parser.add_argument('--vocab_size', default=None, type=int,
                        help='vocab size when training HF GPTBPE for your own language, usually 16k/32k/48k/64k')
    args = parser.parse_args()
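
    # Example invocation (illustrative only; file names and vocab size below are not from the original script):
    #   python trainGPTTokenizer.py --infile my_corpus.txt --bpe_path ./bpe_out --vocab_size 32000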
    tokenizer = Tokenizer(models.BPE())
    if args.load_pretrained:
        if args.pretrained_gpt_dir is not None:
            print("loading pretrained GPT-2 BPE English vocab and merges\n")
            vocab_file = args.pretrained_gpt_dir + '/gpt2-vocab.json'
            merge_file = args.pretrained_gpt_dir + '/gpt2-merges.txt'
            tokenizer.model = models.BPE.from_file(vocab_file, merge_file)
        else:
            print("please provide the path to the pretrained GPT vocab and merge files!")

    # Byte-level pre-tokenizer and decoder, as used by GPT-2
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = ByteLevelDecoder()
  32. print("include minimal special token end of text ")
  33. special_tokens= ["<|endoftext|>"]
  34. # Set the training hyperparameters
  35. trainer = trainers.BpeTrainer(
  36. vocab_size=args.vocab_size,
  37. special_tokens=special_tokens,
  38. initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
  39. )
    # Train with either files or an iterator; here a single input file is used
    tokenizer.train([args.infile], trainer=trainer)
    print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    # The generated vocab.json and merges.txt will appear in the output directory
    print("saving trained BPE model to:", args.bpe_path)
    tokenizer.model.save(args.bpe_path)
    print("model saved!\n\n\n")
  47. print("testing ...\n\n\n")
  48. test_txt="Har någon funderat på varför man inte får inomhusteperaturens kurva synlig i grafen? Är det någon som frågat Thermia? Skulle det inte vara väsentligt att kunna kolla historiken på den då man skall ställa in kurvan?"
  49. output = tokenizer.encode(test_txt)
  50. print(output.tokens)
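
For context, a minimal sketch of how the saved model could be reloaded and used afterwards, assuming the script was run with --bpe_path pointing at a writable directory (the ./bpe_out path below is hypothetical) so that vocab.json and merges.txt were written there:

# Sketch: reload the trained GPTBPE model and tokenize a sentence.
# Assumes ./bpe_out/vocab.json and ./bpe_out/merges.txt exist (hypothetical output path).
from tokenizers import Tokenizer, models, pre_tokenizers
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

tokenizer = Tokenizer(models.BPE.from_file('./bpe_out/vocab.json', './bpe_out/merges.txt'))
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

encoded = tokenizer.encode("Har någon funderat på varför man inte får kurvan synlig i grafen?")
print(encoded.tokens)                  # byte-level BPE tokens
print(tokenizer.decode(encoded.ids))   # round-trips back to the original text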