trainGPTTokenizer.py

# -*- coding: utf-8 -*-
import argparse
import os

from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--infile', default=None, type=str, help='path to the input txt file')
    parser.add_argument('--bpe_path', default=None, type=str, help='output directory for the trained GPT BPE files')
    parser.add_argument('--load_pretrained', action='store_true', help='load a pretrained GPT BPE model')
    parser.add_argument('--pretrained_gpt_dir', default=None, type=str, help='path to the pretrained GPT vocab and merges files, default None')
    parser.add_argument('--vocab_size', default=None, type=int,
                        help='vocab size when training HF GPT BPE for your own language, usually 16k/32k/48k/64k')
    args = parser.parse_args()

    tokenizer = Tokenizer(models.BPE())

    # Optionally start from the pretrained English GPT-2 vocab and merges.
    if args.load_pretrained:
        if args.pretrained_gpt_dir is not None:
            print("loading GPT-2 BPE English vocab and merges\n")
            vocab_file = os.path.join(args.pretrained_gpt_dir, 'gpt2-vocab.json')
            merge_file = os.path.join(args.pretrained_gpt_dir, 'gpt2-merges.txt')
            tokenizer.model = models.BPE.from_file(vocab_file, merge_file)
        else:
            print("please provide the path to the pretrained GPT vocab and merges files!")

    # Byte-level pre-tokenization and decoding, as used by GPT-2.
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = ByteLevelDecoder()

    print("including the minimal special token <|endoftext|>")
    special_tokens = ["<|endoftext|>"]

    # Set the training hyperparameters
    trainer = trainers.BpeTrainer(
        vocab_size=args.vocab_size,
        special_tokens=special_tokens,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
    )

    # Train it with either files or an iterator; here a single text file is used.
    tokenizer.train([args.infile], trainer=trainer)
    print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    # The generated vocab and merges files appear in the output directory.
    print("saving trained BPE model to :", args.bpe_path)
    tokenizer.model.save(args.bpe_path)
    print("model saved!\n\n\n")

    print("testing ...\n\n\n")
    # Swedish sample sentence: "Has anyone wondered why the indoor-temperature
    # curve is not visible in the graph? Has anyone asked Thermia? Wouldn't it
    # be useful to be able to check its history when tuning the curve?"
    test_txt = "Har någon funderat på varför man inte får inomhusteperaturens kurva synlig i grafen? Är det någon som frågat Thermia? Skulle det inte vara väsentligt att kunna kolla historiken på den då man skall ställa in kurvan?"
    output = tokenizer.encode(test_txt)
    print(output.tokens)
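
A typical invocation would be something like `python trainGPTTokenizer.py --infile corpus.txt --bpe_path bpe_out --vocab_size 32000`, where the corpus file and output directory names are placeholders. The sketch below, assuming that `tokenizer.model.save()` wrote `vocab.json` and `merges.txt` into that (hypothetical) `bpe_out` directory, shows how the trained BPE files can be reloaded for later use:

# -*- coding: utf-8 -*-
# Minimal sketch: reload the trained GPT BPE files produced by the script above.
# "bpe_out" is a hypothetical output directory; adjust to your --bpe_path value.
from tokenizers import Tokenizer, models, pre_tokenizers
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

tokenizer = Tokenizer(models.BPE.from_file('bpe_out/vocab.json', 'bpe_out/merges.txt'))
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

encoded = tokenizer.encode("Har någon funderat på varför?")
print(encoded.tokens)                  # byte-level BPE tokens
print(tokenizer.decode(encoded.ids))   # decodes back to the original text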