import os

import fire
import sentencepiece as spm


def main(data_file, save_path, vocab_size=16_000, num_threads=8):
    # Ensure the output directory exists; the trained model is written as
    # <save_path>/tokenizer.model and <save_path>/tokenizer.vocab.
    os.makedirs(save_path, exist_ok=True)
    tokenizer_name = os.path.join(save_path, "tokenizer")

    spm.SentencePieceTrainer.train(
        input=data_file,
        model_prefix=tokenizer_name,
        vocab_size=vocab_size,
        num_threads=num_threads,
        model_type="bpe",
        # Effectively unlimited sentence length (1 GiB) so long lines
        # in the training data are not dropped.
        max_sentence_length=1073741824,
        shuffle_input_sentence="true",
        character_coverage=1.0,
        # Allow the final vocabulary to end up smaller than vocab_size.
        hard_vocab_limit="false",
    )


if __name__ == "__main__":
    fire.Fire(main)
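# A minimal usage sketch, assuming the script is saved as train_tokenizer.py
# (hypothetical name) and trained on a hypothetical corpus.txt; fire exposes
# the main() parameters as command-line flags:
#
#   python train_tokenizer.py --data_file corpus.txt --save_path out
#
# The trained model can then be loaded with SentencePiece's standard API:
#
#   import sentencepiece as spm
#   sp = spm.SentencePieceProcessor(model_file="out/tokenizer.model")
#   print(sp.encode("Hello, world!", out_type=str))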