# SV_preprocess_gpt.sh

  # Runs tools/preprocess_data.py on one JSON corpus file, using a GPT-2 BPE
  # vocab/merges pair, and writes the result under the OUTPUT_PATH prefix.
  #
  # All ./-relative paths (and tools/preprocess_data.py itself) are resolved
  # against the current working directory, so run this from the directory that
  # contains both tools/ and SVdata/.
  # NOTE(review): presumably this is the Megatron-LM preprocessing tool -- confirm.

  # Input corpus (absolute path).
  INPUT_JSON_FILE=/workspace/SVdata/raw/json/79803/SV_CC100Sprakbank.json

  # Prefix for the generated output files.
  #OUTPUT_PATH=./SVdata/gpt2bpe/SV_GPT3_56kvocab_CC100Sprakbank
  OUTPUT_PATH=./SVdata/gpt2bpe/SV_GPT3_56kvocab_CC100Sprakbank

  # Tokenizer assets handed to --vocab-file / --merge-file.
  VOCAB_FILE=./SVdata/gpt2bpe/56k/vocab.json
  MERGE_FILE=./SVdata/gpt2bpe/56k/merges.txt

  # Value passed to --workers.
  NUM_CPUS=1

  # Expansions are quoted so that paths containing spaces or glob characters
  # reach the tool as single, unmodified arguments.
  python tools/preprocess_data.py \
    --input "$INPUT_JSON_FILE" \
    --output-prefix "$OUTPUT_PATH" \
    --json-keys text \
    --vocab-file "$VOCAB_FILE" \
    --merge-file "$MERGE_FILE" \
    --dataset-impl mmap \
    --tokenizer-type GPT2BPETokenizer \
    --workers "$NUM_CPUS" \
    --append-eod