#!/usr/bin/env bash
# SV_preprocess_gpt.sh
# Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
#
# Tokenizes a JSON corpus with a GPT-2 BPE vocabulary by invoking
# tools/preprocess_data.py. The "text" key of the input JSON is processed, the
# result is written with the mmap dataset implementation under OUTPUT_PATH
# (used as an output prefix), and an end-of-document token is appended.
#
# Usage: run from the directory that contains tools/preprocess_data.py; the
# tool path and the ./SVdata paths below are relative to the working directory.
# NOTE(review): tools/preprocess_data.py is presumably the Megatron-LM
# preprocessing tool -- confirm against the repository this script ships with.

set -euo pipefail

# Source corpus; only the "text" JSON key is passed to the tool (--json-keys).
INPUT_JSON_FILE=/workspace/SVdata/raw/json/79803/SV_CC100Sprakbank.json
# Prefix handed to --output-prefix.
OUTPUT_PATH=./SVdata/gpt2bpe/SV_GPT3_56kvocab_CC100Sprakbank
# GPT-2 BPE tokenizer assets.
VOCAB_FILE=./SVdata/gpt2bpe/56k/vocab.json
MERGE_FILE=./SVdata/gpt2bpe/56k/merges.txt
# Value passed to --workers.
NUM_CPUS=1

# Fail fast with a clear message instead of letting the Python tool start and
# then crash on a missing file.
for required_file in "$INPUT_JSON_FILE" "$VOCAB_FILE" "$MERGE_FILE"; do
  if [[ ! -f "$required_file" ]]; then
    printf 'error: required file not found: %s\n' "$required_file" >&2
    exit 1
  fi
done

# Every expansion is quoted so that paths containing spaces or glob characters
# are passed to the tool as single arguments.
python tools/preprocess_data.py \
  --input "$INPUT_JSON_FILE" \
  --output-prefix "$OUTPUT_PATH" \
  --json-keys text \
  --vocab-file "$VOCAB_FILE" \
  --merge-file "$MERGE_FILE" \
  --dataset-impl mmap \
  --tokenizer-type GPT2BPETokenizer \
  --workers "$NUM_CPUS" \
  --append-eod