# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import sys
import time

import ftfy
from langdetect import detect

from tokenizer import Tokenizer
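
# NOTE: Tokenizer is assumed to be the project-local GPT-2 BPE wrapper
# (a tokenizer.py next to this script); cache_dir is assumed to be where it
# keeps its vocabulary files.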

# Documents with fewer tokens than this are dropped.
MIN_DOCUMENT_LENGTH = 128


def print_progress(prefix, start_time, num_docs, num_fixed_text,
                   num_non_english_docs, chars_non_english_docs,
                   num_small_docs, chars_small_docs):
    string = prefix + ' | '
    string += 'elapsed time: {:.2f} | '.format(time.time() - start_time)
    string += 'documents: {} | '.format(num_docs)
    string += 'fixed text: {} | '.format(num_fixed_text)
    string += 'non-english: {} | '.format(num_non_english_docs)
    string += 'non-english chars: {} | '.format(chars_non_english_docs)
    string += 'small docs: {} | '.format(num_small_docs)
    string += 'small docs chars: {}'.format(chars_small_docs)
    print(string, flush=True)
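
# A progress line printed above looks like (values illustrative):
# [PROGRESS] | elapsed time: 12.34 | documents: 10000 | fixed text: 42 | ...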


def filter_corpus(filename, out_filename, print_interval=10000):
    print(' > filtering {}'.format(filename))
    tokenizer = Tokenizer(cache_dir='./cache')
    num_docs = 0
    num_written_docs = 0
    num_small_docs = 0
    num_fixed_text = 0
    num_non_english_docs = 0
    chars_non_english_docs = 0
    chars_small_docs = 0
    start_time = time.time()
    with open(out_filename, 'wb') as f:
        with open(filename, 'r') as fin:
            for line in fin:
                try:
                    num_docs += 1
                    myjson = json.loads(line)
                    # Repair mojibake / broken unicode in the text.
                    text = ftfy.fix_text(myjson['text'])
                    if text != myjson['text']:
                        num_fixed_text += 1
                    myjson['text'] = text
                    # Skip documents that are not in English.
                    if detect(text) != 'en':
                        print('[non-english text]', myjson)
                        num_non_english_docs += 1
                        chars_non_english_docs += len(text)
                        continue
                    # Tokens average roughly 5 characters, so 8 characters
                    # per token is a safe upper bound: only documents short
                    # enough to possibly fall below the minimum need to be
                    # tokenized and counted exactly.
                    if len(text) < (8 * MIN_DOCUMENT_LENGTH):
                        tokens = tokenizer.tokenize_document(text)
                        if len(tokens) < MIN_DOCUMENT_LENGTH:
                            print('[small document, skipping]:', myjson)
                            num_small_docs += 1
                            chars_small_docs += len(text)
                            continue
                    myjson = json.dumps(myjson, ensure_ascii=False)
                    f.write(myjson.encode('utf-8'))
                    f.write('\n'.encode('utf-8'))
                    num_written_docs += 1
                    if num_docs % print_interval == 0:
                        print_progress('[PROGRESS]', start_time, num_docs,
                                       num_fixed_text, num_non_english_docs,
                                       chars_non_english_docs,
                                       num_small_docs, chars_small_docs)
                except Exception as e:
                    print('    skipping ', line, e)

    print_progress('[FINAL]', start_time, num_docs,
                   num_fixed_text, num_non_english_docs,
                   chars_non_english_docs,
                   num_small_docs, chars_small_docs)
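
# Expected input: loose JSON, one document per line, each object carrying at
# least a 'text' field; any other keys on the object are written through to
# the output unchanged.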


if __name__ == '__main__':
    print('building gpt2 dataset ...')
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    print('will be reading {}'.format(input_filename))
    print('and will write the results to {}'.format(output_filename))
    filter_corpus(input_filename, output_filename)
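
# Usage (script and file names are illustrative):
#   python cleanup_dataset.py raw_corpus.json filtered_corpus.json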