# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Filter and clean documents.

Supported tasks: remove documents shorter than 512 characters, remove
documents shorter than 256 characters that mention javascript, remove
short non-English documents, fix text encoding with ftfy, and apply
dataset-specific cleaning (e.g. for the stories and realnews datasets).
See the program arguments below for details.
"""
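
# Example invocation (the script and file names here are hypothetical;
# each document is handled by the first matching task checked in
# process_doc below):
#
#   python cleanup_dataset.py \
#       --input-files web_docs.json news_docs.json \
#       --tasks remove_512 ftfy_fix_text \
#       --output-path cleaned_data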

import argparse
import json
import multiprocessing
import os
import re
import time
from functools import partial
from pathlib import Path

import ftfy
from langdetect import detect
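
# Note (an observation about the langdetect library, not part of the
# original logic): detect() is not deterministic across runs by default;
# for reproducible language filtering the seed can be pinned:
#
#   from langdetect import DetectorFactory
#   DetectorFactory.seed = 0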


def process_doc(json_line, args):
    """Apply the first matching task from args.tasks to one JSON document.

    Returns (output flags, possibly-modified text, document, to_filter).
    """
    # Read the line.
    document = json.loads(json_line)
    text = document['text']

    output = {'remove_512': False, 'remove_256_javascript': False,
              'remove_512_non_english': False, 'ftfy_fix_text': False,
              'general_cleaning': False}

    try:
        # Remove all docs with less than 512 characters.
        if "remove_512" in args.tasks:
            if len(text) < 512:
                output['remove_512'] = True
                return output, text, document, True

        # Remove docs with less than 256 characters that contain javascript.
        if "remove_256_javascript" in args.tasks:
            if len(text) < 256 and 'javascript' in text.lower():
                output['remove_256_javascript'] = True
                return output, text, document, True

        # Remove docs with less than 512 characters that are not English.
        if "remove_512_non_english" in args.tasks:
            if len(text) < 512 and detect(text) != 'en':
                output['remove_512_non_english'] = True
                return output, text, document, True

        # Fix the text using ftfy; don't remove the doc, hence return False.
        if "ftfy_fix_text" in args.tasks:
            fixed_text = ftfy.fix_text(text)
            output['ftfy_fix_text'] = True
            return output, fixed_text, document, False

        # General cleaning: collapse runs of spaces and word-boundary
        # newlines into a single space.
        if "general_cleaning" in args.tasks:
            cleaned_text = re.sub(r" +|\b\n+ |\b\n+", " ", text)
            # cleaned_text = re.sub(r"\n\n+", "\n\n", text)  # used for the Gutenberg dataset
            # cleaned_text = re.sub(r"\n", "\n\n", text)  # used for the realnews and stories datasets
            # cleaned_text = re.sub(r" \'", "'", text)
            # cleaned_text = re.sub(r" \!", "!", cleaned_text)
            # cleaned_text = re.sub(r" \.", ".", cleaned_text)
            # cleaned_text = re.sub(r" \?", "?", cleaned_text)
            # cleaned_text = re.sub(r" - ", "-", cleaned_text)
            # cleaned_text = re.sub(r"\" ", "\"", cleaned_text)
            # cleaned_text = re.sub(r" @ ", "@", cleaned_text)
            output['general_cleaning'] = True
            return output, cleaned_text, document, False

    except Exception as e:
        print('Error: *************************\n{}\ntext: {}'.format(
            e, text), flush=True)
        return output, text, document, True

    # No task matched; keep the document as is.
    return output, text, document, False
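

# A quick sketch of the per-document contract (the sample document is
# made up): a 100-character line such as
#
#   line = json.dumps({'text': 'x' * 100})
#
# processed with 'remove_512' in args.tasks comes back as
# (output, text, document, True) with output['remove_512'] set, so
# process_set below routes it to the filtered file, not the cleaned one.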


def process_set(args, input_file, output_f_cleaned, output_f_filtered):
    """Run process_doc over every line of input_file, splitting documents
    into a cleaned output file and a filtered output file."""
    print(' > working on {} ...'.format(input_file), flush=True)

    # Counters.
    num_docs = num_remove_512 = num_remove_java = 0
    num_remove_512_non_english = num_ftfy_fix_text = num_general_cleaning = 0

    # Output files.
    output_cleaned = open(output_f_cleaned, 'wb')
    output_filtered = open(output_f_filtered, 'wb')

    start_time = time.time()

    # Set up multiprocessing: imap streams results back in order, handing
    # each worker chunks of 500 lines at a time.
    num_workers = 40
    fin = open(input_file, 'r', encoding='utf-8')
    pool = multiprocessing.Pool(num_workers)
    process_doc_partial = partial(process_doc, args=args)
    processed_docs = pool.imap(process_doc_partial, fin, 500)

    # Process documents.
    for output, text, document, to_filter in processed_docs:
        num_docs += 1

        num_remove_512 += 1 if output['remove_512'] else 0
        num_remove_java += 1 if output['remove_256_javascript'] else 0
        num_remove_512_non_english += (1 if output['remove_512_non_english']
                                       else 0)
        num_ftfy_fix_text += 1 if output['ftfy_fix_text'] else 0
        num_general_cleaning += 1 if output['general_cleaning'] else 0

        document['text'] = text
        myjson = json.dumps(document, ensure_ascii=False)

        if to_filter:
            output_filtered.write(myjson.encode('utf-8'))
            output_filtered.write('\n'.encode('utf-8'))
        else:
            output_cleaned.write(myjson.encode('utf-8'))
            output_cleaned.write('\n'.encode('utf-8'))

        if num_docs % args.log_interval == 0:
            print('    processed {:9d} documents in {:.2f} seconds ...'.format(
                num_docs, time.time() - start_time), flush=True)

    # Close the files.
    output_cleaned.close()
    output_filtered.close()
    fin.close()

    # Print stats.
    print('  >> total docs: {} remove_512 {} remove_256_javascript {} '
          'remove_512_non_english {} ftfy_fix_text {} general_cleaning {}'
          .format(num_docs, num_remove_512, num_remove_java,
                  num_remove_512_non_english, num_ftfy_fix_text,
                  num_general_cleaning), flush=True)


if __name__ == '__main__':

    print('parsing the arguments ...')

    parser = argparse.ArgumentParser()
    parser.add_argument('--input-files', nargs='*', required=True,
                        help='Input JSON files that need to be cleaned')
    parser.add_argument('--tasks', nargs='*', required=True,
                        help='Tasks to perform on the input files, such as '
                             'remove_512, remove_256_javascript, '
                             'remove_512_non_english, ftfy_fix_text, and '
                             'general_cleaning. 256 and 512 refer to the '
                             'number of characters.')
    parser.add_argument('--output-path', type=str, default=None,
                        help='Directory where the output should go')
    parser.add_argument('--log-interval', type=int, default=100,
                        help='Log interval')
    args = parser.parse_args()

    print('cleanup dataset ...')

    for input_file in args.input_files:
        input_filename, input_filename_ext = os.path.splitext(
            Path(input_file).name)

        # e.g. docs.json -> <output-path>/docs_cleaned.json and
        # <output-path>/docs_filtered.json
        output_f_cleaned = os.path.join(args.output_path,
                                        input_filename + "_cleaned" +
                                        input_filename_ext)
        output_f_filtered = os.path.join(args.output_path,
                                         input_filename + "_filtered" +
                                         input_filename_ext)

        process_set(args, input_file, output_f_cleaned, output_f_filtered)

    print('done :-)', flush=True)