cleanup_fix_dataset.py

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  15. """
  16. Filter and clean documents:
  17. Capable to clean docs with less than 512 characters, less than
  18. 256 characters and contains javascript, fix text and dataset specific
  19. cleaning like stories and realnews datasets.
  20. Program arguments have the details.
  21. """
import argparse
from functools import partial
import glob
import ftfy
import json
from langdetect import detect
import multiprocessing
import os
from pathlib import Path
import re
import time
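

# process_doc returns a 4-tuple (task_flags, text, document, to_filter):
# to_filter=True routes the document to the *_filtered output file, and
# task_flags records which task (if any) fired for the document.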
def process_doc(json_line, args):

    # Read the line.
    document = json.loads(json_line)
    text = document['text']

    output = {'remove_512': False, 'remove_256_javascript': False,
              'remove_512_non_english': False, 'ftfy_fix_text': False,
              'general_cleaning': False}

    try:
        # Remove all docs with less than 512 characters.
        if "remove_512" in args.tasks:
            if len(text) < 512:
                output['remove_512'] = True
                return output, text, document, True

        # Remove docs with less than 256 characters that contain javascript.
        if "remove_256_javascript" in args.tasks:
            if len(text) < 256 and 'javascript' in text.lower():
                output['remove_256_javascript'] = True
                return output, text, document, True

        # Remove docs with less than 512 characters that are not English.
        if "remove_512_non_english" in args.tasks:
            if len(text) < 512 and detect(text) != 'en':
                output['remove_512_non_english'] = True
                return output, text, document, True
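
        # Note: langdetect is non-deterministic by default; setting
        # langdetect.DetectorFactory.seed (e.g. to 0) before calling detect()
        # makes results reproducible across runs, per the langdetect README.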

        # Fix the text using ftfy, don't remove the text, hence return False.
        if "ftfy_fix_text" in args.tasks:
            fixed_text = ftfy.fix_text(text)
            output['ftfy_fix_text'] = True
            return output, fixed_text, document, False

        # Cleaning extra spaces and newlines.
        if "general_cleaning" in args.tasks:
            # Collapse runs of spaces and word-adjacent newlines to a single
            # space, e.g. "foo   bar\nbaz" -> "foo bar baz".
            cleaned_text = re.sub(r"  +|\b\n+ |\b\n+", " ", text)
            #cleaned_text = re.sub(r"\n\n+", "\n\n", text) # used this for Gutenberg dataset
            #cleaned_text = re.sub(r"\n", "\n\n", text) # Used this for realnews
            # stories datasets
            #cleaned_text = re.sub(r" \'", "'", text)
            #cleaned_text = re.sub(r" \!", "!", cleaned_text)
            #cleaned_text = re.sub(r" \.", ".", cleaned_text)
            #cleaned_text = re.sub(r" \?", "?", cleaned_text)
            #cleaned_text = re.sub(r" - ", "-", cleaned_text)
            ##cleaned_text = re.sub(r"\" ", "\"", cleaned_text)
            #cleaned_text = re.sub(r" @ ", "@", cleaned_text)
            output['general_cleaning'] = True
            return output, cleaned_text, document, False

    except Exception as e:
        print('Error: *************************\n{}\ntext: {}'.format(e,
              text), flush=True)
        return output, text, document, True

    # Don't remove.
    return output, text, document, False


def process_set(args, input_file, output_f_cleaned, output_f_filtered):

    print(' > working on {} ...'.format(input_file), flush=True)

    num_docs = num_remove_512 = num_remove_java = num_remove_512_non_english \
        = num_ftfy_fix_text = num_general_cleaning = 0

    # Output files and counters.
    output_cleaned = open(output_f_cleaned, 'wb')
    output_filtered = open(output_f_filtered, 'wb')

    start_time = time.time()

    # Setup multi-processing.
    num_workers = 40
    fin = open(input_file, 'r', encoding='utf-8')
    pool = multiprocessing.Pool(num_workers)
    process_doc_partial = partial(process_doc, args=args)
    processed_docs = pool.imap(process_doc_partial, fin, 500)
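
    # imap streams results back in input order; the third argument (500) is
    # the chunksize, i.e. how many input lines are shipped to a worker at
    # a time.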

    # Process documents.
    for output, text, document, to_filter in processed_docs:
        num_docs += 1

        # Update per-task counters.
        num_remove_512 += 1 if output['remove_512'] else 0
        num_remove_java += 1 if output['remove_256_javascript'] else 0
        num_remove_512_non_english += 1 if output['remove_512_non_english'] \
            else 0
        num_ftfy_fix_text += 1 if output['ftfy_fix_text'] else 0
        num_general_cleaning += 1 if output['general_cleaning'] else 0

        # Write the (possibly modified) document to the appropriate file.
        document['text'] = text
        myjson = json.dumps(document, ensure_ascii=False)
        if to_filter:
            output_filtered.write(myjson.encode('utf-8'))
            output_filtered.write('\n'.encode('utf-8'))
        else:
            output_cleaned.write(myjson.encode('utf-8'))
            output_cleaned.write('\n'.encode('utf-8'))

        if num_docs % args.log_interval == 0:
            print(' processed {:9d} documents in {:.2f} seconds ...'.format(
                num_docs, time.time() - start_time), flush=True)

    # Close the files.
    output_cleaned.close()
    output_filtered.close()
    fin.close()

    # Print stats.
    print(' >> total docs: {} remove_512 {} remove_256_javascript {} '
          'remove_512_non_english {} ftfy_fix_text {} general_cleaning {}'
          .format(num_docs, num_remove_512, num_remove_java,
                  num_remove_512_non_english, num_ftfy_fix_text,
                  num_general_cleaning), flush=True)
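

# For each input file F.json, the script writes F_cleaned.json (documents
# that were kept, possibly with modified text) and F_filtered.json
# (documents that were removed) under --output-path.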
if __name__ == '__main__':

    print('parsing the arguments ...')

    parser = argparse.ArgumentParser()
    parser.add_argument('--input-files', nargs='*', required=True,
                        default=None,
                        help='Input json files that need to be cleaned')
    parser.add_argument('--tasks', nargs='*', required=True, default=None,
                        help='Tasks to perform on the input files, '
                             'such as remove_512, remove_256_javascript, '
                             'remove_512_non_english, ftfy_fix_text, and '
                             'general_cleaning. 256 or 512 is the number'
                             ' of characters.')
    parser.add_argument('--output-path', type=str, default=None,
                        help='Directory where the output should go')
    parser.add_argument('--log-interval', type=int, default=100,
                        help='Log interval')
    args = parser.parse_args()

    print('cleanup dataset ...')

    for input_file in args.input_files:
        input_filename, input_filename_ext = os.path.splitext(
            Path(input_file).name)

        output_f_cleaned = os.path.join(args.output_path, input_filename +
                                        "_cleaned" + input_filename_ext)
        output_f_filtered = os.path.join(args.output_path, input_filename +
                                         "_filtered" + input_filename_ext)

        process_set(args, input_file, output_f_cleaned, output_f_filtered)

    print('done :-)', flush=True)