create_loose_json.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839
  1. # Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
  2. # -*- coding: utf-8 -*-
  3. import json
  4. import os, sys
  5. import numpy as np
  6. import argparse
  7. import numpy as np
  8. import torch
  9. import os
  10. import pandas as pd
  11. import time
  12. # TODO: also filter duplicates (not implemented yet; duplicate lines are currently written as-is)
  13. def main(args):
  14. with open(args.infile,'r', encoding='utf-8',errors='ignore') as fin:
  15. with open(args.outfile,'a', encoding='utf-8') as fout:
  16. lines=fin.readlines()
  17. i=0
  18. for line in lines:
  19. if line.strip() not in ['\n','\t','',' ','\r\n'] : # make sure it's not empty
  20. d={'text':line.strip()}
  21. #print(d.items())
  22. data = json.dumps(d,ensure_ascii=False)
  23. fout.write(data)
  24. fout.write('\n')
  25. i+=1
  26. if i%1000000==0:
  27. print("process {} documents so far ...".format(str(i)))
  28. print("example: ", line)
  29. fin.close()
  30. fout.close()
  31. print("finished processing {} lines to loose json format".format(str(i)))
  32. if __name__ == '__main__':
  33. parser = argparse.ArgumentParser()
  34. parser.add_argument('--infile', default=None, type=str, help='input file path')
  35. parser.add_argument('--outfile', default=None, type=str,
  36. help='output file path')
  37. args = parser.parse_args()
  38. main(args)