create_loose_json.py 1.3 KB

1234567891011121314151617181920212223242526272829303132333435363738
  1. # -*- coding: utf-8 -*-
  2. import json
  3. import os, sys
  4. import numpy as np
  5. import argparse
  6. import numpy as np
  7. import torch
  8. import os
  9. import pandas as pd
  10. import time
  11. # TODO: also filter duplicates (duplicate filtering is not implemented yet)
  12. def main(args):
  13. with open(args.infile,'r', encoding='utf-8',errors='ignore') as fin:
  14. with open(args.outfile,'a', encoding='utf-8') as fout:
  15. lines=fin.readlines()
  16. i=0
  17. for line in lines:
  18. if line.strip() not in ['\n','\t','',' ','\r\n'] : # make sure it's not empty
  19. d={'text':line.strip()}
  20. #print(d.items())
  21. data = json.dumps(d,ensure_ascii=False)
  22. fout.write(data)
  23. fout.write('\n')
  24. i+=1
  25. if i%1000000==0:
  26. print("process {} documents so far ...".format(str(i)))
  27. print("example: ", line)
  28. fin.close()
  29. fout.close()
  30. print("finished processing {} lines to loose json format".format(str(i)))
  31. if __name__ == '__main__':
  32. parser = argparse.ArgumentParser()
  33. parser.add_argument('--infile', default=None, type=str, help='input file path')
  34. parser.add_argument('--outfile', default=None, type=str,
  35. help='output file path')
  36. args = parser.parse_args()
  37. main(args)