get_nyheterdata.py 821 B

123456789101112131415161718192021222324252627282930
  1. # Copyright (c) 2020 NVIDIA Corporation. All rights reserved.
  2. import json
  3. import os, sys
  4. import numpy as np
  5. import nltk
  6. from sb_corpus_reader import SBCorpusReader
  7. import random
  8. def write2csv(out_path, fname, sents):
  9. f=open(out_path+fname,'a')
  10. for s in sents:
  11. if len(s)>=2:
  12. s_text=' '.join(s)
  13. f.write(s_text+'\n')
  14. print("finish processing ",fname)
  15. f.close()
  16. out_path='./'
  17. xml_f=out_path+'webbnyheter2013.xml'
  18. if xml_f.endswith('.xml') :
  19. corpus = SBCorpusReader(xml_f)
  20. sents=corpus.sents()
  21. print(sents[:2])
  22. #n=len(sents)
  23. #rn=random.randint(0,n-1)
  24. #print("a random sample of sentence : \n".format(' '.join(sents[rn])))
  25. fname='webnyheter2013.txt'
  26. print("write to : ",fname)
  27. write2csv(out_path,fname,sents)
  28. print('-----'*10)