# gensim_similarity.py
# (web-scrape artifacts — filename/size banner and fused line-number gutter — removed)
  1. # Documents Similarity using NLTK and Gensim library
  2. import gensim
  3. import nltk
  4. from nltk.tokenize import word_tokenize
  5. raw_documents = ["I'm taking the show on the road.",
  6. "My socks are a force multiplier.",
  7. "I am the barber who cuts everyone's hair who doesn't cut their own.",
  8. "Legend has it that the mind is a mad monkey.",
  9. "I make my own fun."]
  10. print("Number of documents:",len(raw_documents))
  11. gen_docs = [[w.lower() for w in word_tokenize(text)]
  12. for text in raw_documents]
  13. print(gen_docs)
  14. dictionary = gensim.corpora.Dictionary(gen_docs)
  15. print(dictionary[5])
  16. print(dictionary.token2id['road'])
  17. print("Number of words in dictionary:",len(dictionary))
  18. for i in range(len(dictionary)):
  19. print(i, dictionary[i])
  20. corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
  21. print(corpus)
  22. tf_idf = gensim.models.TfidfModel(corpus)
  23. print(tf_idf)
  24. s = 0
  25. for i in corpus:
  26. s += len(i)
  27. print(s)
  28. sims = gensim.similarities.Similarity('workdir/',tf_idf[corpus],
  29. num_features=len(dictionary))
  30. print(sims)
  31. print(type(sims))
  32. query_doc = [w.lower() for w in word_tokenize("Socks are a force for good.")]
  33. print(query_doc)
  34. query_doc_bow = dictionary.doc2bow(query_doc)
  35. print(query_doc_bow)
  36. query_doc_tf_idf = tf_idf[query_doc_bow]
  37. print(query_doc_tf_idf)
  38. print(sims[query_doc_tf_idf])