jaccard.py 1.0 KB

1234567891011121314151617181920212223242526272829303132
  1. from .shingle_based import ShingleBased
  2. from .string_distance import NormalizedStringDistance, MetricStringDistance
  3. from .string_similarity import NormalizedStringSimilarity
  4. class Jaccard(ShingleBased, MetricStringDistance, NormalizedStringDistance, NormalizedStringSimilarity):
  5. def __init__(self, k):
  6. super().__init__(k)
  7. def distance(self, s0, s1):
  8. 1.0 - self.similarity(s0, s1)
  9. def similarity(self, s0, s1):
  10. if s0 is None:
  11. raise TypeError("Argument s0 is NoneType.")
  12. if s1 is None:
  13. raise TypeError("Argument s1 is NoneType.")
  14. if s0 == s1:
  15. return 1.0
  16. if len(s0) < self.get_k() or len(s1) < self.get_k():
  17. return 0.0
  18. profile0 = self.get_profile(s0)
  19. profile1 = self.get_profile(s1)
  20. union = set()
  21. for ite in profile0.keys():
  22. union.add(ite)
  23. for ite in profile1.keys():
  24. union.add(ite)
  25. inter = int(len(profile0.keys()) + len(profile1.keys()) - len(union))
  26. return 1.0 * inter / len(union)