overlap_coefficient.py 968 B

1234567891011121314151617181920212223242526272829
  1. from .shingle_based import ShingleBased
  2. from .string_distance import NormalizedStringDistance
  3. from .string_similarity import NormalizedStringSimilarity
  4. class OverlapCoefficient(ShingleBased, NormalizedStringDistance, NormalizedStringSimilarity):
  5. def __init__(self, k=3):
  6. super().__init__(k)
  7. def distance(self, s0, s1):
  8. return 1.0 - self.similarity(s0, s1)
  9. def similarity(self, s0, s1):
  10. if s0 is None:
  11. raise TypeError("Argument s0 is NoneType.")
  12. if s1 is None:
  13. raise TypeError("Argument s1 is NoneType.")
  14. if s0 == s1:
  15. return 1.0
  16. union = set()
  17. profile0, profile1 = self.get_profile(s0), self.get_profile(s1)
  18. for k in profile0.keys():
  19. union.add(k)
  20. for k in profile1.keys():
  21. union.add(k)
  22. inter = int(len(profile0.keys()) + len(profile1.keys()) - len(union))
  23. return inter / min(len(profile0), len(profile1))