sorensen_dice.py 1019 B

1234567891011121314151617181920212223242526272829303132
  1. from .shingle_based import ShingleBased
  2. from .string_distance import NormalizedStringDistance
  3. from .string_similarity import NormalizedStringSimilarity
  4. class SorensenDice(ShingleBased, NormalizedStringDistance, NormalizedStringSimilarity):
  5. def __init__(self, k=3):
  6. super().__init__(k)
  7. def distance(self, s0, s1):
  8. return 1.0 - self.similarity(s0, s1)
  9. def similarity(self, s0, s1):
  10. if s0 is None:
  11. raise TypeError("Argument s0 is NoneType.")
  12. if s1 is None:
  13. raise TypeError("Argument s1 is NoneType.")
  14. if s0 == s1:
  15. return 1.0
  16. union = set()
  17. profile0, profile1 = self.get_profile(s0), self.get_profile(s1)
  18. for k in profile0.keys():
  19. union.add(k)
  20. for k in profile1.keys():
  21. union.add(k)
  22. inter = 0
  23. for k in union:
  24. if k in profile0.keys() and k in profile1.keys():
  25. inter += 1
  26. return 2.0 * inter / (len(profile0) + len(profile1))