cosine.py 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
# Copyright (c) 2018 luozhouyang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
  20. import math
  21. from .shingle_based import ShingleBased
  22. from .string_distance import NormalizedStringDistance
  23. from .string_similarity import NormalizedStringSimilarity
  24. class Cosine(ShingleBased, NormalizedStringDistance,
  25. NormalizedStringSimilarity):
  26. def __init__(self, k):
  27. super().__init__(k)
  28. def distance(self, s0, s1):
  29. return 1.0 - self.similarity(s0, s1)
  30. def similarity(self, s0, s1):
  31. if s0 is None:
  32. raise TypeError("Argument s0 is NoneType.")
  33. if s1 is None:
  34. raise TypeError("Argument s1 is NoneType.")
  35. if s0 == s1:
  36. return 1.0
  37. if len(s0) < self.get_k() or len(s1) < self.get_k():
  38. return 0.0
  39. profile0 = self.get_profile(s0)
  40. profile1 = self.get_profile(s1)
  41. return self._dot_product(profile0, profile1) / (self._norm(profile0) * self._norm(profile1))
  42. def similarity_profiles(self, profile0, profile1):
  43. return self._dot_product(profile0, profile1) / (self._norm(profile0) * self._norm(profile1))
  44. @staticmethod
  45. def _dot_product(profile0, profile1):
  46. small = profile1
  47. large = profile0
  48. if len(profile0) < len(profile1):
  49. small = profile0
  50. large = profile1
  51. agg = 0.0
  52. for k, v in small.items():
  53. i = large.get(k)
  54. if not i:
  55. continue
  56. agg += 1.0 * v * i
  57. return agg
  58. @staticmethod
  59. def _norm(profile):
  60. agg = 0.0
  61. for k, v in profile.items():
  62. agg += 1.0 * v * v
  63. return math.sqrt(agg)