cosine.py 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. # Copyright (c) 2018 luozhouyang
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in all
  11. # copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  19. # SOFTWARE.
  20. import math
  21. from .shingle_based import ShingleBased
  22. from .string_distance import NormalizedStringDistance
  23. from .string_similarity import NormalizedStringSimilarity
  24. class Cosine(ShingleBased, NormalizedStringDistance,
  25. NormalizedStringSimilarity):
  26. def __init__(self, k):
  27. super().__init__(k)
  28. def distance(self, s0, s1):
  29. return 1.0 - self.similarity(s0, s1)
  30. def similarity(self, s0, s1):
  31. if s0 is None:
  32. raise TypeError("Argument s0 is NoneType.")
  33. if s1 is None:
  34. raise TypeError("Argument s1 is NoneType.")
  35. if s0 == s1:
  36. return 1.0
  37. if len(s0) < self.get_k() or len(s1) < self.get_k():
  38. return 0.0
  39. profile0 = self.get_profile(s0)
  40. profile1 = self.get_profile(s1)
  41. return self._dot_product(profile0, profile1) / (
  42. self._norm(profile0) * self._norm(profile1))
  43. def similarity_profiles(self, profile0, profile1):
  44. return self._dot_product(profile0, profile1) / (
  45. self._norm(profile0) * self._norm(profile1))
  46. @staticmethod
  47. def _dot_product(profile0, profile1):
  48. small = profile1
  49. large = profile0
  50. if len(profile0) < len(profile1):
  51. small = profile0
  52. large = profile1
  53. agg = 0.0
  54. for k, v in small.items():
  55. i = large.get(k)
  56. if not i:
  57. continue
  58. agg += 1.0 * v * i
  59. return agg
  60. @staticmethod
  61. def _norm(profile):
  62. agg = 0.0
  63. for k, v in profile.items():
  64. agg += 1.0 * v * v
  65. return math.sqrt(agg)