# cosine.py
  1. # Copyright (c) 2018 luozhouyang
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in all
  11. # copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  19. # SOFTWARE.
  20. import math
  21. from .shingle_based import ShingleBased
  22. from .string_distance import NormalizedStringDistance
  23. from .string_similarity import NormalizedStringSimilarity
  24. import re
  25. _SPACE_PATTERN = re.compile("\\s+")
  26. class Cosine(ShingleBased, NormalizedStringDistance,
  27. NormalizedStringSimilarity):
  28. def __init__(self, k):
  29. super().__init__(k)
  30. def distance(self, s0, s1):
  31. return 1.0 - self.similarity(s0, s1)
  32. def similarity(self, s0, s1):
  33. if s0 is None:
  34. raise TypeError("Argument s0 is NoneType.")
  35. if s1 is None:
  36. raise TypeError("Argument s1 is NoneType.")
  37. if s0 == s1:
  38. return 1.0
  39. s0 = _SPACE_PATTERN.sub("", s0)
  40. s1 = _SPACE_PATTERN.sub("", s1)
  41. if len(s0) < self.get_k() or len(s1) < self.get_k():
  42. return 0.0
  43. profile0 = self.get_profile(s0)
  44. profile1 = self.get_profile(s1)
  45. return self._dot_product(profile0, profile1) / (
  46. self._norm(profile0) * self._norm(profile1))
  47. def similarity_profiles(self, profile0, profile1):
  48. return self._dot_product(profile0, profile1) / (
  49. self._norm(profile0) * self._norm(profile1))
  50. @staticmethod
  51. def _dot_product(profile0, profile1):
  52. small = profile1
  53. large = profile0
  54. if len(profile0) < len(profile1):
  55. small = profile0
  56. large = profile1
  57. agg = 0.0
  58. for k, v in small.items():
  59. i = large.get(k)
  60. if not i:
  61. continue
  62. agg += 1.0 * v * i
  63. return agg
  64. @staticmethod
  65. def _norm(profile):
  66. agg = 0.0
  67. for k, v in profile.items():
  68. agg += 1.0 * v * v
  69. return math.sqrt(agg)