similarity.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. # Copyright (c) 2018 luozhouyang
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in all
  11. # copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  19. # SOFTWARE.
  20. from enum import IntEnum
  21. from .cosine import Cosine
  22. from .damerau import Damerau
  23. from .jaccard import Jaccard
  24. from .jaro_winkler import JaroWinkler
  25. from .levenshtein import Levenshtein
  26. from .longest_common_subsequence import LongestCommonSubsequence
  27. from .metric_lcs import MetricLCS
  28. from .ngram import NGram
  29. from .normalized_levenshtein import NormalizedLevenshtein
  30. from .optimal_string_alignment import OptimalStringAlignment
  31. from .qgram import QGram
  32. from .sorensen_dice import SorensenDice
  33. from .weighted_levenshtein import WeightedLevenshtein
  34. class Algorithm(IntEnum):
  35. COSINE = 1
  36. DAMERAU = 2
  37. JACCARD = 3
  38. JARO_WINKLE = 4
  39. LEVENSHTEIN = 5
  40. LCS = 6
  41. METRIC_LCS = 7
  42. N_GRAM = 8
  43. NORMALIZED_LEVENSHTEIN = 9
  44. OPTIMAL_STRING_ALIGNMENT = 10
  45. Q_GRAM = 11
  46. SORENSEN_DICE = 12
  47. WEIGHTED_LEVENSHTEIN = 13
  48. class Factory:
  49. @staticmethod
  50. def get_algorithm(algorithm: Algorithm, k=3):
  51. if algorithm == Algorithm.COSINE:
  52. return Cosine(k)
  53. elif algorithm == Algorithm.DAMERAU:
  54. return Damerau()
  55. elif algorithm == Algorithm.JACCARD:
  56. return Jaccard(k)
  57. elif algorithm == Algorithm.JARO_WINKLE:
  58. return JaroWinkler()
  59. elif algorithm == Algorithm.LEVENSHTEIN:
  60. return Levenshtein()
  61. elif algorithm == Algorithm.LCS:
  62. return LongestCommonSubsequence()
  63. elif algorithm == Algorithm.METRIC_LCS:
  64. return MetricLCS()
  65. elif algorithm == Algorithm.N_GRAM:
  66. return NGram()
  67. elif algorithm == Algorithm.NORMALIZED_LEVENSHTEIN:
  68. return NormalizedLevenshtein()
  69. elif algorithm == Algorithm.OPTIMAL_STRING_ALIGNMENT:
  70. return OptimalStringAlignment()
  71. elif algorithm == Algorithm.Q_GRAM:
  72. return QGram()
  73. elif algorithm == Algorithm.SORENSEN_DICE:
  74. return SorensenDice(k)
  75. elif algorithm == Algorithm.WEIGHTED_LEVENSHTEIN:
  76. raise TypeError("This method does not support create weighted_levenshtein algorithm.")
  77. else:
  78. return Cosine(k)
  79. @staticmethod
  80. def get_weighted_levenshtein(char_sub, char_change):
  81. return WeightedLevenshtein(char_sub, char_change)
  82. if __name__ == "__main__":
  83. a = Factory().get_algorithm(Algorithm.LEVENSHTEIN)
  84. distance_format = "distance: {:.4} between {} and {}"
  85. s0 = "你好"
  86. s1 = "你好啊"
  87. print(distance_format.format(str(a.distance(s0, s1)), s0, s1))