similarity.py 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. from enum import IntEnum
  2. from .cosine import Cosine
  3. from .damerau import Damerau
  4. from .jaccard import Jaccard
  5. from .jarowinkler import JaroWinkler
  6. from .levenshtein import Levenshtein
  7. from .longest_common_subsequence import LongestCommonSubsequence
  8. from .metric_lcs import MetricLCS
  9. from .ngram import NGram
  10. from .normalized_levenshtein import NormalizedLevenshtein
  11. from .optimal_string_alignment import OptimalStringAlignment
  12. from .qgram import QGram
  13. from .sorensen_dice import SorensenDice
  14. from .weighted_levenshtein import WeightedLevenshtein
  15. class Algorithm(IntEnum):
  16. COSINE = 1
  17. DAMERAU = 2
  18. JACCARD = 3
  19. JARO_WINKLE = 4
  20. LEVENSHTEIN = 5
  21. LCS = 6
  22. METRIC_LCS = 7
  23. N_GRAM = 8
  24. NORMALIZED_LEVENSHTEIN = 9
  25. OPTIMAL_STRING_ALIGNMENT = 10
  26. Q_GRAM = 11
  27. SORENSEN_DICE = 12
  28. WEIGHTED_LEVENSHTEIN = 13
  29. class Factory:
  30. @staticmethod
  31. def get_algorithm(algorithm: Algorithm, k=3):
  32. if algorithm == Algorithm.COSINE:
  33. return Cosine(k)
  34. elif algorithm == Algorithm.DAMERAU:
  35. return Damerau()
  36. elif algorithm == Algorithm.JACCARD:
  37. return Jaccard(k)
  38. elif algorithm == Algorithm.JARO_WINKLE:
  39. return JaroWinkler()
  40. elif algorithm == Algorithm.LEVENSHTEIN:
  41. return Levenshtein()
  42. elif algorithm == Algorithm.LCS:
  43. return LongestCommonSubsequence()
  44. elif algorithm == Algorithm.METRIC_LCS:
  45. return MetricLCS()
  46. elif algorithm == Algorithm.N_GRAM:
  47. return NGram()
  48. elif algorithm == Algorithm.NORMALIZED_LEVENSHTEIN:
  49. return NormalizedLevenshtein()
  50. elif algorithm == Algorithm.OPTIMAL_STRING_ALIGNMENT:
  51. return OptimalStringAlignment()
  52. elif algorithm == Algorithm.Q_GRAM:
  53. return QGram()
  54. elif algorithm == Algorithm.SORENSEN_DICE:
  55. return SorensenDice(k)
  56. elif algorithm == Algorithm.WEIGHTED_LEVENSHTEIN:
  57. raise TypeError("This method does not support create weighted_levenshtein algorithm.")
  58. else:
  59. return Cosine(k)
  60. @staticmethod
  61. def get_weighted_levenshtein(char_sub, char_change):
  62. return WeightedLevenshtein(char_sub, char_change)
  63. if __name__ == "__main__":
  64. a = Factory().get_algorithm(Algorithm.LEVENSHTEIN)
  65. distance_format = "distance: {:.4} between {} and {}"
  66. s0 = "你好"
  67. s1 = "你好啊"
  68. print(distance_format.format(str(a.distance(s0, s1)), s0, s1))