# sift4.py
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
  18. from .string_distance import MetricStringDistance
  19. class SIFT4Options(MetricStringDistance):
  20. def __init__(self, options=None):
  21. self.options = {
  22. 'maxdistance': 0,
  23. 'tokenizer': lambda x: [i for i in x],
  24. 'tokenmatcher': lambda t1, t2: t1 == t2,
  25. 'matchingevaluator': lambda t1, t2: 1,
  26. 'locallengthevaluator': lambda x: x,
  27. 'transpositioncostevaluator': lambda c1, c2: 1,
  28. 'transpositionsevaluator': lambda lcss, trans: lcss - trans
  29. }
  30. otheroptions = {
  31. 'tokenizer': {
  32. 'ngram': self.ngramtokenizer,
  33. 'wordsplit': self.wordsplittokenizer,
  34. 'characterfrequency': self.characterfrequencytokenizer
  35. },
  36. 'tokematcher': {'sift4tokenmatcher': self.sift4tokenmatcher},
  37. 'matchingevaluator': {'sift4matchingevaluator': self.sift4matchingevaluator},
  38. 'locallengthevaluator': {
  39. 'rewardlengthevaluator': self.rewardlengthevaluator,
  40. 'rewardlengthevaluator2': self.rewardlengthevaluator2
  41. },
  42. 'transpositioncostevaluator': {'longertranspositionsaremorecostly':self.longertranspositionsaremorecostly},
  43. 'transpositionsevaluator': {}
  44. }
  45. if isinstance(options, dict):
  46. for k, v in options.items():
  47. if k in self.options.keys():
  48. if k == 'maxdistance':
  49. if isinstance(v, int):
  50. self.options[k] = v
  51. else:
  52. raise ValueError("Option maxdistance should be int")
  53. else:
  54. if callable(v):
  55. self.options[k] = v
  56. else:
  57. if v in otheroptions[k].keys():
  58. self.options[k] = otheroptions[k][v]
  59. else:
  60. msg = "Option {} should be callable or one of [{}]".format(k, ', '.join(otheroptions[k].keys()))
  61. raise ValueError(msg)
  62. else:
  63. raise ValueError("Option {} not recognized.".format(k))
  64. elif options is not None:
  65. raise ValueError("options should be a dictionary")
  66. self.maxdistance = self.options['maxdistance']
  67. self.tokenizer = self.options['tokenizer']
  68. self.tokenmatcher = self.options['tokenmatcher']
  69. self.matchingevaluator = self.options['matchingevaluator']
  70. self.locallengthevaluator = self.options['locallengthevaluator']
  71. self.transpositioncostevaluator = self.options['transpositioncostevaluator']
  72. self.transpositionsevaluator = self.options['transpositionsevaluator']
  73. # tokenizers:
  74. @staticmethod
  75. def ngramtokenizer(s, n):
  76. result = []
  77. if not s:
  78. return result
  79. for i in range(len(s) - n - 1):
  80. result.append(s[i:(i + n)])
  81. return result
  82. @staticmethod
  83. def wordsplittokenizer(s):
  84. if not s:
  85. return []
  86. return s.split()
  87. @staticmethod
  88. def characterfrequencytokenizer(s):
  89. letters = [i for i in 'abcdefghijklmnopqrstuvwxyz']
  90. return [s.lower().count(x) for x in letters]
  91. # tokenMatchers:
  92. @staticmethod
  93. def sift4tokenmatcher(t1, t2):
  94. similarity = 1 - SIFT4().distance(t1, t2, 5) / max(len(t1), len(t2))
  95. return similarity > 0.7
  96. # matchingEvaluators:
  97. @staticmethod
  98. def sift4matchingevaluator(t1, t2):
  99. similarity = 1 - SIFT4().distance(t1, t2, 5) / max(len(t1), len(t2))
  100. return similarity
  101. # localLengthEvaluators:
  102. @staticmethod
  103. def rewardlengthevaluator(l):
  104. if l < 1:
  105. return l
  106. return l - 1 / (l + 1)
  107. @staticmethod
  108. def rewardlengthevaluator2(l):
  109. return pow(l, 1.5)
  110. # transpositionCostEvaluators:
  111. @staticmethod
  112. def longertranspositionsaremorecostly(c1, c2):
  113. return abs(c2 - c1) / 9 + 1
  114. class SIFT4:
  115. # As described in https://siderite.dev/blog/super-fast-and-accurate-string-distance.html/
  116. def distance(self, s1, s2, maxoffset=5, options=None):
  117. options = SIFT4Options(options)
  118. t1, t2 = options.tokenizer(s1), options.tokenizer(s2)
  119. l1, l2 = len(t1), len(t2)
  120. if l1 == 0:
  121. return l2
  122. if l2 == 0:
  123. return l1
  124. c1, c2, lcss, local_cs, trans, offset_arr = 0, 0, 0, 0, 0, []
  125. while (c1 < l1) and (c2 < l2):
  126. if options.tokenmatcher(t1[c1], t2[c2]):
  127. local_cs += options.matchingevaluator(t1[c1], t2[c2])
  128. isTrans = False
  129. i = 0
  130. while i < len(offset_arr):
  131. ofs = offset_arr[i]
  132. if (c1 <= ofs['c1']) or (c2 <= ofs['c2']):
  133. isTrans = abs(c2 - c1) >= abs(ofs['c2'] - ofs['c1'])
  134. if isTrans:
  135. trans += options.transpositioncostevaluator(c1, c2)
  136. else:
  137. if not ofs['trans']:
  138. ofs['trans'] = True
  139. trans += options.transpositioncostevaluator(ofs['c1'], ofs['c2'])
  140. break
  141. else:
  142. if (c1 > ofs['c2']) and (c2 > ofs['c1']):
  143. offset_arr.pop(i)
  144. else:
  145. i += 1
  146. offset_arr.append({'c1': c1, 'c2': c2, 'trans': isTrans})
  147. else:
  148. lcss += options.locallengthevaluator(local_cs)
  149. local_cs = 0
  150. if c1 != c2:
  151. c1 = c2 = min(c1, c2)
  152. for i in range(maxoffset):
  153. if (c1 + i < l1) or (c2 + i < l2):
  154. if (c1 + i < l1) and options.tokenmatcher(t1[c1 + i], t2[c2]):
  155. c1 += i - 1
  156. c2 -= 1
  157. break
  158. if (c2 + i < l2) and options.tokenmatcher(t1[c1], t2[c2 + i]):
  159. c1 -= 1
  160. c2 += i - 1
  161. break
  162. c1 += 1
  163. c2 += 1
  164. if options.maxdistance:
  165. temporarydistance = options.locallengthevaluator(max(c1, c2)) - options.transpositionsevaluator(lcss, trans)
  166. if temporarydistance >= options.maxdistance:
  167. return round(temporarydistance)
  168. if (c1 >= l1) or (c2 >= l2):
  169. lcss += options.locallengthevaluator(local_cs)
  170. local_cs = 0
  171. c1 = c2 = min(c1, c2)
  172. lcss += options.locallengthevaluator(local_cs)
  173. return round(options.locallengthevaluator(max(l1, l2)) - options.transpositionsevaluator(lcss, trans))