# Copyright (c) 2018 luozhouyang # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. from .string_distance import NormalizedStringDistance from .string_similarity import NormalizedStringSimilarity class JaroWinkler(NormalizedStringSimilarity, NormalizedStringDistance): def __init__(self, threshold=0.7): self.threshold = threshold self.three = 3 self.jw_coef = 0.1 def get_threshold(self): return self.threshold def similarity(self, s0, s1): if s0 is None: raise TypeError("Argument s0 is NoneType.") if s1 is None: raise TypeError("Argument s1 is NoneType.") if s0 == s1: return 1.0 mtp = self.matches(s0, s1) m = mtp[0] if m == 0: return 0.0 j = (m / len(s0) + m / len(s1) + (m - mtp[1]) / m) / self.three jw = j if j > self.get_threshold(): jw = j + min(self.jw_coef, 1.0 / mtp[self.three]) * mtp[2] * (1 - j) return jw def distance(self, s0, s1): return 1.0 - self.similarity(s0, s1) @staticmethod def matches(s0, s1): if len(s0) > len(s1): max_str = s0 min_str = s1 else: max_str = s1 min_str = s0 ran = int(max(len(max_str) / 2 - 1, 0)) match_indexes = [-1] * len(min_str) match_flags = [False] * len(max_str) matches = 0 for mi in range(len(min_str)): c1 = min_str[mi] for xi in range(max(mi - ran, 0), min(mi + ran + 1, len(max_str))): if not match_flags[xi] and c1 == max_str[xi]: match_indexes[mi] = xi match_flags[xi] = True matches += 1 break ms0, ms1 = [0] * matches, [0] * matches si = 0 for i in range(len(min_str)): if match_indexes[i] != -1: ms0[si] = min_str[i] si += 1 si = 0 for j in range(len(max_str)): if match_flags[j]: ms1[si] = max_str[j] si += 1 transpositions = 0 for mi in range(len(ms0)): if ms0[mi] != ms1[mi]: transpositions += 1 prefix = 0 for mi in range(len(min_str)): if s0[mi] == s1[mi]: prefix += 1 else: break return [matches, int(transpositions / 2), prefix, len(max_str)]