1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
- # Copyright (c) 2018 luozhouyang
- #
- # Permission is hereby granted, free of charge, to any person obtaining a copy
- # of this software and associated documentation files (the "Software"), to deal
- # in the Software without restriction, including without limitation the rights
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- # copies of the Software, and to permit persons to whom the Software is
- # furnished to do so, subject to the following conditions:
- #
- # The above copyright notice and this permission notice shall be included in all
- # copies or substantial portions of the Software.
- #
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- # SOFTWARE.
- from .string_distance import NormalizedStringDistance
- from .string_similarity import NormalizedStringSimilarity
- class JaroWinkler(NormalizedStringSimilarity, NormalizedStringDistance):
- def __init__(self, threshold=0.7):
- self.threshold = threshold
- self.three = 3
- self.jw_coef = 0.1
- def get_threshold(self):
- return self.threshold
- def similarity(self, s0, s1):
- if s0 is None:
- raise TypeError("Argument s0 is NoneType.")
- if s1 is None:
- raise TypeError("Argument s1 is NoneType.")
- if s0 == s1:
- return 1.0
- mtp = self.matches(s0, s1)
- m = mtp[0]
- if m == 0:
- return 0.0
- j = (m / len(s0) + m / len(s1) + (m - mtp[1]) / m) / self.three
- jw = j
- if j > self.get_threshold():
- jw = j + min(self.jw_coef, 1.0 / mtp[self.three]) * mtp[2] * (1 - j)
- return jw
- def distance(self, s0, s1):
- return 1.0 - self.similarity(s0, s1)
- @staticmethod
- def matches(s0, s1):
- if len(s0) > len(s1):
- max_str = s0
- min_str = s1
- else:
- max_str = s1
- min_str = s0
- ran = int(max(len(max_str) / 2 - 1, 0))
- match_indexes = [-1] * len(min_str)
- match_flags = [False] * len(max_str)
- matches = 0
- for mi in range(len(min_str)):
- c1 = min_str[mi]
- for xi in range(max(mi - ran, 0), min(mi + ran + 1, len(max_str))):
- if not match_flags[xi] and c1 == max_str[xi]:
- match_indexes[mi] = xi
- match_flags[xi] = True
- matches += 1
- break
- ms0, ms1 = [0] * matches, [0] * matches
- si = 0
- for i in range(len(min_str)):
- if match_indexes[i] != -1:
- ms0[si] = min_str[i]
- si += 1
- si = 0
- for j in range(len(max_str)):
- if match_flags[j]:
- ms1[si] = max_str[j]
- si += 1
- transpositions = 0
- for mi in range(len(ms0)):
- if ms0[mi] != ms1[mi]:
- transpositions += 1
- prefix = 0
- for mi in range(len(min_str)):
- if s0[mi] == s1[mi]:
- prefix += 1
- else:
- break
- return [matches, int(transpositions / 2), prefix, len(max_str)]
|