浏览代码

Create similarity module

luozhouyang 7 年之前
父节点
当前提交
88250363da

+ 4 - 0
.gitignore

@@ -0,0 +1,4 @@
+.idea/
+.vscode/
+__pycache__/
+similarity/__pycache__/

文件差异内容过多而无法显示
+ 461 - 0
README.md


+ 0 - 0
similarity/__init__.py


+ 62 - 0
similarity/cosine.py

@@ -0,0 +1,62 @@
+import math
+
+from .shingle_based import ShingleBased
+from .string_distance import NormalizedStringDistance
+from .string_similarity import NormalizedStringSimilarity
+
+
+class Cosine(ShingleBased, NormalizedStringDistance, NormalizedStringSimilarity):
+    """Cosine similarity and distance over k-shingle count profiles.
+
+    Each string is converted by ``get_profile`` into a mapping of shingle to
+    occurrence count. The similarity is the cosine of the angle between the
+    two count vectors; the distance is ``1.0 - similarity``.
+    """
+
+    def __init__(self, k):
+        """Create the measure using shingles of length ``k``."""
+        super().__init__(k)
+
+    def distance(self, s0, s1):
+        """Return ``1.0 - similarity(s0, s1)``."""
+        return 1.0 - self.similarity(s0, s1)
+
+    def similarity(self, s0, s1):
+        """Return the cosine similarity of ``s0`` and ``s1``.
+
+        Equal strings score 1.0. If either string is shorter than ``k``, or
+        yields no shingles at all, the result is 0.0.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 1.0
+        if len(s0) < self.get_k() or len(s1) < self.get_k():
+            return 0.0
+        return self.similarity_profiles(self.get_profile(s0), self.get_profile(s1))
+
+    def similarity_profiles(self, profile0, profile1):
+        """Return the cosine similarity of two precomputed shingle profiles.
+
+        Returns 0.0 when either profile has zero norm (for example an empty
+        profile) instead of dividing by zero.
+        """
+        denominator = self._norm(profile0) * self._norm(profile1)
+        if denominator == 0:
+            # The cosine is undefined for a zero vector; follow the class
+            # convention of scoring "nothing to compare" as 0.0.
+            return 0.0
+        return self._dot_product(profile0, profile1) / denominator
+
+    @staticmethod
+    def _dot_product(profile0, profile1):
+        """Return the dot product of two shingle-count profiles as a float."""
+        # Walk the smaller profile and look each shingle up in the larger one.
+        if len(profile0) < len(profile1):
+            small, large = profile0, profile1
+        else:
+            small, large = profile1, profile0
+        return float(sum(count * (large.get(shingle) or 0)
+                         for shingle, count in small.items()))
+
+    @staticmethod
+    def _norm(profile):
+        """Return the Euclidean norm of a shingle-count profile."""
+        return math.sqrt(sum(count * count for count in profile.values()))
+
+
+if __name__ == "__main__":
+    # Demo: print the cosine distance, then the similarity, of two samples.
+    cosine = Cosine(1)
+    str0, str1 = "上海市宝山区 你好", "上海浦东新区 你好吗"
+    d, s = cosine.distance(str0, str1), cosine.similarity(str0, str1)
+    print(d, s, sep="\n")

+ 32 - 0
similarity/cosine_test.py

@@ -0,0 +1,32 @@
+import unittest
+
+from .cosine import Cosine
+
+
+class TestCosine(unittest.TestCase):
+    """Smoke test that prints Cosine results for a few sample strings."""
+
+    def test_cosine(self):
+        """Print distance, then similarity, for every sample pair.
+
+        Nothing is asserted; each value is stringified and cut to four
+        characters by the ``{:.4}`` format.
+        """
+        a = Cosine(1)
+        samples = ["", "", "上海", "上海市"]
+        pairs = [(left, right)
+                 for index, left in enumerate(samples)
+                 for right in samples[index + 1:]]
+        distance_format = "distance: {:.4}\t between {} and {}"
+        similarity_format = "similarity: {:.4}\t between {} and {}"
+        for left, right in pairs:
+            print(distance_format.format(str(a.distance(left, right)), left, right))
+
+        for left, right in pairs:
+            print(similarity_format.format(str(a.similarity(left, right)), left, right))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 43 - 0
similarity/damerau.py

@@ -0,0 +1,43 @@
+from .string_distance import MetricStringDistance
+import numpy as np
+
+
+class Damerau(MetricStringDistance):
+    """Damerau-Levenshtein distance (insert, delete, substitute, transpose)."""
+
+    def distance(self, s0, s1):
+        """Return the edit distance between ``s0`` and ``s1``.
+
+        Equal inputs return 0.0; otherwise the value is read from a NumPy
+        float matrix.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 0.0
+        rows, cols = len(s0), len(s1)
+        inf = int(rows + cols)
+        # Last 1-based row of s0 in which each character was seen (0 = never).
+        last_row = {ch: 0 for ch in s0}
+        last_row.update((ch, 0) for ch in s1)
+        # The matrix has an extra leading row and column holding `inf`.
+        h = np.zeros((rows + 2, cols + 2))
+        h[1:, 0] = inf
+        h[1:, 1] = np.arange(rows + 1)
+        h[0, 1:] = inf
+        h[1, 1:] = np.arange(cols + 1)
+        for i, c0 in enumerate(s0, start=1):
+            last_col = 0  # last 1-based column in this row where c0 matched
+            for j, c1 in enumerate(s1, start=1):
+                i1 = last_row[c1]
+                j1 = last_col
+                if c0 == c1:
+                    cost = 0
+                    last_col = j
+                else:
+                    cost = 1
+                substitution = h[i][j] + cost
+                insertion = h[i + 1][j] + 1
+                deletion = h[i][j + 1] + 1
+                transposition = h[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1)
+                h[i + 1][j + 1] = min(substitution, insertion, deletion, transposition)
+            last_row[c0] = i
+
+        return h[rows + 1][cols + 1]

+ 24 - 0
similarity/damerau_test.py

@@ -0,0 +1,24 @@
+import unittest
+
+from .damerau import Damerau
+
+
+class TestDamerau(unittest.TestCase):
+    """Smoke test that prints Damerau distances for a few sample strings."""
+
+    def test_damerau(self):
+        """Print the distance for every sample pair; nothing is asserted."""
+        a = Damerau()
+        samples = ["", "", "上海", "上海市"]
+        distance_format = "distance: {:.4}\t between {} and {}"
+        for index, left in enumerate(samples):
+            for right in samples[index + 1:]:
+                print(distance_format.format(str(a.distance(left, right)), left, right))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 31 - 0
similarity/jaccard.py

@@ -0,0 +1,31 @@
+from .shingle_based import ShingleBased
+from .string_distance import NormalizedStringDistance, MetricStringDistance
+from .string_similarity import NormalizedStringSimilarity
+
+
+class Jaccard(ShingleBased, MetricStringDistance, NormalizedStringDistance, NormalizedStringSimilarity):
+    """Jaccard index over the sets of k-shingles of two strings."""
+
+    def __init__(self, k):
+        """Create the measure using shingles of length ``k``."""
+        super().__init__(k)
+
+    def distance(self, s0, s1):
+        """Return ``1.0 - similarity(s0, s1)``."""
+        # The original computed this expression but never returned it.
+        return 1.0 - self.similarity(s0, s1)
+
+    def similarity(self, s0, s1):
+        """Return ``|A & B| / |A | B|`` for the shingle sets of the inputs.
+
+        Equal strings score 1.0. If either string is shorter than ``k``, or
+        neither string yields any shingle, the result is 0.0.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 1.0
+        if len(s0) < self.get_k() or len(s1) < self.get_k():
+            return 0.0
+        shingles0 = set(self.get_profile(s0))
+        shingles1 = set(self.get_profile(s1))
+        union = shingles0 | shingles1
+        if not union:
+            # Both profiles are empty (e.g. whitespace-only inputs, which
+            # get_profile strips); avoid dividing by zero.
+            return 0.0
+        return 1.0 * len(shingles0 & shingles1) / len(union)

+ 32 - 0
similarity/jaccard_test.py

@@ -0,0 +1,32 @@
+import unittest
+
+from .jaccard import Jaccard
+
+
+class TestJaccard(unittest.TestCase):
+    """Smoke test that prints Jaccard results for a few sample strings."""
+
+    def test_jaccard(self):
+        """Print distance, then similarity, for every sample pair.
+
+        Nothing is asserted; each value is stringified and cut to four
+        characters by the ``{:.4}`` format.
+        """
+        a = Jaccard(1)
+        samples = ["", "", "上海", "上海市"]
+        pairs = [(left, right)
+                 for index, left in enumerate(samples)
+                 for right in samples[index + 1:]]
+        distance_format = "distance: {:.4}\t between {} and {}"
+        similarity_format = "similarity: {:.4}\t between {} and {}"
+        for left, right in pairs:
+            print(distance_format.format(str(a.distance(left, right)), left, right))
+
+        for left, right in pairs:
+            print(similarity_format.format(str(a.similarity(left, right)), left, right))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 77 - 0
similarity/jarowinkler.py

@@ -0,0 +1,77 @@
+from .string_distance import NormalizedStringDistance
+from .string_similarity import NormalizedStringSimilarity
+
+
+class JaroWinkler(NormalizedStringSimilarity, NormalizedStringDistance):
+    """Jaro-Winkler similarity and the derived distance ``1 - similarity``."""
+
+    def __init__(self, threshold=0.7):
+        """Store the threshold above which the common-prefix boost applies."""
+        self.threshold = threshold
+        self.three = 3  # divisor of the Jaro average and index of the max length in matches()
+        self.jw_coef = 0.1  # upper bound of the per-character prefix weight
+
+    def get_threshold(self):
+        """Return the configured boost threshold."""
+        return self.threshold
+
+    def similarity(self, s0, s1):
+        """Return the Jaro-Winkler similarity of ``s0`` and ``s1``.
+
+        Equal strings score 1.0; strings with no matching characters score 0.0.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 1.0
+        mtp = self.matches(s0, s1)
+        m = mtp[0]
+        if m == 0:
+            return 0.0
+        jaro = (m / len(s0) + m / len(s1) + (m - mtp[1]) / m) / self.three
+        if jaro > self.get_threshold():
+            # Boost by the common prefix, weighted by at most jw_coef per character.
+            weight = min(self.jw_coef, 1.0 / mtp[self.three])
+            return jaro + weight * mtp[2] * (1 - jaro)
+        return jaro
+
+    def distance(self, s0, s1):
+        """Return ``1.0 - similarity(s0, s1)``."""
+        return 1.0 - self.similarity(s0, s1)
+
+    @staticmethod
+    def matches(s0, s1):
+        """Return ``[matches, transpositions // 2, common prefix, longer length]``."""
+        longer, shorter = (s0, s1) if len(s0) > len(s1) else (s1, s0)
+        window = int(max(len(longer) / 2 - 1, 0))
+        matched_at = [-1] * len(shorter)
+        taken = [False] * len(longer)
+        count = 0
+        for pos, ch in enumerate(shorter):
+            first = max(pos - window, 0)
+            stop = min(pos + window + 1, len(longer))
+            for cand in range(first, stop):
+                if taken[cand] or ch != longer[cand]:
+                    continue
+                matched_at[pos] = cand
+                taken[cand] = True
+                count += 1
+                break
+
+        # Matched characters of each string, in their own order of appearance.
+        from_shorter = [ch for ch, where in zip(shorter, matched_at) if where != -1]
+        from_longer = [ch for ch, used in zip(longer, taken) if used]
+        out_of_order = sum(1 for a, b in zip(from_shorter, from_longer) if a != b)
+        prefix = 0
+        for a, b in zip(s0, s1):
+            if a != b:
+                break
+            prefix += 1
+        return [count, out_of_order // 2, prefix, len(longer)]

+ 32 - 0
similarity/jarowinkler_test.py

@@ -0,0 +1,32 @@
+import unittest
+
+from .jarowinkler import JaroWinkler
+
+
+class TestJaroWinkler(unittest.TestCase):
+    """Smoke test that prints Jaro-Winkler results for a few sample strings."""
+
+    def test_jarowinkler(self):
+        """Print distance, then similarity, for every sample pair.
+
+        Nothing is asserted; each value is stringified and cut to four
+        characters by the ``{:.4}`` format.
+        """
+        a = JaroWinkler()
+        samples = ["", "", "上海", "上海市"]
+        pairs = [(left, right)
+                 for index, left in enumerate(samples)
+                 for right in samples[index + 1:]]
+        distance_format = "distance: {:.4}\t between {} and {}"
+        similarity_format = "similarity: {:.4}\t between {} and {}"
+        for left, right in pairs:
+            print(distance_format.format(str(a.distance(left, right)), left, right))
+
+        for left, right in pairs:
+            print(similarity_format.format(str(a.similarity(left, right)), left, right))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 33 - 0
similarity/levenshtein.py

@@ -0,0 +1,33 @@
+from .string_distance import MetricStringDistance
+
+
+class Levenshtein(MetricStringDistance):
+    """Levenshtein edit distance (insertions, deletions, substitutions)."""
+
+    def distance(self, s0, s1):
+        """Return the number of single-character edits turning ``s0`` into ``s1``.
+
+        Equal inputs return 0.0; every other result is an int.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 0.0
+        if len(s0) == 0:
+            return len(s1)
+        if len(s1) == 0:
+            # Everything in s0 has to be deleted (the original returned len(s1), i.e. 0).
+            return len(s0)
+
+        # Two-row dynamic programme: `previous` is row i, `current` is row i + 1.
+        previous = list(range(len(s1) + 1))
+        current = [0] * (len(s1) + 1)
+
+        for i, c0 in enumerate(s0):
+            current[0] = i + 1
+            for j, c1 in enumerate(s1):
+                cost = 0 if c0 == c1 else 1
+                current[j + 1] = min(current[j] + 1, previous[j + 1] + 1, previous[j] + cost)
+            previous, current = current, previous
+
+        return previous[len(s1)]

+ 24 - 0
similarity/levenshtein_test.py

@@ -0,0 +1,24 @@
+import unittest
+
+from .levenshtein import Levenshtein
+
+
+class TestLevenshtein(unittest.TestCase):
+    """Smoke test that prints Levenshtein distances for a few sample strings."""
+
+    def test_levenshtein(self):
+        """Print the distance for every sample pair; nothing is asserted."""
+        a = Levenshtein()
+        samples = ["", "", "上海", "上海市"]
+        distance_format = "distance: {:.4}\t between {} and {}"
+        for index, left in enumerate(samples):
+            for right in samples[index + 1:]:
+                print(distance_format.format(str(a.distance(left, right)), left, right))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 32 - 0
similarity/longest_common_subsequence.py

@@ -0,0 +1,32 @@
+import numpy as np
+
+from .string_distance import StringDistance
+
+
+class LongestCommonSubsequence(StringDistance):
+    """Distance derived from the length of the longest common subsequence."""
+
+    def distance(self, s0, s1):
+        """Return ``len(s0) + len(s1) - 2 * length(s0, s1)``; 0.0 for equal inputs.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 0.0
+        common = self.length(s0, s1)
+        return len(s0) + len(s1) - 2 * common
+
+    @staticmethod
+    def length(s0, s1):
+        """Return the longest-common-subsequence length of ``s0`` and ``s1``.
+
+        The value is read from a NumPy float matrix.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        # table[i][j] is the LCS length of s0[:i] and s1[:j]; row/column 0 stay zero.
+        table = np.zeros((len(s0) + 1, len(s1) + 1))
+        for i, c0 in enumerate(s0, start=1):
+            for j, c1 in enumerate(s1, start=1):
+                if c0 == c1:
+                    table[i][j] = table[i - 1][j - 1] + 1
+                else:
+                    table[i][j] = max(table[i][j - 1], table[i - 1][j])
+        return table[len(s0)][len(s1)]

+ 24 - 0
similarity/longest_common_subsequence_test.py

@@ -0,0 +1,24 @@
+import unittest
+
+from .longest_common_subsequence import LongestCommonSubsequence
+
+
+class TestLongestCommonSubsequence(unittest.TestCase):
+    """Smoke test that prints LCS distances for a few sample strings."""
+
+    def test_longest_common_subsequence(self):
+        """Print the distance for every sample pair; nothing is asserted."""
+        a = LongestCommonSubsequence()
+        samples = ["", "", "上海", "上海市"]
+        distance_format = "distance: {:.4}\t between {} and {}"
+        for index, left in enumerate(samples):
+            for right in samples[index + 1:]:
+                print(distance_format.format(str(a.distance(left, right)), left, right))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 20 - 0
similarity/metric_lcs.py

@@ -0,0 +1,20 @@
+from .string_distance import MetricStringDistance, NormalizedStringDistance
+from .longest_common_subsequence import LongestCommonSubsequence
+
+
+class MetricLCS(MetricStringDistance, NormalizedStringDistance):
+    """Distance ``1 - LCS(s0, s1) / max(len(s0), len(s1))``."""
+
+    def __init__(self):
+        """Create the helper used to compute the common-subsequence length."""
+        self.lcs = LongestCommonSubsequence()
+
+    def distance(self, s0, s1):
+        """Return the LCS-based distance; 0.0 for equal or both-empty inputs.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 0.0
+        longest = int(max(len(s0), len(s1)))
+        if longest == 0:
+            return 0.0
+        common = 1.0 * self.lcs.length(s0, s1)
+        return 1.0 - common / longest

+ 24 - 0
similarity/metric_lcs_test.py

@@ -0,0 +1,24 @@
+import unittest
+
+from .metric_lcs import MetricLCS
+
+
+class TestMetricLCS(unittest.TestCase):
+    """Smoke test that prints MetricLCS distances for a few sample strings."""
+
+    def test_metric_lcs(self):
+        """Print the distance for every sample pair; nothing is asserted."""
+        a = MetricLCS()
+        samples = ["", "", "上海", "上海市"]
+        distance_format = "distance: {:.4}\t between {} and {}"
+        for index, left in enumerate(samples):
+            for right in samples[index + 1:]:
+                print(distance_format.format(str(a.distance(left, right)), left, right))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 67 - 0
similarity/ngram.py

@@ -0,0 +1,67 @@
+from .string_distance import NormalizedStringDistance
+
+
+class NGram(NormalizedStringDistance):
+    """Normalized n-gram edit distance between two strings.
+
+    Both strings are conceptually prefixed with ``n - 1`` padding characters
+    (``'\n'``) so that leading characters take part in as many n-grams as the
+    others. The result is divided by the longer string's length.
+    """
+
+    def __init__(self, n=2):
+        """Create the measure comparing n-grams of length ``n``."""
+        self.n = n
+
+    def distance(self, s0, s1):
+        """Return the normalized n-gram distance between ``s0`` and ``s1``.
+
+        Equal inputs return 0.0; if exactly one input is empty the result is 1.0.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 0.0
+
+        special = '\n'
+        n = self.n
+        sl = len(s0)
+        tl = len(s1)
+
+        if sl == 0 or tl == 0:
+            return 1.0
+
+        if sl < n or tl < n:
+            # Too short for n-grams: compare position by position. The share
+            # of matching positions is a similarity, so it is subtracted from
+            # 1 (the original returned it directly, inverting the result).
+            matching = sum(1 for c0, c1 in zip(s0, s1) if c0 == c1)
+            return 1.0 - 1.0 * matching / max(sl, tl)
+
+        # s0 with its n - 1 padding characters in front.
+        sa = [special] * (n - 1) + list(s0)
+
+        p = [1.0 * i for i in range(sl + 1)]  # previous column of costs
+        d = [0.0] * (sl + 1)  # column being filled
+
+        for j in range(1, tl + 1):
+            # t_j is the n-gram of (padded) s1 that ends at position j.
+            if j < n:
+                t_j = [special] * (n - j) + list(s1[:j])
+            else:
+                t_j = s1[j - n:j]
+
+            d[0] = 1.0 * j
+            # Start at 1: index 0 is the boundary set just above. Starting at
+            # 0 (as the original did) read sa[-1]/d[-1] and overwrote d[0].
+            for i in range(1, sl + 1):
+                cost = 0
+                tn = n
+                for ni in range(n):
+                    if sa[i - 1 + ni] != t_j[ni]:
+                        cost += 1
+                    elif sa[i - 1 + ni] == special:
+                        # Matching padding does not count towards the n-gram size.
+                        tn -= 1
+                ec = cost / tn
+                d[i] = min(d[i - 1] + 1, p[i] + 1, p[i - 1] + ec)
+            p, d = d, p
+
+        return p[sl] / max(tl, sl)

+ 24 - 0
similarity/ngram_test.py

@@ -0,0 +1,24 @@
+import unittest
+
+from .ngram import NGram
+
+
+class TestNGram(unittest.TestCase):
+    """Smoke test that prints NGram distances for a few sample strings."""
+
+    def test_ngram(self):
+        """Print the distance for every sample pair; nothing is asserted."""
+        a = NGram(2)
+        samples = ["", "", "上海", "上海市"]
+        distance_format = "distance: {:.4}\t between {} and {}"
+        for index, left in enumerate(samples):
+            for right in samples[index + 1:]:
+                print(distance_format.format(str(a.distance(left, right)), left, right))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 24 - 0
similarity/normalized_levenshtein.py

@@ -0,0 +1,24 @@
+from .string_distance import NormalizedStringDistance
+from .string_similarity import NormalizedStringSimilarity
+from .levenshtein import Levenshtein
+
+
+class NormalizedLevenshtein(NormalizedStringDistance, NormalizedStringSimilarity):
+    """Levenshtein distance divided by the length of the longer string."""
+
+    def __init__(self):
+        """Create the underlying Levenshtein calculator."""
+        self.levenshtein = Levenshtein()
+
+    def distance(self, s0, s1):
+        """Return ``levenshtein(s0, s1) / max(len(s0), len(s1))``.
+
+        Equal or both-empty inputs return 0.0.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 0.0
+        longest = max(len(s0), len(s1))
+        if not longest:
+            return 0.0
+        edits = self.levenshtein.distance(s0, s1)
+        return edits / longest
+
+    def similarity(self, s0, s1):
+        """Return ``1.0 - distance(s0, s1)``."""
+        return 1.0 - self.distance(s0, s1)

+ 32 - 0
similarity/normalized_levenshtein_test.py

@@ -0,0 +1,32 @@
+import unittest
+
+from .normalized_levenshtein import NormalizedLevenshtein
+
+
+class TestNormalizedLevenshtein(unittest.TestCase):
+    """Smoke test that prints NormalizedLevenshtein results for sample strings."""
+
+    def test_normalized_levenshtein(self):
+        """Print distance, then similarity, for every sample pair.
+
+        Nothing is asserted; each value is stringified and cut to four
+        characters by the ``{:.4}`` format.
+        """
+        a = NormalizedLevenshtein()
+        samples = ["", "", "上海", "上海市"]
+        pairs = [(left, right)
+                 for index, left in enumerate(samples)
+                 for right in samples[index + 1:]]
+        distance_format = "distance: {:.4}\t between {} and {}"
+        similarity_format = "similarity: {:.4}\t between {} and {}"
+        for left, right in pairs:
+            print(distance_format.format(str(a.distance(left, right)), left, right))
+
+        for left, right in pairs:
+            print(similarity_format.format(str(a.similarity(left, right)), left, right))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 38 - 0
similarity/optimal_string_alignment.py

@@ -0,0 +1,38 @@
+import numpy as np
+
+from .string_distance import StringDistance
+
+
+class OptimalStringAlignment(StringDistance):
+    """Optimal string alignment distance (edits plus adjacent transpositions)."""
+
+    def distance(self, s0, s1):
+        """Return the optimal string alignment distance of ``s0`` and ``s1``.
+
+        Equal inputs return 0.0. If one input is empty the result is the
+        length of the other one, as a float.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 0.0
+
+        n, m = len(s0), len(s1)
+        # With one side empty, every character of the other side is an edit
+        # (the original returned the empty side's own length, i.e. 0).
+        if n == 0:
+            return 1.0 * m
+        if m == 0:
+            return 1.0 * n
+
+        d = np.zeros((n + 2, m + 2))
+        for i in range(n + 1):
+            d[i][0] = i
+        for j in range(m + 1):
+            d[0][j] = j
+
+        for i in range(1, n + 1):
+            for j in range(1, m + 1):
+                cost = 1
+                if s0[i - 1] == s1[j - 1]:
+                    cost = 0
+                d[i][j] = min(d[i - 1][j - 1] + cost, d[i][j - 1] + 1, d[i - 1][j] + 1)
+
+                # The last two characters are swapped: allow a transposition.
+                if i > 1 and j > 1 and s0[i - 1] == s1[j - 2] and s0[i - 2] == s1[j - 1]:
+                    d[i][j] = min(d[i][j], d[i - 2][j - 2] + cost)
+
+        return d[n][m]

+ 24 - 0
similarity/optimal_string_alignment_test.py

@@ -0,0 +1,24 @@
+import unittest
+
+from .optimal_string_alignment import OptimalStringAlignment
+
+
+class TestOptimalStringAlignment(unittest.TestCase):
+    """Smoke test that prints OptimalStringAlignment distances for samples."""
+
+    def test_optimal_string_alignment(self):
+        """Print the distance for every sample pair; nothing is asserted."""
+        a = OptimalStringAlignment()
+        samples = ["", "", "上海", "上海市"]
+        distance_format = "distance: {:.4}\t between {} and {}"
+        for index, left in enumerate(samples):
+            for right in samples[index + 1:]:
+                print(distance_format.format(str(a.distance(left, right)), left, right))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 37 - 0
similarity/qgram.py

@@ -0,0 +1,37 @@
+from .shingle_based import ShingleBased
+from .string_distance import StringDistance
+
+
+class QGram(ShingleBased, StringDistance):
+    """Q-gram distance: summed absolute differences of shingle counts."""
+
+    def __init__(self, k=3):
+        """Create the measure using shingles of length ``k``."""
+        super().__init__(k)
+
+    def distance(self, s0, s1):
+        """Return the q-gram distance of ``s0`` and ``s1``; 0.0 for equal inputs.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 0.0
+
+        return self.distance_profile(self.get_profile(s0), self.get_profile(s1))
+
+    @staticmethod
+    def distance_profile(profile0, profile1):
+        """Return the sum over all shingles of ``|count0 - count1|``."""
+
+        def occurrences(profile, shingle):
+            # A shingle missing from a profile counts as zero occurrences.
+            value = profile.get(shingle)
+            return 0 if value is None else int(value)
+
+        shingles = set(profile0.keys())
+        shingles.update(profile1.keys())
+        return sum(abs(occurrences(profile0, shingle) - occurrences(profile1, shingle))
+                   for shingle in shingles)

+ 24 - 0
similarity/qgram_test.py

@@ -0,0 +1,24 @@
+import unittest
+
+from .qgram import QGram
+
+
+class TestQGram(unittest.TestCase):
+    """Smoke test that prints QGram distances for a few sample strings."""
+
+    def test_qgram(self):
+        """Print the distance for every sample pair; nothing is asserted."""
+        a = QGram(1)
+        samples = ["", "", "上海", "上海市"]
+        distance_format = "distance: {:.4}\t between {} and {}"
+        for index, left in enumerate(samples):
+            for right in samples[index + 1:]:
+                print(distance_format.format(str(a.distance(left, right)), left, right))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 24 - 0
similarity/shingle_based.py

@@ -0,0 +1,24 @@
+import re
+
+_SPACE_PATTERN = re.compile("\\s+")
+
+
+class ShingleBased:
+    """Base class for measures that compare strings through their k-shingles."""
+
+    def __init__(self, k=3):
+        """Store the shingle length ``k``."""
+        self.k = k
+
+    def get_k(self):
+        """Return the shingle length."""
+        return self.k
+
+    def get_profile(self, string):
+        """Return a dict mapping each k-shingle of ``string`` to its count.
+
+        Whitespace is removed from ``string`` before the shingles are cut.
+        """
+        compact = _SPACE_PATTERN.sub("", string)
+        profile = {}
+        for start in range(len(compact) - self.k + 1):
+            shingle = str(compact[start:start + self.k])
+            seen = profile.get(shingle)
+            profile[shingle] = int(seen + 1) if seen else 1
+        return profile

+ 75 - 0
similarity/similarity.py

@@ -0,0 +1,75 @@
+from enum import IntEnum
+from .cosine import Cosine
+from .damerau import Damerau
+from .jaccard import Jaccard
+from .jarowinkler import JaroWinkler
+from .levenshtein import Levenshtein
+from .longest_common_subsequence import LongestCommonSubsequence
+from .metric_lcs import MetricLCS
+from .ngram import NGram
+from .normalized_levenshtein import NormalizedLevenshtein
+from .optimal_string_alignment import OptimalStringAlignment
+from .qgram import QGram
+from .sorensen_dice import SorensenDice
+from .weighted_levenshtein import WeightedLevenshtein
+
+
+class Algorithm(IntEnum):
+    """Identifiers of the string-similarity algorithms handled by ``Factory``."""
+    COSINE = 1
+    DAMERAU = 2
+    JACCARD = 3
+    JARO_WINKLE = 4
+    # Correctly spelled alias. JARO_WINKLE stays the canonical member, so
+    # existing callers, iteration order and Algorithm(4).name are unchanged.
+    JARO_WINKLER = 4
+    LEVENSHTEIN = 5
+    LCS = 6
+    METRIC_LCS = 7
+    N_GRAM = 8
+    NORMALIZED_LEVENSHTEIN = 9
+    OPTIMAL_STRING_ALIGNMENT = 10
+    Q_GRAM = 11
+    SORENSEN_DICE = 12
+    WEIGHTED_LEVENSHTEIN = 13
+
+
+class Factory:
+    """Creates algorithm instances from ``Algorithm`` identifiers."""
+
+    @staticmethod
+    def get_algorithm(algorithm: Algorithm, k=3):
+        """Return a new instance of the requested algorithm.
+
+        Args:
+            algorithm: the ``Algorithm`` member to build.
+            k: shingle length passed to the shingle-based algorithms
+                (COSINE, JACCARD, Q_GRAM, SORENSEN_DICE). The other
+                algorithms are built with their own defaults.
+
+        Returns:
+            The algorithm instance. Unrecognized identifiers fall back to
+            ``Cosine(k)``.
+
+        Raises:
+            TypeError: for WEIGHTED_LEVENSHTEIN, which needs cost objects;
+                use ``get_weighted_levenshtein`` instead.
+        """
+        if algorithm == Algorithm.COSINE:
+            return Cosine(k)
+        if algorithm == Algorithm.DAMERAU:
+            return Damerau()
+        if algorithm == Algorithm.JACCARD:
+            return Jaccard(k)
+        if algorithm == Algorithm.JARO_WINKLE:
+            return JaroWinkler()
+        if algorithm == Algorithm.LEVENSHTEIN:
+            return Levenshtein()
+        if algorithm == Algorithm.LCS:
+            return LongestCommonSubsequence()
+        if algorithm == Algorithm.METRIC_LCS:
+            return MetricLCS()
+        if algorithm == Algorithm.N_GRAM:
+            return NGram()
+        if algorithm == Algorithm.NORMALIZED_LEVENSHTEIN:
+            return NormalizedLevenshtein()
+        if algorithm == Algorithm.OPTIMAL_STRING_ALIGNMENT:
+            return OptimalStringAlignment()
+        if algorithm == Algorithm.Q_GRAM:
+            # QGram is shingle-based like Cosine/Jaccard/SorensenDice, so it
+            # receives k too (the original silently ignored it here).
+            return QGram(k)
+        if algorithm == Algorithm.SORENSEN_DICE:
+            return SorensenDice(k)
+        if algorithm == Algorithm.WEIGHTED_LEVENSHTEIN:
+            raise TypeError("This method does not support create weighted_levenshtein algorithm.")
+        return Cosine(k)
+
+    @staticmethod
+    def get_weighted_levenshtein(char_sub, char_change):
+        """Return ``WeightedLevenshtein(char_sub, char_change)``.
+
+        ``char_sub`` is passed as the substitution-cost object and
+        ``char_change`` as the insertion/deletion-cost object.
+        """
+        return WeightedLevenshtein(char_sub, char_change)
+
+
+if __name__ == "__main__":
+    # Demo: print the Levenshtein distance between two sample strings.
+    a = Factory().get_algorithm(Algorithm.LEVENSHTEIN)
+    s0, s1 = "你好", "你好啊"
+    distance_format = "distance: {:.4} between {} and {}"
+    measured = str(a.distance(s0, s1))
+    print(distance_format.format(measured, s0, s1))

+ 31 - 0
similarity/sorensen_dice.py

@@ -0,0 +1,31 @@
+from .shingle_based import ShingleBased
+from .string_distance import NormalizedStringDistance
+from .string_similarity import NormalizedStringSimilarity
+
+
+class SorensenDice(ShingleBased, NormalizedStringDistance, NormalizedStringSimilarity):
+    """Sorensen-Dice coefficient over the sets of k-shingles of two strings."""
+
+    def __init__(self, k=3):
+        """Create the measure using shingles of length ``k``."""
+        super().__init__(k)
+
+    def distance(self, s0, s1):
+        """Return ``1.0 - similarity(s0, s1)``."""
+        return 1.0 - self.similarity(s0, s1)
+
+    def similarity(self, s0, s1):
+        """Return ``2 * |A & B| / (|A| + |B|)`` for the shingle sets of the inputs.
+
+        Equal strings score 1.0. If neither string yields any shingle the
+        result is 0.0.
+
+        Raises:
+            TypeError: if ``s0`` or ``s1`` is None.
+        """
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 1.0
+        profile0, profile1 = self.get_profile(s0), self.get_profile(s1)
+        total = len(profile0) + len(profile1)
+        if total == 0:
+            # Both strings are too short to produce a shingle; the original
+            # divided by zero here.
+            return 0.0
+        shared = sum(1 for shingle in profile0 if shingle in profile1)
+        return 2.0 * shared / total

+ 32 - 0
similarity/sorensen_dice_test.py

@@ -0,0 +1,32 @@
+import unittest
+
+from .sorensen_dice import SorensenDice
+
+
+class TestSorensenDice(unittest.TestCase):
+    """Smoke test that prints SorensenDice results for one sample pair."""
+
+    def test_sorensen_dice(self):
+        """Print the distance, then the similarity; nothing is asserted."""
+        a = SorensenDice(2)
+        # Only the non-empty samples are exercised; the pairs that involve
+        # the empty strings are disabled in this test.
+        pairs = [("上海", "上海市")]
+        distance_format = "distance: {:.4}\t between {} and {}"
+        similarity_format = "similarity: {:.4}\t between {} and {}"
+        for left, right in pairs:
+            print(distance_format.format(str(a.distance(left, right)), left, right))
+
+        for left, right in pairs:
+            print(similarity_format.format(str(a.similarity(left, right)), left, right))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 16 - 0
similarity/string_distance.py

@@ -0,0 +1,16 @@
+class StringDistance:
+
+    def distance(self, s0, s1):
+        raise NotImplementedError()
+
+
+class NormalizedStringDistance(StringDistance):
+
+    def distance(self, s0, s1):
+        raise NotImplementedError()
+
+
+class MetricStringDistance(StringDistance):
+
+    def distance(self, s0, s1):
+        raise NotImplementedError()

+ 10 - 0
similarity/string_similarity.py

@@ -0,0 +1,10 @@
+class StringSimilarity:
+
+    def similarity(self, s0, s1):
+        raise NotImplementedError()
+
+
+class NormalizedStringSimilarity(StringSimilarity):
+
+    def similarity(self, s0, s1):
+        raise NotImplementedError()

+ 69 - 0
similarity/weighted_levenshtein.py

@@ -0,0 +1,69 @@
+from .string_distance import StringDistance
+
+
+class CharacterInsDelInterface:
+
+    def deletion_cost(self, c):
+        raise NotImplementedError()
+
+    def insertion_cost(self, c):
+        raise NotImplementedError()
+
+
+class CharacterSubstitutionInterface:
+
+    def cost(self, c0, c1):
+        raise NotImplementedError()
+
+
+class WeightedLevenshtein(StringDistance):
+
+    def __init__(self, character_substitution, character_ins_del=None):
+        self.character_ins_del = character_ins_del
+        if character_substitution is None:
+            raise TypeError("Argument character_substitution is NoneType.")
+        self.character_substitution = character_substitution
+
+    def distance(self, s0, s1):
+        if s0 is None:
+            raise TypeError("Argument s0 is NoneType.")
+        if s1 is None:
+            raise TypeError("Argument s1 is NoneType.")
+        if s0 == s1:
+            return 0.0
+        if len(s0) == 0:
+            return len(s1)
+        if len(s1) == 0:
+            return len(s0)
+
+        v0, v1 = [0.0] * (len(s1) + 1), [0.0] * (len(s1) + 1)
+
+        v0[0] = 0
+        for i in range(1, len(v0)):
+            v0[i] = v0[i - 1] + self._insertion_cost(s1[i - 1])
+
+        for i in range(len(s0)):
+            s1i = s0[i]
+            deletion_cost = self._deletion_cost(s1i)
+            v1[0] = v0[0] + deletion_cost
+
+            for j in range(len(s1)):
+                s2j = s1[j]
+                cost = 0
+                if s1i != s2j:
+                    cost = self.character_substitution.cost(s1i, s2j)
+                insertion_cost = self._insertion_cost(s2j)
+                v1[j + 1] = min(v1[j] + insertion_cost, v0[j + 1] + deletion_cost, v0[j] + cost)
+            v0, v1 = v1, v0
+
+        return v0[len(s1)]
+
+    def _insertion_cost(self, c):
+        if self.character_ins_del is None:
+            return 1.0
+        return self.character_ins_del.insertion_cost(c)
+
+    def _deletion_cost(self, c):
+        if self.character_ins_del is None:
+            return 1.0
+        return self.character_ins_del.deletion_cost(c)

+ 30 - 0
similarity/weighted_levenshtein_test.py

@@ -0,0 +1,30 @@
+import unittest
+
+from .weighted_levenshtein import WeightedLevenshtein, CharacterSubstitutionInterface
+
+
+class CharSub(CharacterSubstitutionInterface):
+
+    def cost(self, c0, c1):
+        return 1.0
+
+
+class TestWeightedLevenshtein(unittest.TestCase):
+
+    def test_weighted_levenshtein(self):
+        a = WeightedLevenshtein(character_substitution=CharSub())
+        s0 = ""
+        s1 = ""
+        s2 = "上海"
+        s3 = "上海市"
+        distance_format = "distance: {:.4}\t between {} and {}"
+        print(distance_format.format(str(a.distance(s0, s1)), s0, s1))
+        print(distance_format.format(str(a.distance(s0, s2)), s0, s2))
+        print(distance_format.format(str(a.distance(s0, s3)), s0, s3))
+        print(distance_format.format(str(a.distance(s1, s2)), s1, s2))
+        print(distance_format.format(str(a.distance(s1, s3)), s1, s3))
+        print(distance_format.format(str(a.distance(s2, s3)), s2, s3))
+
+
+# Entry point: hand control to unittest's runner when executed as a script.
+# NOTE(review): this module uses a relative import, so it presumably has to be
+# started inside its package (e.g. python -m similarity.weighted_levenshtein_test).
+if __name__ == "__main__":
+    unittest.main()