Explorar o código

Fixed #1 ZeroDivisionError

luozhouyang hai 6 anos
pai
achega
680c7d99c2
Modificáronse 3 ficheiros con 13 adicións e 41 borrados
  1. 6 20
      similarity/cosine_test.py
  2. 6 20
      similarity/jaccard_test.py
  3. 1 1
      similarity/shingle_based.py

+ 6 - 20
similarity/cosine_test.py

@@ -26,26 +26,12 @@ from .cosine import Cosine
 class TestCosine(unittest.TestCase):
 
     def test_cosine(self):
-        a = Cosine(1)
-        s0 = ""
-        s1 = ""
-        s2 = "上海"
-        s3 = "上海市"
-        distance_format = "distance: {:.4}\t between {} and {}"
-        similarity_format = "similarity: {:.4}\t between {} and {}"
-        print(distance_format.format(str(a.distance(s0, s1)), s0, s1))
-        print(distance_format.format(str(a.distance(s0, s2)), s0, s2))
-        print(distance_format.format(str(a.distance(s0, s3)), s0, s3))
-        print(distance_format.format(str(a.distance(s1, s2)), s1, s2))
-        print(distance_format.format(str(a.distance(s1, s3)), s1, s3))
-        print(distance_format.format(str(a.distance(s2, s3)), s2, s3))
-
-        print(similarity_format.format(str(a.similarity(s0, s1)), s0, s1))
-        print(similarity_format.format(str(a.similarity(s0, s2)), s0, s2))
-        print(similarity_format.format(str(a.similarity(s0, s3)), s0, s3))
-        print(similarity_format.format(str(a.similarity(s1, s2)), s1, s2))
-        print(similarity_format.format(str(a.similarity(s1, s3)), s1, s3))
-        print(similarity_format.format(str(a.similarity(s2, s3)), s2, s3))
+        cos = Cosine(1)
+        s = ['', ' ', 'Shanghai', 'ShangHai', 'Shang Hai']
+        for i in range(len(s)):
+            for j in range(i, len(s)):
+                print('dis between \'%s\' and \'%s\': %.4f' % (s[i], s[j], cos.distance(s[i], s[j])))
+                print('sim between \'%s\' and \'%s\': %.4f' % (s[i], s[j], cos.similarity(s[i], s[j])))
 
 
 if __name__ == "__main__":

+ 6 - 20
similarity/jaccard_test.py

@@ -26,26 +26,12 @@ from .jaccard import Jaccard
 class TestJaccard(unittest.TestCase):
 
     def test_jaccard(self):
-        a = Jaccard(1)
-        s0 = ""
-        s1 = ""
-        s2 = "上海"
-        s3 = "上海市"
-        distance_format = "distance: {:.4}\t between {} and {}"
-        similarity_format = "similarity: {:.4}\t between {} and {}"
-        print(distance_format.format(str(a.distance(s0, s1)), s0, s1))
-        print(distance_format.format(str(a.distance(s0, s2)), s0, s2))
-        print(distance_format.format(str(a.distance(s0, s3)), s0, s3))
-        print(distance_format.format(str(a.distance(s1, s2)), s1, s2))
-        print(distance_format.format(str(a.distance(s1, s3)), s1, s3))
-        print(distance_format.format(str(a.distance(s2, s3)), s2, s3))
-
-        print(similarity_format.format(str(a.similarity(s0, s1)), s0, s1))
-        print(similarity_format.format(str(a.similarity(s0, s2)), s0, s2))
-        print(similarity_format.format(str(a.similarity(s0, s3)), s0, s3))
-        print(similarity_format.format(str(a.similarity(s1, s2)), s1, s2))
-        print(similarity_format.format(str(a.similarity(s1, s3)), s1, s3))
-        print(similarity_format.format(str(a.similarity(s2, s3)), s2, s3))
+        jaccard = Jaccard(1)
+        s = ['', ' ', 'Shanghai', 'ShangHai', 'Shang Hai']
+        for i in range(len(s)):
+            for j in range(i, len(s)):
+                print('dis between \'%s\' and \'%s\': %.4f' % (s[i], s[j], jaccard.distance(s[i], s[j])))
+                print('sim between \'%s\' and \'%s\': %.4f' % (s[i], s[j], jaccard.similarity(s[i], s[j])))
 
 
 if __name__ == "__main__":

+ 1 - 1
similarity/shingle_based.py

@@ -33,7 +33,7 @@ class ShingleBased:
 
     def get_profile(self, string):
         shingles = dict()
-        no_space_str = _SPACE_PATTERN.sub("", string)
+        no_space_str = _SPACE_PATTERN.sub(" ", string)
         for i in range(len(no_space_str) - self.k + 1):
             shingle = no_space_str[i:i + self.k]
             old = shingles.get(shingle)