Browse Source

Merge pull request #3 from pmalgorzata/zero-division-solution

added regex prior to profiles calculation
luozhouyang 6 năm trước
mục cha
commit
c6340456ae
2 tập tin đã thay đổi với 13 bổ sung và 0 xóa
  1. 7 0
      similarity/cosine.py
  2. 6 0
      similarity/jaccard.py

+ 7 - 0
similarity/cosine.py

@@ -24,6 +24,9 @@ from .shingle_based import ShingleBased
 from .string_distance import NormalizedStringDistance
 from .string_similarity import NormalizedStringSimilarity
 
+import re
+_SPACE_PATTERN = re.compile("\\s+")
+
 
 class Cosine(ShingleBased, NormalizedStringDistance,
              NormalizedStringSimilarity):
@@ -41,6 +44,10 @@ class Cosine(ShingleBased, NormalizedStringDistance,
             raise TypeError("Argument s1 is NoneType.")
         if s0 == s1:
             return 1.0
+            
+        s0 = _SPACE_PATTERN.sub("", s0)
+        s1 = _SPACE_PATTERN.sub("", s1)
+        
         if len(s0) < self.get_k() or len(s1) < self.get_k():
             return 0.0
         profile0 = self.get_profile(s0)

+ 6 - 0
similarity/jaccard.py

@@ -22,6 +22,8 @@ from .shingle_based import ShingleBased
 from .string_distance import NormalizedStringDistance, MetricStringDistance
 from .string_similarity import NormalizedStringSimilarity
 
+import re
+_SPACE_PATTERN = re.compile("\\s+")
 
 class Jaccard(ShingleBased, MetricStringDistance, NormalizedStringDistance, NormalizedStringSimilarity):
 
@@ -38,6 +40,10 @@ class Jaccard(ShingleBased, MetricStringDistance, NormalizedStringDistance, Norm
             raise TypeError("Argument s1 is NoneType.")
         if s0 == s1:
             return 1.0
+            
+        s0 = _SPACE_PATTERN.sub("", s0)
+        s1 = _SPACE_PATTERN.sub("", s1)
+        
         if len(s0) < self.get_k() or len(s1) < self.get_k():
             return 0.0
         profile0 = self.get_profile(s0)