Browse Source

Merge pull request #3 from pmalgorzata/zero-division-solution

Strip whitespace from both input strings with a regex before the length check and profile calculation (zero-division fix)
luozhouyang 6 years ago
parent
commit
c6340456ae
2 changed files with 13 additions and 0 deletions
  1. 7 0
      similarity/cosine.py
  2. 6 0
      similarity/jaccard.py

+ 7 - 0
similarity/cosine.py

@@ -24,6 +24,9 @@ from .shingle_based import ShingleBased
 from .string_distance import NormalizedStringDistance
 from .string_distance import NormalizedStringDistance
 from .string_similarity import NormalizedStringSimilarity
 from .string_similarity import NormalizedStringSimilarity
 
 
+import re
+_SPACE_PATTERN = re.compile("\\s+")
+
 
 
 class Cosine(ShingleBased, NormalizedStringDistance,
 class Cosine(ShingleBased, NormalizedStringDistance,
              NormalizedStringSimilarity):
              NormalizedStringSimilarity):
@@ -41,6 +44,10 @@ class Cosine(ShingleBased, NormalizedStringDistance,
             raise TypeError("Argument s1 is NoneType.")
             raise TypeError("Argument s1 is NoneType.")
         if s0 == s1:
         if s0 == s1:
             return 1.0
             return 1.0
+            
+        s0 = _SPACE_PATTERN.sub("", s0)
+        s1 = _SPACE_PATTERN.sub("", s1)
+        
         if len(s0) < self.get_k() or len(s1) < self.get_k():
         if len(s0) < self.get_k() or len(s1) < self.get_k():
             return 0.0
             return 0.0
         profile0 = self.get_profile(s0)
         profile0 = self.get_profile(s0)

+ 6 - 0
similarity/jaccard.py

@@ -22,6 +22,8 @@ from .shingle_based import ShingleBased
 from .string_distance import NormalizedStringDistance, MetricStringDistance
 from .string_distance import NormalizedStringDistance, MetricStringDistance
 from .string_similarity import NormalizedStringSimilarity
 from .string_similarity import NormalizedStringSimilarity
 
 
+import re
+_SPACE_PATTERN = re.compile("\\s+")
 
 
 class Jaccard(ShingleBased, MetricStringDistance, NormalizedStringDistance, NormalizedStringSimilarity):
 class Jaccard(ShingleBased, MetricStringDistance, NormalizedStringDistance, NormalizedStringSimilarity):
 
 
@@ -38,6 +40,10 @@ class Jaccard(ShingleBased, MetricStringDistance, NormalizedStringDistance, Norm
             raise TypeError("Argument s1 is NoneType.")
             raise TypeError("Argument s1 is NoneType.")
         if s0 == s1:
         if s0 == s1:
             return 1.0
             return 1.0
+            
+        s0 = _SPACE_PATTERN.sub("", s0)
+        s1 = _SPACE_PATTERN.sub("", s1)
+        
         if len(s0) < self.get_k() or len(s1) < self.get_k():
         if len(s0) < self.get_k() or len(s1) < self.get_k():
             return 0.0
             return 0.0
         profile0 = self.get_profile(s0)
         profile0 = self.get_profile(s0)