ngram.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. from .string_distance import NormalizedStringDistance
  2. class NGram(NormalizedStringDistance):
  3. def __init__(self, n=2):
  4. self.n = n
  5. def distance(self, s0, s1):
  6. if s0 is None:
  7. raise TypeError("Argument s0 is NoneType.")
  8. if s1 is None:
  9. raise TypeError("Argument s1 is NoneType.")
  10. if s0 == s1:
  11. return 0.0
  12. special = '\n'
  13. sl = len(s0)
  14. tl = len(s1)
  15. if sl == 0 or tl == 0:
  16. return 1.0
  17. cost = 0
  18. if sl < self.n or tl < self.n:
  19. for i in range(min(sl, tl)):
  20. if s0[i] == s1[i]:
  21. cost += 1
  22. return 1.0 * cost / max(sl, tl)
  23. sa = [''] * (sl + self.n - 1)
  24. for i in range(len(sa)):
  25. if i < self.n - 1:
  26. sa[i] = special
  27. else:
  28. sa[i] = s0[i - self.n + 1]
  29. p = [0.0] * (sl + 1)
  30. d = [0.0] * (sl + 1)
  31. t_j = [''] * self.n
  32. for i in range(sl + 1):
  33. p[i] = 1.0 * i
  34. for j in range(1, tl + 1):
  35. if j < self.n:
  36. for ti in range(self.n - j):
  37. t_j[ti] = special
  38. for ti in range(self.n - j, self.n):
  39. t_j[ti] = s1[ti - (self.n - j)]
  40. else:
  41. t_j = s1[j - self.n:j]
  42. d[0] = 1.0 * j
  43. for i in range(sl + 1):
  44. cost = 0
  45. tn = self.n
  46. for ni in range(self.n):
  47. if sa[i - 1 + ni] != t_j[ni]:
  48. cost += 1
  49. elif sa[i - 1 + ni] == special:
  50. tn -= 1
  51. ec = cost / tn
  52. d[i] = min(d[i - 1] + 1, p[i] + 1, p[i - 1] + ec)
  53. p, d = d, p
  54. return p[sl] / max(tl, sl)