ngram.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. # Copyright (c) 2018 luozhouyang
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in all
  11. # copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  19. # SOFTWARE.
  20. from .string_distance import NormalizedStringDistance
  21. class NGram(NormalizedStringDistance):
  22. def __init__(self, n=2):
  23. self.n = n
  24. def distance(self, s0, s1):
  25. if s0 is None:
  26. raise TypeError("Argument s0 is NoneType.")
  27. if s1 is None:
  28. raise TypeError("Argument s1 is NoneType.")
  29. if s0 == s1:
  30. return 0.0
  31. special = '\n'
  32. sl = len(s0)
  33. tl = len(s1)
  34. if sl == 0 or tl == 0:
  35. return 1.0
  36. cost = 0
  37. if sl < self.n or tl < self.n:
  38. for i in range(min(sl, tl)):
  39. if s0[i] == s1[i]:
  40. cost += 1
  41. return 1.0 - cost / max(sl, tl)
  42. sa = [''] * (sl + self.n - 1)
  43. for i in range(len(sa)):
  44. if i < self.n - 1:
  45. sa[i] = special
  46. else:
  47. sa[i] = s0[i - self.n + 1]
  48. p = [0.0] * (sl + 1)
  49. d = [0.0] * (sl + 1)
  50. t_j = [''] * self.n
  51. for i in range(sl + 1):
  52. p[i] = 1.0 * i
  53. for j in range(1, tl + 1):
  54. if j < self.n:
  55. for ti in range(self.n - j):
  56. t_j[ti] = special
  57. for ti in range(self.n - j, self.n):
  58. t_j[ti] = s1[ti - (self.n - j)]
  59. else:
  60. t_j = s1[j - self.n:j]
  61. d[0] = 1.0 * j
  62. for i in range(sl + 1):
  63. cost = 0
  64. tn = self.n
  65. for ni in range(self.n):
  66. if sa[i - 1 + ni] != t_j[ni]:
  67. cost += 1
  68. elif sa[i - 1 + ni] == special:
  69. tn -= 1
  70. ec = cost / tn
  71. d[i] = min(d[i - 1] + 1, p[i] + 1, p[i - 1] + ec)
  72. p, d = d, p
  73. return p[sl] / max(tl, sl)