shingle_based.py 582 B

12345678910111213141516171819202122232425
import re
from collections import Counter
  2. _SPACE_PATTERN = re.compile("\\s+")
  3. class ShingleBased:
  4. def __init__(self, k=3):
  5. self.k = k
  6. def get_k(self):
  7. return self.k
  8. def get_profile(self, string):
  9. shingles = dict()
  10. no_space_str = _SPACE_PATTERN.sub("", string)
  11. for i in range(len(no_space_str) - self.k + 1):
  12. shingle = no_space_str[i:i + self.k]
  13. old = shingles.get(shingle)
  14. if old:
  15. shingles[str(shingle)] = int(old + 1)
  16. else:
  17. shingles[str(shingle)] = 1
  18. return shingles