import re

# Matches any run of whitespace; it is stripped out before shingling.
_SPACE_PATTERN = re.compile(r"\s+")


class ShingleBased:
    """Build k-shingle (character n-gram) profiles of strings."""

    def __init__(self, k=3):
        """Store the shingle length.

        Args:
            k: Number of characters per shingle. Must be positive.

        Raises:
            ValueError: If ``k`` is zero or negative. Such values would make
                ``get_profile`` count empty or arbitrarily sliced fragments
                instead of fixed-length shingles.
        """
        if k <= 0:
            raise ValueError("k should be positive, got {!r}".format(k))
        self.k = k

    def get_k(self):
        """Return the configured shingle length."""
        return self.k

    def get_profile(self, string):
        """Count every k-character shingle occurring in ``string``.

        All whitespace is removed first, so shingles may span characters that
        were separated by whitespace in the input.

        Args:
            string: Text to profile.

        Returns:
            A dict mapping each shingle to its number of occurrences. The
            dict is empty when the whitespace-free text is shorter than ``k``.
        """
        no_space_str = _SPACE_PATTERN.sub("", string)
        k = self.k
        shingles = {}
        # One window per start index; the range is empty when the text is
        # shorter than k.
        for start in range(len(no_space_str) - k + 1):
            shingle = no_space_str[start:start + k]
            shingles[shingle] = shingles.get(shingle, 0) + 1
        return shingles