# post_process.py

import os
import sys

import imagehash
from PIL import Image
  4. def find_similar_images(base_dir, hash_size=8):
  5. snapshots_files = sorted(os.listdir(base_dir))
  6. hash_dict = {}
  7. duplicates = []
  8. num_duplicates = 0
  9. print('---'*5,"Finding similar files",'---'*5)
  10. for file in snapshots_files:
  11. read_file = Image.open(os.path.join(base_dir, file))
  12. comp_hash = str(imagehash.dhash(read_file, hash_size=hash_size))
  13. if comp_hash not in hash_dict:
  14. hash_dict[comp_hash] = file
  15. else:
  16. print('Duplicate file: ', file)
  17. duplicates.append(file)
  18. num_duplicates+=1
  19. print('\nTotal duplicate files:', num_duplicates)
  20. print("-----"*10)
  21. return hash_dict, duplicates
  22. def remove_duplicates(base_dir):
  23. _, duplicates = find_similar_images(base_dir, hash_size=12)
  24. if not len(duplicates):
  25. print('No duplicates found!')
  26. else:
  27. print("Removing duplicates...")
  28. for dup_file in duplicates:
  29. file_path = os.path.join(base_dir, dup_file)
  30. if os.path.exists(file_path):
  31. os.remove(file_path)
  32. else:
  33. print('Filepath: ', file_path, 'does not exists.')
  34. print('All duplicates removed!')
  35. print('***'*10,'\n')
  36. if __name__ == "__main__":
  37. remove_duplicates('sample_1')