# generate_doc_set.py (1.5 KB)
import os
import random
import shutil
import sys

import cv2
import numpy as np
  6. np.random.seed(42)
  7. random.seed(42)
  8. DST_IMG_DIR = r"DOCUMENTS\\CHOSEN\\images"
  9. DST_MSK_DIR = r"DOCUMENTS\\CHOSEN\\masks"
  10. os.makedirs(DST_IMG_DIR, exist_ok=True)
  11. os.makedirs(DST_MSK_DIR, exist_ok=True)
  12. datasets = {
  13. r"DOCUMENTS\\datasets\\docvqa_images": 700, # 2573
  14. r"DOCUMENTS\\datasets\\formsE-H_images": 100, # 522
  15. r"DOCUMENTS\\datasets\\kaggle_noisy_images": 125, # 360
  16. r"DOCUMENTS\\datasets\\FUNSD_images": 199, # 199
  17. r"DOCUMENTS\\datasets\\nouvel_images": 125, # 125
  18. r"DOCUMENTS\\datasets\\annotated_640": 94, # 94
  19. }
  20. def copy_and_create_mask(img_paths, crop=True):
  21. for image_path in img_paths:
  22. img_name = os.path.split(image_path)[-1]
  23. image = cv2.imread(image_path)
  24. if crop:
  25. H, W, _ = image.shape
  26. image = image[42 : H - 42, 42 : W - 42, :]
  27. cv2.imwrite(os.path.join(DST_IMG_DIR, img_name), image)
  28. mask = np.ones_like(image) * 255
  29. cv2.imwrite(os.path.join(DST_MSK_DIR, img_name), mask)
  30. return
  31. for folder_path, total_to_take in datasets.items():
  32. print(folder_path)
  33. image_paths = np.asarray([os.path.join(folder_path, i) for i in os.listdir(folder_path)])
  34. chosen_image_paths = np.random.choice(image_paths, size=total_to_take, replace=False)
  35. if folder_path == r"DOCUMENTS\\datasets\\docvqa_images":
  36. crop = True
  37. else:
  38. crop = False
  39. copy_and_create_mask(img_paths=chosen_image_paths, crop=crop)