split_grp.py

import os
import sys
import csv
import time

import h5py
import numpy as np
import openslide
import pandas as pd


def main():
    ''' Five command line arguments:
    - $FILE_DIR    location of the WSI .tif files, like /scratch/wxc4/CAMELYON16-testing
    - $FILE        name of the WSI, like test_001.tif
    - $PATCH_SIZE  dimension (pixels) of the sampled patches
    - $SPLIT_SIZE  number of sampled patches stored in one HDF5 sub-group of the output .h5 file
    - $SPLIT_DIR   output location of the split parts/sets, like /projects/mikem/UserSupport/weizhe.li/split_imgs/test_001
    Note: patch_size is the dimension of the sampled patches, NOT equivalent to openslide's
    definition of tile_size. This implementation was chosen to allow for more intuitive usage.
    '''
    # Print the command line arguments and the SGE job environment.
    for arg in sys.argv[1:]:
        print(arg)
    print(os.environ['HOSTNAME'])
    print(os.environ['SGE_TASK_ID'])
    x = int(os.environ['SGE_TASK_ID'])
    print("x = ", x)

    file_dir = sys.argv[1]
    file = sys.argv[2]
    p_size = int(sys.argv[3])
    s_size = int(sys.argv[4])
    split_dir = sys.argv[5]
    print("file_dir = " + file_dir)
    print("file = " + file)
    print("p_size = " + str(p_size))
    print("s_size = " + str(s_size))
    print("split_dir = " + split_dir)

    # limit_bounds would activate OpenSlide's automatic boundary limits (cuts out some background).
    patch_count = split_and_store_patches(file,
                                          file_dir,
                                          patch_size=p_size,
                                          split_size=s_size,
                                          db_location=split_dir)
# end of main()
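
# Illustrative invocation (a sketch built from the example paths in the docstring above, not a
# prescribed command). Besides the five positional arguments, the script expects HOSTNAME,
# SGE_TASK_ID, PRED_SIZE and INDEX_DIR in the environment, e.g. when launched as an SGE array job:
#
#   export PRED_SIZE=224
#   export INDEX_DIR=/path/to/index_pickles   # hypothetical location of the .pkl index files
#   python split_grp.py /scratch/wxc4/CAMELYON16-testing test_001.tif 448 160 \
#       /projects/mikem/UserSupport/weizhe.li/split_imgs/test_001
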
def split_and_store_patches(file_name,
                            file_dir,
                            patch_size=448,
                            split_size=160,
                            db_location=''):
    ''' Sample patches of the specified size from a .tif whole slide image.
    - file_name  name of the whole slide image to sample from
    - file_dir   directory the file is located in
    '''
    pred_size = int(os.environ['PRED_SIZE'])  # dimension expansion ratio, currently 224
    slide_path = file_dir + "/" + file_name
    slide = openslide.open_slide(slide_path)
    index_dir = os.environ['INDEX_DIR']
    index_file_path = index_dir + "/" + file_name[:-4] + '.pkl'
    index_file = pd.read_pickle(index_file_path)
    # Drop the rows where 'is_tissue' is 0: those patches contain no specimen tissue.
    # Note that the original dataframe index is kept.
    index_file = index_file[index_file['is_tissue'] > 0]
    hdf5_file = h5py.File(db_location + "/" + file_name[:-4] + '.h5', 'w')
    num_grp = 0   # number of sub-groups written to the h5 file
    count = 0     # number of patches sampled so far
    patches, coords = [], []
    start_time = time.time()
    for _, batch_sample in index_file.iterrows():
        # Reverse tile_loc from (row, col) to the (x, y) order that openslide expects.
        xy = batch_sample.tile_loc[::-1]
        # Shift the sampling window back by pred_size//2 in each dimension, except along the
        # slide edges (x == 0 or y == 0), so the patch never starts outside the WSI.
        if xy[0] == 0 and xy[1] == 0:
            xylarge = [x * pred_size for x in xy]
        elif xy[0] == 0 and xy[1] != 0:
            xylarge = [xy[0] * pred_size,
                       xy[1] * pred_size - pred_size // 2]
        elif xy[0] != 0 and xy[1] == 0:
            xylarge = [xy[0] * pred_size - pred_size // 2,
                       xy[1] * pred_size]
        else:
            xylarge = [x * pred_size - pred_size // 2 for x in xy]
        # Clamp the patch so it does not run past the right/bottom edge of the slide.
        if xylarge[0] + patch_size > slide.dimensions[0]:
            xylarge[0] = xylarge[0] - (xylarge[0] + patch_size - slide.dimensions[0] - 1)
        if xylarge[1] + patch_size > slide.dimensions[1]:
            xylarge[1] = xylarge[1] - (xylarge[1] + patch_size - slide.dimensions[1] - 1)
        new_tile = generate_patches(slide, xylarge, patch_size)
        patches.append(new_tile)
        coords.append(np.array([xy[0], xy[1], xylarge[0], xylarge[1], patch_size]))
        count += 1
        if count % split_size == 0:
            # Write the accumulated sub-group to the HDF5 file in one go.
            save_to_hdf5(db_location, patches, coords, str(count // split_size), hdf5_file)
            num_grp += 1
            print("Group " + str(num_grp) + ": --- %s seconds ---" % (time.time() - start_time))
            start_time = time.time()  # restart the timer
            # debug
            print("len(coords) = " + str(len(coords)))
            print("count = " + str(count))
            print("num_grp = " + str(num_grp))
            del patches
            del coords
            patches, coords = [], []  # reset right away
    if count % split_size != 0:
        # Write the remaining patches (an incomplete sub-group) to the HDF5 file.
        save_to_hdf5(db_location, patches, coords, str(count // split_size + 1), hdf5_file)
        num_grp += 1
        print("Group " + str(num_grp) + ": --- %s seconds ---" % (time.time() - start_time))
        # debug
        print("len(coords) = " + str(len(coords)))
        print("count = " + str(count))
        print("num_grp = " + str(num_grp))
    hdf5_file.close()
    # Record the number of sub-groups for this WSI in a csv file.
    row = [num_grp, file_name[:-4]]
    with open(db_location + "/" + file_name[:-4] + '.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=' ')
        csvwriter.writerow(row)
    return count
# end of split_and_store_patches()
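
# Minimal sketch of the index pickle that split_and_store_patches() expects (inferred from the
# columns accessed above, so treat it as an assumption): a pandas DataFrame with an 'is_tissue'
# flag and a 'tile_loc' (row, col) pair per tile, written with DataFrame.to_pickle().
#
#   index = pd.DataFrame({'is_tissue': [1, 0, 1],
#                         'tile_loc': [(0, 0), (0, 1), (3, 7)]})
#   index.to_pickle('/path/to/index_dir/test_001.pkl')   # hypothetical path
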
def generate_patches(slide, xylarge, ps):
    """
    Extract a ps x ps patch (448x448 by default) from the WSI for prediction.
    :param object slide: slide object from OpenSlide
    :param list xylarge: the x, y coordinates of the top-left corner where the patch
                         should be extracted
    :param int ps: patch size in pixels
    :return array img: a ps x ps patch with only 3 channels (R, G, B)
    """
    img = slide.read_region(xylarge, 0, (ps, ps))  # read at level 0; returns an RGBA image
    img = np.array(img)
    img = img[:, :, :3]  # drop the alpha channel
    return img
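
# Example of calling generate_patches() on its own (a sketch; the slide path is hypothetical):
#
#   slide = openslide.open_slide('/scratch/wxc4/CAMELYON16-testing/test_001.tif')
#   patch = generate_patches(slide, [10240, 20480], 448)
#   # patch is a uint8 numpy array of shape (448, 448, 3)
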
###########################################################################
#                     Option 2: store to HDF5 files                       #
###########################################################################
def save_to_hdf5(db_location, patches, coords, file_name, hdf5_file):
    """ Save the numpy arrays to an HDF5 file. Every sub-group of patches from a single WSI
    is written into the same HDF5 file.
    - db_location  folder to save the images in
    - patches      numpy images
    - coords       x, y tile coordinates
    - file_name    sub-group number, like 1, 2, ...
    - hdf5_file    one hdf5 file will contain many datasets
    """
    # Save this sub-group of patches into the hdf5 file.
    subgrp = 't' + file_name
    grp = hdf5_file.create_group(subgrp)
    # Optionally enable compression, e.g. compression="szip" or compression="gzip", compression_opts=9.
    dataset = grp.create_dataset("img", np.shape(patches),
                                 dtype=np.uint8, data=patches)
    dataset2 = grp.create_dataset("coord", np.shape(coords),
                                  dtype=np.int64, data=coords)
# end of save_to_hdf5()
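
# Reading one sub-group back from the output file (a sketch; the file path is hypothetical,
# the group/dataset names follow the convention used above):
#
#   with h5py.File('/path/to/split_dir/test_001.h5', 'r') as f:
#       imgs = f['t1/img'][:]    # (split_size, patch_size, patch_size, 3) uint8
#       xy = f['t1/coord'][:]    # (split_size, 5) int64: x, y, x_large, y_large, patch_size
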
if __name__ == "__main__":
    main()