|
@@ -0,0 +1,217 @@
|
|
|
+import os
|
|
|
+import sys
|
|
|
+
|
|
|
+import numpy as np
|
|
|
+from openslide import open_slide
|
|
|
+import openslide
|
|
|
+
|
|
|
+import csv
|
|
|
+import time
|
|
|
+
|
|
|
+import tables
|
|
|
+import h5py
|
|
|
+from datetime import timedelta
|
|
|
+
|
|
|
+import pandas as pd
|
|
|
+import numpy as np
|
|
|
+
|
|
|
+def main():
|
|
|
+ ''' Six command line arguments:
|
|
|
+ - $FILE_DIR location of WSI, tif files, like /scratch/wxc4/CAMELYON16-testing
|
|
|
+ - $FILE name of WSI, like test_001.tif
|
|
|
+ - $PATCH_SIZE
|
|
|
+ - $SPLIT_SIZE number of the sampled patches in one hdf5 file
|
|
|
+ - $SPLIT_DIR output location of split parts/sets, like /projects/mikem/UserSupport/weizhe.li/split_imgs/test_001
|
|
|
+
|
|
|
+ Note: patch_size is the dimension of the sampled patches, NOT equivalent to openslide's definition
|
|
|
+ of tile_size. This implementation was chosen to allow for more intuitive usage.
|
|
|
+ '''
|
|
|
+ # print command line arguments
|
|
|
+ for arg in sys.argv[1:]:
|
|
|
+ print (arg)
|
|
|
+
|
|
|
+ print (os.environ['HOSTNAME'])
|
|
|
+ print (os.environ['SGE_TASK_ID'])
|
|
|
+
|
|
|
+ x = int(os.environ['SGE_TASK_ID'])
|
|
|
+ print ("x = ", x)
|
|
|
+
|
|
|
+ file_dir = sys.argv[1]
|
|
|
+ file = sys.argv[2]
|
|
|
+ p_size = int(sys.argv[3])
|
|
|
+ s_size = int(sys.argv[4])
|
|
|
+ split_dir = sys.argv[5]
|
|
|
+
|
|
|
+ print ("file_dir = " + file_dir)
|
|
|
+ print ("file = " + file)
|
|
|
+ print ("p_size = " + str( p_size))
|
|
|
+ print ("s_size = " + str( s_size))
|
|
|
+ print ("split_dir = " + split_dir)
|
|
|
+
|
|
|
+ ''' - limit_bounds activates OpenSlide's automatic boundary limits (cuts out some background)
|
|
|
+ '''
|
|
|
+ patch_count = split_and_store_patches( file,
|
|
|
+ file_dir,
|
|
|
+ patch_size=p_size,
|
|
|
+ split_size=s_size,
|
|
|
+ db_location=split_dir)
|
|
|
+# end of main ()
|
|
|
+
|
|
|
+def split_and_store_patches(file_name,
|
|
|
+ file_dir,
|
|
|
+ patch_size=448,
|
|
|
+ split_size=160,
|
|
|
+ db_location=''):
|
|
|
+ ''' Sample patches of specified size from .tif file.
|
|
|
+ - file_name name of whole slide image to sample from
|
|
|
+ - file_dir directory file is located in
|
|
|
+ '''
|
|
|
+ pred_size = int(os.environ['PRED_SIZE']) # Dimensiuons expand ratio 224 now
|
|
|
+
|
|
|
+ slide_path = file_dir + "/" + file_name
|
|
|
+ slide = openslide.open_slide(slide_path)
|
|
|
+
|
|
|
+ index_dir=os.environ['INDEX_DIR']
|
|
|
+ index_file_path=index_dir + "/" + file_name[:-4] + '.pkl'
|
|
|
+
|
|
|
+ index_file = pd.read_pickle(index_file_path)
|
|
|
+
|
|
|
+ # Drop the rows with 'is_tissue' is 0. 0 means this image patches is not from a speciman.
|
|
|
+ index_file = index_file[index_file['is_tissue'] > 0] # please pay attention to the index of dataframe
|
|
|
+ # num_samples = len(index_file)
|
|
|
+
|
|
|
+ hdf5_file = h5py.File(db_location + "/" + file_name[:-4] + '.h5','w')
|
|
|
+
|
|
|
+ num_grp = 0 ;
|
|
|
+ count = 0 ; # number of subgroups in the h5 file
|
|
|
+ patches, coords = [], []
|
|
|
+
|
|
|
+ start_time = time.time()
|
|
|
+
|
|
|
+ for _, batch_sample in index_file.iterrows():
|
|
|
+ xy = batch_sample.tile_loc[::-1] # the "-1" is very important. It switches the x, y for openslide.
|
|
|
+
|
|
|
+ # This will avoid go outside of WSI image.
|
|
|
+ # xylarge = [abs(x * pred_size - pred_size//2) for x in xy]
|
|
|
+
|
|
|
+ # if xy[0] == 0 or xy[1] == 0:
|
|
|
+ # xylarge = [x * pred_size for x in xy]
|
|
|
+ # else:
|
|
|
+ # xylarge = [x * pred_size - pred_size//2 for x in xy]
|
|
|
+
|
|
|
+ xylarge = [x * pred_size for x in xy] # placeholder, this will change
|
|
|
+
|
|
|
+ if xy[0] == 0 and xy[1] == 0:
|
|
|
+ xylarge = [x * pred_size for x in xy]
|
|
|
+ elif xy[0] == 0 and xy[1] != 0:
|
|
|
+ xylarge [0] = xy[0] * pred_size
|
|
|
+ xylarge [1] = xy[1] * pred_size - pred_size//2
|
|
|
+ elif xy[0] != 0 and xy[1] == 0:
|
|
|
+ xylarge [0] = xy[0] * pred_size - pred_size//2
|
|
|
+ xylarge [1] = xy[1] * pred_size
|
|
|
+ else:
|
|
|
+ xylarge = [x * pred_size - pred_size//2 for x in xy]
|
|
|
+
|
|
|
+ # Please double check
|
|
|
+ if xylarge[0]+patch_size > slide.dimensions[0]:
|
|
|
+ xylarge[0] = xylarge[0] - (xylarge[0] + patch_size - slide.dimensions[0]-1)
|
|
|
+ if xylarge[1]+patch_size > slide.dimensions[1]:
|
|
|
+ xylarge[1] = xylarge[1] - (xylarge[1] + patch_size - slide.dimensions[1]-1)
|
|
|
+
|
|
|
+ new_tile = generate_patches(slide, xylarge, patch_size)
|
|
|
+
|
|
|
+ # Debug
|
|
|
+ # print("Added: ", str (x_inc), " ", str (y_inc), " ", str (x), " ", str (y))
|
|
|
+
|
|
|
+ patches.append(new_tile)
|
|
|
+ coords.append(np.array([xy[0], xy[1], xylarge[0], xylarge[1], patch_size]))
|
|
|
+ count += 1
|
|
|
+
|
|
|
+ if ( count % split_size == 0 ) :
|
|
|
+ # Write to HDF5 files all in one go.
|
|
|
+ save_to_hdf5(db_location, patches, coords, str(count // split_size), hdf5_file)
|
|
|
+ num_grp += 1
|
|
|
+
|
|
|
+ print("Group " + str(num_grp ) + ": --- %s seconds ---" % (time.time() - start_time))
|
|
|
+ start_time = time.time() # restart timer
|
|
|
+
|
|
|
+ # debug
|
|
|
+ print ("len(coords) = " + str(len(coords)))
|
|
|
+ print ("count = " + str(count))
|
|
|
+ print ("num_grp = " + str(num_grp))
|
|
|
+
|
|
|
+ del patches
|
|
|
+ del coords
|
|
|
+ patches, coords = [], [] # Reset right away.
|
|
|
+
|
|
|
+
|
|
|
+ if count % split_size != 0:
|
|
|
+ # Write to HDF5 files all in one go.
|
|
|
+ save_to_hdf5(db_location, patches, coords, str(count // split_size + 1), hdf5_file)
|
|
|
+ num_grp += 1
|
|
|
+
|
|
|
+ print("Group " + str(num_grp ) + ": --- %s seconds ---" % (time.time() - start_time))
|
|
|
+
|
|
|
+ # debug
|
|
|
+ print ("len(coords) = " + str(len(coords)))
|
|
|
+ print ("count = " + str(count))
|
|
|
+ print ("num_grp = " + str(num_grp))
|
|
|
+
|
|
|
+
|
|
|
+ row = [ num_grp, file_name[:-4] ]
|
|
|
+ # writing to csv file
|
|
|
+ with open(db_location + "/" + file_name[:-4] + '.csv', 'w') as csvfile:
|
|
|
+ csvwriter = csv.writer(csvfile, delimiter = ' ') # creating a csv writer object
|
|
|
+ csvwriter.writerow(row) # writing the data rows
|
|
|
+
|
|
|
+ return count
|
|
|
+
|
|
|
+# end of split_and_store_patches()
|
|
|
+
|
|
|
+def generate_patches(slide, xylarge, ps):
|
|
|
+ """
|
|
|
+ To extract 448x448 patches from WSI for prediction
|
|
|
+ :param object slide: slide object from OpenSlide
|
|
|
+ :param list xylarge: the x, y coordinates for the position where the patch
|
|
|
+ should be extracted
|
|
|
+ :return array img: a 448x448 patch, only 3 channels (R, G, B)
|
|
|
+ """
|
|
|
+ img = slide.read_region(xylarge, 0, (ps, ps))
|
|
|
+ img = np.array(img)
|
|
|
+ img = img[:, :, :3]
|
|
|
+
|
|
|
+ return img
|
|
|
+
|
|
|
+
|
|
|
+###########################################################################
|
|
|
+# Option 2: store to HDF5 files #
|
|
|
+###########################################################################
|
|
|
+
|
|
|
+def save_to_hdf5(db_location, patches, coords, file_name, hdf5_file):
|
|
|
+ """ Saves the numpy arrays to HDF5 files. A sub-group of patches from a single WSI will be saved
|
|
|
+ to the same HDF5 file
|
|
|
+ - db_location folder to save images in
|
|
|
+ - patches numpy images
|
|
|
+ - coords x, y tile coordinates
|
|
|
+ - file_name number, like 1, 2,
|
|
|
+ - hdf5_file one hdf5 wil contain many datasets
|
|
|
+ """
|
|
|
+ # Save patches into hdf5 file.
|
|
|
+ subgrp='t'+file_name
|
|
|
+ grp = hdf5_file.create_group(subgrp) ;
|
|
|
+ dataset = grp.create_dataset("img", np.shape(patches),
|
|
|
+ dtype=np.uint8, data=patches)
|
|
|
+ # dtype=np.uint8, data=patches, compression="szip")
|
|
|
+ # , compression_opts=9)
|
|
|
+
|
|
|
+ dataset2 = grp.create_dataset("coord", np.shape(coords),
|
|
|
+ dtype=np.int64, data=coords)
|
|
|
+
|
|
|
+# end of save_to_hdf5 ()
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+if __name__ == "__main__":
|
|
|
+ main()
|
|
|
+
|
|
|
+
|