123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218 |
- import os
- import sys
- import numpy as np
- from openslide import open_slide
- import openslide
- import csv
- import time
- import tables
- import h5py
- from datetime import timedelta
- import pandas as pd
- import numpy as np
- def main():
- ''' Six command line arguments:
- - $FILE_DIR location of WSI, tif files, like /scratch/wxc4/CAMELYON16-testing
- - $FILE name of WSI, like test_001.tif
- - $PATCH_SIZE
- - $SPLIT_SIZE number of the sampled patches in one hdf5 file
- - $SPLIT_DIR output location of split parts/sets, like /projects/mikem/UserSupport/weizhe.li/split_imgs/test_001
-
- Note: patch_size is the dimension of the sampled patches, NOT equivalent to openslide's definition
- of tile_size. This implementation was chosen to allow for more intuitive usage.
- '''
- # print command line arguments
- for arg in sys.argv[1:]:
- print (arg)
- print (os.environ['HOSTNAME'])
- print (os.environ['SGE_TASK_ID'])
-
- x = int(os.environ['SGE_TASK_ID'])
- print ("x = ", x)
- file_dir = sys.argv[1]
- file = sys.argv[2]
- p_size = int(sys.argv[3])
- s_size = int(sys.argv[4])
- split_dir = sys.argv[5]
- print ("file_dir = " + file_dir)
- print ("file = " + file)
- print ("p_size = " + str( p_size))
- print ("s_size = " + str( s_size))
- print ("split_dir = " + split_dir)
-
- ''' - limit_bounds activates OpenSlide's automatic boundary limits (cuts out some background)
- '''
- patch_count = split_and_store_patches( file,
- file_dir,
- patch_size=p_size,
- split_size=s_size,
- db_location=split_dir)
- # end of main ()
- def split_and_store_patches(file_name,
- file_dir,
- patch_size=448,
- split_size=160,
- db_location=''):
- ''' Sample patches of specified size from .tif file.
- - file_name name of whole slide image to sample from
- - file_dir directory file is located in
- '''
- pred_size = int(os.environ['PRED_SIZE']) # Dimensiuons expand ratio 224 now
- slide_path = file_dir + "/" + file_name
- slide = openslide.open_slide(slide_path)
- index_dir=os.environ['INDEX_DIR']
- index_file_path=index_dir + "/" + file_name[:-4] + '.pkl'
- index_file = pd.read_pickle(index_file_path)
-
- # Drop the rows with 'is_tissue' is 0. 0 means this image patches is not from a speciman.
- index_file = index_file[index_file['is_tissue'] > 0] # please pay attention to the index of dataframe
- # num_samples = len(index_file)
-
- hdf5_file = h5py.File(db_location + "/" + file_name[:-4] + '.h5','w')
-
- num_grp = 0 ;
- count = 0 ; # number of subgroups in the h5 file
- patches, coords = [], []
-
- start_time = time.time()
- for _, batch_sample in index_file.iterrows():
- xy = batch_sample.tile_loc[::-1] # the "-1" is very important. It switches the x, y for openslide.
- # This will avoid go outside of WSI image.
- # xylarge = [abs(x * pred_size - pred_size//2) for x in xy]
-
- # if xy[0] == 0 or xy[1] == 0:
- # xylarge = [x * pred_size for x in xy]
- # else:
- # xylarge = [x * pred_size - pred_size//2 for x in xy]
-
- xylarge = [x * pred_size for x in xy] # placeholder, this will change
-
- if xy[0] == 0 and xy[1] == 0:
- xylarge = [x * pred_size for x in xy]
- elif xy[0] == 0 and xy[1] != 0:
- xylarge [0] = xy[0] * pred_size
- xylarge [1] = xy[1] * pred_size - pred_size//2
- elif xy[0] != 0 and xy[1] == 0:
- xylarge [0] = xy[0] * pred_size - pred_size//2
- xylarge [1] = xy[1] * pred_size
- else:
- xylarge = [x * pred_size - pred_size//2 for x in xy]
-
- # Please double check
- if xylarge[0]+patch_size > slide.dimensions[0]:
- xylarge[0] = xylarge[0] - (xylarge[0] + patch_size - slide.dimensions[0]-1)
- if xylarge[1]+patch_size > slide.dimensions[1]:
- xylarge[1] = xylarge[1] - (xylarge[1] + patch_size - slide.dimensions[1]-1)
- new_tile = generate_patches(slide, xylarge, patch_size)
-
- # Debug
- # print("Added: ", str (x_inc), " ", str (y_inc), " ", str (x), " ", str (y))
-
- patches.append(new_tile)
- coords.append(np.array([xy[0], xy[1], xylarge[0], xylarge[1], patch_size]))
- count += 1
-
- if ( count % split_size == 0 ) :
- # Write to HDF5 files all in one go.
- save_to_hdf5(db_location, patches, coords, str(count // split_size), hdf5_file)
- num_grp += 1
-
- print("Group " + str(num_grp ) + ": --- %s seconds ---" % (time.time() - start_time))
- start_time = time.time() # restart timer
-
- # debug
- print ("len(coords) = " + str(len(coords)))
- print ("count = " + str(count))
- print ("num_grp = " + str(num_grp))
- del patches
- del coords
- patches, coords = [], [] # Reset right away.
-
- if count % split_size != 0:
- # Write to HDF5 files all in one go.
- save_to_hdf5(db_location, patches, coords, str(count // split_size + 1), hdf5_file)
- num_grp += 1
- print("Group " + str(num_grp ) + ": --- %s seconds ---" % (time.time() - start_time))
-
- # debug
- print ("len(coords) = " + str(len(coords)))
- print ("count = " + str(count))
- print ("num_grp = " + str(num_grp))
-
- row = [ num_grp, file_name[:-4] ]
- # writing to csv file
- with open(db_location + "/" + file_name[:-4] + '.csv', 'w') as csvfile:
- csvwriter = csv.writer(csvfile, delimiter = ' ') # creating a csv writer object
- csvwriter.writerow(row) # writing the data rows
-
- return count
- # end of split_and_store_patches()
- def generate_patches(slide, xylarge, ps):
- """
- To extract 448x448 patches from WSI for prediction
- :param object slide: slide object from OpenSlide
- :param list xylarge: the x, y coordinates for the position where the patch
- should be extracted
- :return array img: a 448x448 patch, only 3 channels (R, G, B)
- """
- img = slide.read_region(xylarge, 0, (ps, ps))
- img = np.array(img)
- img = img[:, :, :3]
-
- return img
- ###########################################################################
- # Option 2: store to HDF5 files #
- ###########################################################################
- def save_to_hdf5(db_location, patches, coords, file_name, hdf5_file):
- """ Saves the numpy arrays to HDF5 files. A sub-group of patches from a single WSI will be saved
- to the same HDF5 file
- - db_location folder to save images in
- - patches numpy images
- - coords x, y tile coordinates
- - file_name number, like 1, 2,
- - hdf5_file one hdf5 wil contain many datasets
- """
- # Save patches into hdf5 file.
- subgrp='t'+file_name
- grp = hdf5_file.create_group(subgrp) ;
- dataset = grp.create_dataset("img", np.shape(patches),
- dtype=np.uint8, data=patches)
- # dtype=np.uint8, data=patches, compression="szip")
- # , compression_opts=9)
-
- dataset2 = grp.create_dataset("coord", np.shape(coords),
- dtype=np.int64, data=coords)
-
- # end of save_to_hdf5 ()
- if __name__ == "__main__":
- main()
-
-
|