5 år sedan · f110b1469d
--- a/split_grp.py
+++ b/split_grp.py
@@ -0,0 +1,217 @@
 
				+import os
			
 
				+import sys
			
 
				+
			
 
				+import numpy as np
			
 
				+from openslide import open_slide  
			
 
				+import openslide
			
 
				+
			
 
				+import csv
			
 
				+import time
			
 
				+
			
 
				+import tables
			
 
				+import h5py
			
 
				+from datetime import timedelta
			
 
				+
			
 
				+import pandas as pd
			
 
				+import numpy as np
			
 
				+
			
 
				+def main():
			
 
				+    ''' Six command line arguments:
			
 
				+        - $FILE_DIR      location of WSI, tif files, like /scratch/wxc4/CAMELYON16-testing
			
 
				+        - $FILE          name of WSI, like test_001.tif
			
 
				+        - $PATCH_SIZE
			
 
				+        - $SPLIT_SIZE    number of the sampled patches in one hdf5 file
			
 
				+        - $SPLIT_DIR     output location of split parts/sets, like /projects/mikem/UserSupport/weizhe.li/split_imgs/test_001
			
 
				+            
			
 
				+        Note: patch_size is the dimension of the sampled patches, NOT equivalent to openslide's definition
			
 
				+        of tile_size. This implementation was chosen to allow for more intuitive usage.
			
 
				+    '''
			
 
				+    # print command line arguments
			
 
				+    for arg in sys.argv[1:]:
			
 
				+        print (arg)
			
 
				+
			
 
				+    print (os.environ['HOSTNAME'])
			
 
				+    print (os.environ['SGE_TASK_ID'])
			
 
				+    
			
 
				+    x = int(os.environ['SGE_TASK_ID'])
			
 
				+    print ("x = ", x)
			
 
				+
			
 
				+    file_dir = sys.argv[1] 
			
 
				+    file = sys.argv[2]
			
 
				+    p_size = int(sys.argv[3]) 
			
 
				+    s_size = int(sys.argv[4]) 
			
 
				+    split_dir = sys.argv[5] 
			
 
				+
			
 
				+    print ("file_dir = " + file_dir)
			
 
				+    print ("file = " + file)
			
 
				+    print ("p_size = " + str( p_size))
			
 
				+    print ("s_size = " + str( s_size))
			
 
				+    print ("split_dir = " + split_dir)
			
 
				+     
			
 
				+    ''' - limit_bounds      activates OpenSlide's automatic boundary limits (cuts out some background)
			
 
				+    '''
			
 
				+    patch_count = split_and_store_patches( file,
			
 
				+                                file_dir,
			
 
				+                                patch_size=p_size,
			
 
				+                                split_size=s_size, 
			
 
				+                                db_location=split_dir)
			
 
				+# end of main () 
			
 
				+
			
 
				+def split_and_store_patches(file_name,
			
 
				+                             file_dir,
			
 
				+                             patch_size=448,
			
 
				+                             split_size=160, 
			
 
				+                             db_location=''):
			
 
				+    ''' Sample patches of specified size from .tif file.
			
 
				+        - file_name             name of whole slide image to sample from
			
 
				+        - file_dir              directory file is located in
			
 
				+    '''
			
 
				+    pred_size = int(os.environ['PRED_SIZE'])                  # Dimensiuons expand ratio 224 now
			
 
				+
			
 
				+    slide_path = file_dir + "/" + file_name
			
 
				+    slide = openslide.open_slide(slide_path)
			
 
				+
			
 
				+    index_dir=os.environ['INDEX_DIR']
			
 
				+    index_file_path=index_dir + "/" + file_name[:-4] + '.pkl' 
			
 
				+
			
 
				+    index_file = pd.read_pickle(index_file_path)
			
 
				+    
			
 
				+    # Drop the rows with 'is_tissue' is 0. 0 means this image patches is not from a speciman. 
			
 
				+    index_file = index_file[index_file['is_tissue'] > 0] # please pay attention to the index of dataframe
			
 
				+    # num_samples = len(index_file)
			
 
				+    
			
 
				+    hdf5_file = h5py.File(db_location + "/" + file_name[:-4] + '.h5','w')
			
 
				+    
			
 
				+    num_grp = 0 ; 
			
 
				+    count = 0 ;                  # number of subgroups in the h5 file
			
 
				+    patches, coords = [], []
			
 
				+    
			
 
				+    start_time = time.time()
			
 
				+
			
 
				+    for _, batch_sample in index_file.iterrows():
			
 
				+        xy = batch_sample.tile_loc[::-1] # the "-1" is very important. It switches the x, y for openslide.
			
 
				+
			
 
				+        # This will avoid go outside of WSI image.
			
 
				+        # xylarge = [abs(x * pred_size - pred_size//2) for x in xy]
			
 
				+        
			
 
				+        # if xy[0] == 0 or xy[1] == 0:
			
 
				+        #    xylarge = [x * pred_size for x in xy]
			
 
				+        # else:
			
 
				+        #    xylarge = [x * pred_size - pred_size//2 for x in xy]
			
 
				+        
			
 
				+        xylarge = [x * pred_size for x in xy] # placeholder, this will change
			
 
				+        
			
 
				+        if xy[0] == 0 and xy[1] == 0:
			
 
				+             xylarge = [x * pred_size for x in xy]
			
 
				+        elif xy[0] == 0 and xy[1] != 0:
			
 
				+             xylarge [0] = xy[0] * pred_size
			
 
				+             xylarge [1] = xy[1]  * pred_size - pred_size//2
			
 
				+        elif xy[0] != 0 and xy[1] == 0:
			
 
				+             xylarge [0] = xy[0]  * pred_size - pred_size//2
			
 
				+             xylarge [1] = xy[1] * pred_size
			
 
				+        else:
			
 
				+             xylarge = [x * pred_size - pred_size//2 for x in xy]        
			
 
				+        
			
 
				+        # Please double check
			
 
				+        if xylarge[0]+patch_size > slide.dimensions[0]:
			
 
				+            xylarge[0] = xylarge[0] - (xylarge[0] + patch_size - slide.dimensions[0]-1)
			
 
				+        if xylarge[1]+patch_size > slide.dimensions[1]:
			
 
				+            xylarge[1] = xylarge[1] - (xylarge[1] + patch_size - slide.dimensions[1]-1)
			
 
				+
			
 
				+        new_tile = generate_patches(slide, xylarge, patch_size)
			
 
				+        
			
 
				+        # Debug
			
 
				+        # print("Added: ", str (x_inc), " ", str (y_inc), " ", str (x), " ", str (y))
			
 
				+        
			
 
				+        patches.append(new_tile)
			
 
				+        coords.append(np.array([xy[0], xy[1], xylarge[0], xylarge[1], patch_size]))
			
 
				+        count += 1
			
 
				+     
			
 
				+        if ( count % split_size == 0 ) :
			
 
				+            # Write to HDF5 files all in one go.
			
 
				+            save_to_hdf5(db_location, patches, coords, str(count // split_size), hdf5_file)
			
 
				+            num_grp += 1
			
 
				+            
			
 
				+            print("Group " + str(num_grp ) + ": --- %s seconds ---" % (time.time() - start_time))
			
 
				+            start_time = time.time()    # restart timer
			
 
				+            
			
 
				+            # debug 
			
 
				+            print ("len(coords) = " + str(len(coords))) 
			
 
				+            print ("count = " + str(count)) 
			
 
				+            print ("num_grp = " + str(num_grp)) 
			
 
				+
			
 
				+            del patches
			
 
				+            del coords
			
 
				+            patches, coords = [], [] # Reset right away.
			
 
				+                
			
 
				+
			
 
				+    if count % split_size != 0:
			
 
				+        # Write to HDF5 files all in one go.
			
 
				+        save_to_hdf5(db_location, patches, coords, str(count // split_size + 1), hdf5_file)
			
 
				+        num_grp += 1
			
 
				+
			
 
				+        print("Group " + str(num_grp ) + ": --- %s seconds ---" % (time.time() - start_time))
			
 
				+       
			
 
				+        # debug 
			
 
				+        print ("len(coords) = " + str(len(coords))) 
			
 
				+        print ("count = " + str(count)) 
			
 
				+        print ("num_grp = " + str(num_grp)) 
			
 
				+ 
			
 
				+
			
 
				+    row = [ num_grp, file_name[:-4] ]
			
 
				+    # writing to csv file 
			
 
				+    with open(db_location + "/" + file_name[:-4] + '.csv', 'w') as csvfile: 
			
 
				+        csvwriter = csv.writer(csvfile, delimiter = ' ')     # creating a csv writer object 
			
 
				+        csvwriter.writerow(row)           # writing the data rows    
			
 
				+    
			
 
				+    return count
			
 
				+
			
 
				+# end of split_and_store_patches()
			
 
				+
			
 
				+def generate_patches(slide, xylarge, ps):
			
 
				+    """
			
 
				+    To extract 448x448 patches from WSI for prediction
			
 
				+    :param object slide: slide object from OpenSlide
			
 
				+    :param list xylarge: the x, y coordinates for the position where the patch
			
 
				+                          should be extracted
			
 
				+    :return array img: a 448x448 patch, only 3 channels (R, G, B)
			
 
				+    """
			
 
				+    img = slide.read_region(xylarge, 0, (ps, ps))
			
 
				+    img = np.array(img)
			
 
				+    img = img[:, :, :3]
			
 
				+    
			
 
				+    return img
			
 
				+
			
 
				+
			
 
				+###########################################################################
			
 
				+#                Option 2: store to HDF5 files                            #
			
 
				+###########################################################################
			
 
				+
			
 
				+def save_to_hdf5(db_location, patches, coords, file_name, hdf5_file):
			
 
				+    """ Saves the numpy arrays to HDF5 files. A sub-group of patches from a single WSI will be saved
			
 
				+        to the same HDF5 file
			
 
				+        - db_location       folder to save images in
			
 
				+        - patches           numpy images
			
 
				+        - coords            x, y tile coordinates
			
 
				+        - file_name         number, like 1, 2,  
			
 
				+        - hdf5_file        one hdf5 wil contain many datasets
			
 
				+    """
			
 
				+    # Save patches into hdf5 file.
			
 
				+    subgrp='t'+file_name
			
 
				+    grp = hdf5_file.create_group(subgrp) ;
			
 
				+    dataset = grp.create_dataset("img", np.shape(patches), 
			
 
				+                                dtype=np.uint8, data=patches)
			
 
				+                                # dtype=np.uint8, data=patches, compression="szip")
			
 
				+                                # , compression_opts=9)
			
 
				+                                
			
 
				+    dataset2 = grp.create_dataset("coord", np.shape(coords), 
			
 
				+                                  dtype=np.int64, data=coords)
			
 
				+    
			
 
				+# end of save_to_hdf5 ()
			
 
				+
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()
			
 
				+    
			
 
				+