Browse Source

Rename dldp/dldp/heatmap_pred/split_grp.py to dldp/dldp/heatmap_pred/HPC_SC20/split_grp.py

Weizhe Li 5 years ago
parent
commit
f110b1469d
1 changed files with 217 additions and 0 deletions
  1. 217 0
      split_grp.py

+ 217 - 0
split_grp.py

@@ -0,0 +1,217 @@
+import os
+import sys
+
+import numpy as np
+from openslide import open_slide  
+import openslide
+
+import csv
+import time
+
+import tables
+import h5py
+from datetime import timedelta
+
+import pandas as pd
+import numpy as np
+
+def main():
+    ''' Six command line arguments:
+        - $FILE_DIR      location of WSI, tif files, like /scratch/wxc4/CAMELYON16-testing
+        - $FILE          name of WSI, like test_001.tif
+        - $PATCH_SIZE
+        - $SPLIT_SIZE    number of the sampled patches in one hdf5 file
+        - $SPLIT_DIR     output location of split parts/sets, like /projects/mikem/UserSupport/weizhe.li/split_imgs/test_001
+            
+        Note: patch_size is the dimension of the sampled patches, NOT equivalent to openslide's definition
+        of tile_size. This implementation was chosen to allow for more intuitive usage.
+    '''
+    # print command line arguments
+    for arg in sys.argv[1:]:
+        print (arg)
+
+    print (os.environ['HOSTNAME'])
+    print (os.environ['SGE_TASK_ID'])
+    
+    x = int(os.environ['SGE_TASK_ID'])
+    print ("x = ", x)
+
+    file_dir = sys.argv[1] 
+    file = sys.argv[2]
+    p_size = int(sys.argv[3]) 
+    s_size = int(sys.argv[4]) 
+    split_dir = sys.argv[5] 
+
+    print ("file_dir = " + file_dir)
+    print ("file = " + file)
+    print ("p_size = " + str( p_size))
+    print ("s_size = " + str( s_size))
+    print ("split_dir = " + split_dir)
+     
+    ''' - limit_bounds      activates OpenSlide's automatic boundary limits (cuts out some background)
+    '''
+    patch_count = split_and_store_patches( file,
+                                file_dir,
+                                patch_size=p_size,
+                                split_size=s_size, 
+                                db_location=split_dir)
+# end of main () 
+
+def split_and_store_patches(file_name,
+                             file_dir,
+                             patch_size=448,
+                             split_size=160, 
+                             db_location=''):
+    ''' Sample patches of specified size from .tif file.
+        - file_name             name of whole slide image to sample from
+        - file_dir              directory file is located in
+    '''
+    pred_size = int(os.environ['PRED_SIZE'])                  # Dimensiuons expand ratio 224 now
+
+    slide_path = file_dir + "/" + file_name
+    slide = openslide.open_slide(slide_path)
+
+    index_dir=os.environ['INDEX_DIR']
+    index_file_path=index_dir + "/" + file_name[:-4] + '.pkl' 
+
+    index_file = pd.read_pickle(index_file_path)
+    
+    # Drop the rows with 'is_tissue' is 0. 0 means this image patches is not from a speciman. 
+    index_file = index_file[index_file['is_tissue'] > 0] # please pay attention to the index of dataframe
+    # num_samples = len(index_file)
+    
+    hdf5_file = h5py.File(db_location + "/" + file_name[:-4] + '.h5','w')
+    
+    num_grp = 0 ; 
+    count = 0 ;                  # number of subgroups in the h5 file
+    patches, coords = [], []
+    
+    start_time = time.time()
+
+    for _, batch_sample in index_file.iterrows():
+        xy = batch_sample.tile_loc[::-1] # the "-1" is very important. It switches the x, y for openslide.
+
+        # This will avoid go outside of WSI image.
+        # xylarge = [abs(x * pred_size - pred_size//2) for x in xy]
+        
+        # if xy[0] == 0 or xy[1] == 0:
+        #    xylarge = [x * pred_size for x in xy]
+        # else:
+        #    xylarge = [x * pred_size - pred_size//2 for x in xy]
+        
+        xylarge = [x * pred_size for x in xy] # placeholder, this will change
+        
+        if xy[0] == 0 and xy[1] == 0:
+             xylarge = [x * pred_size for x in xy]
+        elif xy[0] == 0 and xy[1] != 0:
+             xylarge [0] = xy[0] * pred_size
+             xylarge [1] = xy[1]  * pred_size - pred_size//2
+        elif xy[0] != 0 and xy[1] == 0:
+             xylarge [0] = xy[0]  * pred_size - pred_size//2
+             xylarge [1] = xy[1] * pred_size
+        else:
+             xylarge = [x * pred_size - pred_size//2 for x in xy]        
+        
+        # Please double check
+        if xylarge[0]+patch_size > slide.dimensions[0]:
+            xylarge[0] = xylarge[0] - (xylarge[0] + patch_size - slide.dimensions[0]-1)
+        if xylarge[1]+patch_size > slide.dimensions[1]:
+            xylarge[1] = xylarge[1] - (xylarge[1] + patch_size - slide.dimensions[1]-1)
+
+        new_tile = generate_patches(slide, xylarge, patch_size)
+        
+        # Debug
+        # print("Added: ", str (x_inc), " ", str (y_inc), " ", str (x), " ", str (y))
+        
+        patches.append(new_tile)
+        coords.append(np.array([xy[0], xy[1], xylarge[0], xylarge[1], patch_size]))
+        count += 1
+     
+        if ( count % split_size == 0 ) :
+            # Write to HDF5 files all in one go.
+            save_to_hdf5(db_location, patches, coords, str(count // split_size), hdf5_file)
+            num_grp += 1
+            
+            print("Group " + str(num_grp ) + ": --- %s seconds ---" % (time.time() - start_time))
+            start_time = time.time()    # restart timer
+            
+            # debug 
+            print ("len(coords) = " + str(len(coords))) 
+            print ("count = " + str(count)) 
+            print ("num_grp = " + str(num_grp)) 
+
+            del patches
+            del coords
+            patches, coords = [], [] # Reset right away.
+                
+
+    if count % split_size != 0:
+        # Write to HDF5 files all in one go.
+        save_to_hdf5(db_location, patches, coords, str(count // split_size + 1), hdf5_file)
+        num_grp += 1
+
+        print("Group " + str(num_grp ) + ": --- %s seconds ---" % (time.time() - start_time))
+       
+        # debug 
+        print ("len(coords) = " + str(len(coords))) 
+        print ("count = " + str(count)) 
+        print ("num_grp = " + str(num_grp)) 
+ 
+
+    row = [ num_grp, file_name[:-4] ]
+    # writing to csv file 
+    with open(db_location + "/" + file_name[:-4] + '.csv', 'w') as csvfile: 
+        csvwriter = csv.writer(csvfile, delimiter = ' ')     # creating a csv writer object 
+        csvwriter.writerow(row)           # writing the data rows    
+    
+    return count
+
+# end of split_and_store_patches()
+
+def generate_patches(slide, xylarge, ps):
+    """
+    To extract 448x448 patches from WSI for prediction
+    :param object slide: slide object from OpenSlide
+    :param list xylarge: the x, y coordinates for the position where the patch
+                          should be extracted
+    :return array img: a 448x448 patch, only 3 channels (R, G, B)
+    """
+    img = slide.read_region(xylarge, 0, (ps, ps))
+    img = np.array(img)
+    img = img[:, :, :3]
+    
+    return img
+
+
+###########################################################################
+#                Option 2: store to HDF5 files                            #
+###########################################################################
+
+def save_to_hdf5(db_location, patches, coords, file_name, hdf5_file):
+    """ Saves the numpy arrays to HDF5 files. A sub-group of patches from a single WSI will be saved
+        to the same HDF5 file
+        - db_location       folder to save images in
+        - patches           numpy images
+        - coords            x, y tile coordinates
+        - file_name         number, like 1, 2,  
+        - hdf5_file        one hdf5 wil contain many datasets
+    """
+    # Save patches into hdf5 file.
+    subgrp='t'+file_name
+    grp = hdf5_file.create_group(subgrp) ;
+    dataset = grp.create_dataset("img", np.shape(patches), 
+                                dtype=np.uint8, data=patches)
+                                # dtype=np.uint8, data=patches, compression="szip")
+                                # , compression_opts=9)
+                                
+    dataset2 = grp.create_dataset("coord", np.shape(coords), 
+                                  dtype=np.int64, data=coords)
+    
+# end of save_to_hdf5 ()
+
+
+
+if __name__ == "__main__":
+    main()
+    
+