Merge pull request #99 from Doodleverse/new_makedatasets

New makedatasets
Doodleverse · Oct 21, 2022 · 9b5c648 · 9b5c648
2 parents 1ccde41 + e89c866
commit 9b5c648
Show file tree

Hide file tree

Showing 7 changed files with 111 additions and 55 deletions.
diff --git a/install/gym.yml b/install/gym.yml
@@ -10,6 +10,7 @@ dependencies:
  - cython
  - tensorflow-gpu
  - ipython
+ - jupyter
  - joblib
  - tqdm
  - pandas

diff --git a/make_nd_dataset.py b/make_nd_dataset.py
@@ -25,7 +25,6 @@
 
 # utility to merge multiple coincident jpeg images into nd numpy arrays
 import sys,os, time, json, shutil
-# sys.path.insert(1, 'src')
 
 from skimage.io import imread, imsave
 import numpy as np
@@ -126,7 +125,6 @@ def do_resize_image(f, TARGET_SIZE):
 for k in config.keys():
     exec(k+'=config["'+k+'"]')
 
-## NCLASSES>=2
 if NCLASSES>1:
     pass
 else:
@@ -152,7 +150,7 @@ def do_resize_image(f, TARGET_SIZE):
         os.environ['CUDA_VISIBLE_DEVICES'] = str(SET_GPU)
     else:
         #use the first available GPU
-        os.environ['CUDA_VISIBLE_DEVICES'] = '0' #'1'
+        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 else:
    ## to use the CPU (not recommended):
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
@@ -261,7 +259,7 @@ def do_resize_image(f, TARGET_SIZE):
 
 
 ## write padded labels to file
-label_data_path = newdireclabels #label_data_path.replace('labels','padded_labels')
+label_data_path = newdireclabels 
 
 label_files = natsorted(glob(label_data_path+os.sep+'*.png'))
 if len(label_files)<1:
@@ -321,7 +319,6 @@ def do_resize_image(f, TARGET_SIZE):
     if 'REMAP_CLASSES' in locals():
         for k in REMAP_CLASSES.items():
             lab[lab==int(k[0])] = int(k[1])
-        # NCLASSES = len(np.unique(lab.flatten()))
     else:
         lab[lab>NCLASSES]=NCLASSES
 
@@ -449,14 +446,13 @@ def read_seg_dataset_multiclass(example):
                                     alpha=128, colormap=class_label_colormap,
                                      color_class_offset=0, do_alpha=False)
 
-     plt.imshow(color_label,  alpha=0.5)#, vmin=0, vmax=NCLASSES)
+     plt.imshow(color_label,  alpha=0.5)
 
      file = file.numpy()
 
      plt.axis('off')
      plt.title(file)
      plt.savefig(output_data_path+os.sep+'noaug_sample'+os.sep+ ROOT_STRING + 'noaug_ex'+str(counter)+'.png', dpi=200, bbox_inches='tight')
-     #counter +=1
      plt.close('all')
      counter += 1
 
@@ -580,7 +576,6 @@ def read_seg_dataset_multiclass(example):
 
 ######################## generate and print files
 
-
 i = 0
 for copy in tqdm(range(AUG_COPIES)):
     for k in range(AUG_LOOPS):
@@ -644,7 +639,6 @@ def read_seg_dataset_multiclass(example):
             if FILTER_VALUE>1:
 
                 for kk in range(lstack.shape[-1]):
-                    #l = median(lstack[:,:,kk], disk(FILTER_VALUE))
                     l = remove_small_objects(lstack[:,:,kk].astype('uint8')>0, np.pi*(FILTER_VALUE**2))
                     l = remove_small_holes(lstack[:,:,kk].astype('uint8')>0, np.pi*(FILTER_VALUE**2))
                     lstack[:,:,kk] = np.round(l).astype(np.uint8)
@@ -705,7 +699,6 @@ def read_seg_dataset_multiclass(example):
      else:
          plt.imshow(im)
 
-     # print(lab.shape)
      lab = np.argmax(lab.numpy().squeeze(),-1)
 
      color_label = label_to_colors(np.squeeze(lab), tf.cast(im[:,:,0]==0,tf.uint8),
@@ -724,7 +717,6 @@ def read_seg_dataset_multiclass(example):
      plt.axis('off')
 
      plt.savefig(output_data_path+os.sep+'aug_sample'+os.sep+ ROOT_STRING + 'aug_ex'+str(counter)+'.png', dpi=200, bbox_inches='tight')
-     #counter +=1
      plt.close('all')
      counter += 1
 

diff --git a/seg_images_in_folder.py b/seg_images_in_folder.py
@@ -253,6 +253,10 @@
         # Load in the model from the weights which is the location of the weights file        
         model = tf.keras.models.load_model(weights)
 
+        M.append(model)
+        C.append(configfile)
+        T.append(MODEL)
+
     except:
         # Load the metrics mean_iou, dice_coef from doodleverse_utils
         # Load in the custom loss function from doodleverse_utils        
@@ -294,20 +298,24 @@
 
 #look for TTA config
 if not 'TESTTIMEAUG' in locals():
+    print("TESTTIMEAUG not found in config file(s). Setting to False")
     TESTTIMEAUG = False
 #look for do_crf in config
 if not 'DO_CRF' in locals():
+    print("DO_CRF not found in config file(s). Setting to False")
     DO_CRF = False
 if not 'WRITE_MODELMETADATA' in locals():
+    print("WRITE_MODELMETADATA not found in config file(s). Setting to False")
     WRITE_MODELMETADATA = False
 if not 'OTSU_THRESHOLD' in locals():
+    print("OTSU_THRESHOLD not found in config file(s). Setting to False")
     OTSU_THRESHOLD = False
 
 # Import do_seg() from doodleverse_utils to perform the segmentation on the images
 for f in tqdm(sample_filenames):
     try:
         do_seg(f, M, metadatadict, sample_direc,NCLASSES,N_DATA_BANDS,TARGET_SIZE,TESTTIMEAUG, WRITE_MODELMETADATA,DO_CRF,OTSU_THRESHOLD)
     except:
-        print("{} failed".format(f))
+        print("{} failed. Check config file, and check the path provided contains valid imagery".format(f))
 
 
diff --git a/utils/recreate_saved_model.py → utils/gen_fullmodel_from_h5.py b/utils/recreate_saved_model.py → utils/gen_fullmodel_from_h5.py
@@ -1,5 +1,3 @@
-
-
 # Written by Dr Daniel Buscombe, Marda Science LLC
 # for  the USGS Coastal Change Hazards Program
 #
@@ -26,7 +24,7 @@
 # SOFTWARE.
 
 import sys,os, time
-sys.path.insert(1, '../src')
+# sys.path.insert(1, '../src')
 from tkinter import filedialog
 from tkinter import *
 from tkinter import messagebox
@@ -56,8 +54,8 @@
     for k in config.keys():
         exec(k+'=config["'+k+'"]')
 
-
-    from imports import *
+    from doodleverse_utils.imports import *
+    from doodleverse_utils.model_imports import *
 
     #=======================================================
     # Import the architectures for following models from doodleverse_utils
@@ -92,8 +90,6 @@
                         )
 
     elif MODEL =='simple_resunet':
-        # num_filters = 8 # initial filters
-        # model = res_unet((TARGET_SIZE[0], TARGET_SIZE[1], N_DATA_BANDS), num_filters, NCLASSES, (KERNEL_SIZE, KERNEL_SIZE))
 
         model = simple_resunet((TARGET_SIZE[0], TARGET_SIZE[1], N_DATA_BANDS),
                     kernel = (2, 2),
@@ -107,7 +103,6 @@
                     filters=FILTERS,#8,
                     num_layers=4,
                     strides=(1,1))
-    #346,564
     elif MODEL=='simple_unet':
         model = simple_unet((TARGET_SIZE[0], TARGET_SIZE[1], N_DATA_BANDS),
                     kernel = (2, 2),
@@ -121,10 +116,8 @@
                     filters=FILTERS,#8,
                     num_layers=4,
                     strides=(1,1))
-    #242,812
 
     elif MODEL=='satunet':
-        #model = sat_unet((TARGET_SIZE[0], TARGET_SIZE[1], N_DATA_BANDS), num_classes=NCLASSES)
 
         model = custom_satunet((TARGET_SIZE[0], TARGET_SIZE[1], N_DATA_BANDS),
                     kernel = (2, 2),
@@ -139,9 +132,6 @@
                     num_layers=4,
                     strides=(1,1))
 
-
-
-    # model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = [mean_iou, dice_coef])
     model.compile(optimizer = 'adam', loss = tf.keras.losses.CategoricalCrossentropy())
 
     model.load_weights(weights)
@@ -150,7 +140,7 @@
     # use gym  make"fullmodel.h5" version which zoo can read "fullmodel.h5" 
     model.save(weights.replace('.h5','_fullmodel.h5'))
 
-    new_model = tf.keras.models.load_model(weights.replace('.h5','_fullmodel.h5'))
+    # new_model = tf.keras.models.load_model(weights.replace('.h5','_fullmodel.h5'))
 
 
 

diff --git a/gen_saved_model.py → utils/gen_saved_model.py b/gen_saved_model.py → utils/gen_saved_model.py
@@ -1,11 +1,32 @@
-
+# Written by Dr Daniel Buscombe, Marda Science LLC
+# for the USGS Coastal Change Hazards Program
+#
+# MIT License
+#
+# Copyright (c) 2020-22, Marda Science LLC
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
 
 import sys,os, time
-# sys.path.insert(1, 'src')
 from tqdm import tqdm
 
 USE_GPU = True
-
 SET_GPU = '0'
 
 ## to store interim model outputs and metadata, use True
@@ -22,7 +43,6 @@
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 
 
-# from prediction_imports import *
 #====================================================
 from doodleverse_utils.prediction_imports import *
 #---------------------------------------------------
@@ -81,9 +101,6 @@
                     )
 
 elif MODEL =='simple_resunet':
-    # num_filters = 8 # initial filters
-    # model = res_unet((TARGET_SIZE[0], TARGET_SIZE[1], N_DATA_BANDS), num_filters, NCLASSES, (KERNEL_SIZE, KERNEL_SIZE))
-
     model = simple_resunet((TARGET_SIZE[0], TARGET_SIZE[1], N_DATA_BANDS),
                 kernel = (2, 2),
                 num_classes=[NCLASSES+1 if NCLASSES==1 else NCLASSES][0],
@@ -96,7 +113,6 @@
                 filters=FILTERS,#8,
                 num_layers=4,
                 strides=(1,1))
-#346,564
 elif MODEL=='simple_unet':
     model = simple_unet((TARGET_SIZE[0], TARGET_SIZE[1], N_DATA_BANDS),
                 kernel = (2, 2),
@@ -110,21 +126,17 @@
                 filters=FILTERS,#8,
                 num_layers=4,
                 strides=(1,1))
-#242,812
-
 elif MODEL=='satunet':
-    #model = sat_unet((TARGET_SIZE[0], TARGET_SIZE[1], N_DATA_BANDS), num_classes=NCLASSES)
-
     model = custom_satunet((TARGET_SIZE[0], TARGET_SIZE[1], N_DATA_BANDS),
                 kernel = (2, 2),
                 num_classes=[NCLASSES+1 if NCLASSES==1 else NCLASSES][0],
                 activation="relu",
                 use_batch_norm=True,
-                dropout=DROPOUT,#0.1,
-                dropout_change_per_layer=DROPOUT_CHANGE_PER_LAYER,#0.0,
-                dropout_type=DROPOUT_TYPE,#"standard",
-                use_dropout_on_upsampling=USE_DROPOUT_ON_UPSAMPLING,#False,
-                filters=FILTERS,#8,
+                dropout=DROPOUT,
+                dropout_change_per_layer=DROPOUT_CHANGE_PER_LAYER,
+                dropout_type=DROPOUT_TYPE,
+                use_dropout_on_upsampling=USE_DROPOUT_ON_UPSAMPLING,
+                filters=FILTERS,
                 num_layers=4,
                 strides=(1,1))
 

diff --git a/utils/make_class_balanced_subset.py b/utils/make_class_balanced_subset.py
@@ -1,22 +1,67 @@
+# Written by Dr Daniel Buscombe, Marda Science LLC
+# for the USGS Coastal Change Hazards Program
+#
+# MIT License
+#
+# Copyright (c) 2022, Marda Science LLC
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
 import numpy as np 
 from glob import glob 
 import os, shutil
 from tqdm import tqdm
+from tkinter import filedialog, messagebox
+from tkinter import *
 
-indirec = 'v4'
-outdirec = 'v5_subset'
-
-# read files
-files = glob('v4/*.npz')
+# Request the folder containing the class-imbalanced npz files to subset
+# indirec: full path to the selected directory
+root = Tk()
+root.filename =  filedialog.askdirectory(title = "Select directory of class-imbalanced npzs")
+indirec = root.filename
+print(indirec)
+root.withdraw()
 
-# make directory for outputs
-os.mkdir(outdirec)
-os.mkdir(outdirec+os.sep+'no_use')
+outdirec = os.path.normpath(os.path.dirname(indirec)+os.sep+os.path.basename(indirec)+'_subset')
+print(outdirec)
 
 # set minimum threshold for any proportion
 # if any normalized class frequency distribution is less than this number
 # it is discounted (moved to 'no_use')
-thres = 1e-2
+# thres = 1e-2
+print("Input threshold for minor class [0 - 1], typically <0.25")
+print("This is the smallest acceptable proportion of the minority class. Samples where minority < threshold will not be used")
+print("The larger the threshold, the fewer the number of samples used in the subset")
+# print("\n")
+thres = float(input())
+
+print("Threshold chosen: {}".format(thres))
+
+# read files
+files = glob(indirec+os.sep+'*.npz')
+
+try:
+    # make directory for outputs
+    os.mkdir(outdirec)
+    os.mkdir(outdirec+os.sep+'no_use')
+except:
+    pass
 
 # read files one by one
 for file in tqdm(files):
@@ -25,12 +70,11 @@
         label = np.argmax(label,-1)
         # get normalized class distributions
         norm_class_dist = np.bincount(label.flatten())/np.sum(label>-1)
-        # if length > 1, copy the file
         # if below thres
         if np.any(norm_class_dist<thres):
             shutil.copyfile(file,file.replace(indirec,outdirec+os.sep+'no_use'))
             # print('below threshold')
         elif not len(norm_class_dist)==1:
             shutil.copyfile(file,file.replace(indirec,outdirec))            
-        else:
+        else: # only one class present (length == 1): copy the file to 'no_use'
             shutil.copyfile(file,file.replace(indirec,outdirec+os.sep+'no_use'))