arapat · hughaharper · Jun 2, 2020 · Jun 2, 2020 · Jun 2, 2020 · Jun 2, 2020
diff --git a/.gitignore b/.gitignore
@@ -64,3 +64,6 @@ target/
 
 # Editor
 .vscode
+
+# other
+analysis/show*ipynb
diff --git a/DSC291.md b/DSC291.md
diff --git a/README.md b/README.md
@@ -11,6 +11,7 @@ If the input format is tsv, it will be written to disk in pickle files so that n
 * `test.py`: template code to be called by "__main__.py" proper functions for testing. It outputs a pickle file that
 contains scores in addition to some meta information about examples, e.g. cruise ID, longitute, latitude
 * `train.py`: template code to be called by "__main__.py" proper functions for training.
+* `clean_CM.py`: for applying model predictions to CM files. 
 * `config.json`: config such as the input data path, and the directory to write the models
 
 ## Typical usage
@@ -49,7 +50,7 @@ Then specify these three files to the training program in `config.json`.
 
 2. Run training with bootstrap
 
-The bathymetry module is implemented to train the models in different conditions (see `task_type` below). Note that 
+The bathymetry module is implemented to train the models in different conditions (see `task_type` below). Note that
  bootstrap is NOT implemented in this module.
 
 ```
@@ -64,7 +65,7 @@ python bathymetry <data_type> <task_type> <config_path>
    * "test-all": test the model trained on all data on the dataset from research institutions (test n times)
    * "train-instances": training a model using a data that is splitted on the instance level (ignore for now)
    * "test-instances": testing a model using a test set that was splitted on the instance level (ignore for now)
-   
+
 3. Run testing
 
 Testing is implemented in this module (see above).
@@ -79,19 +80,26 @@ Testing is implemented in this module (see above).
 
 ### Label
 
-The label is derived from the column 04 (see below), `sigd`: the example is labeled 0 if sigd == “9999”, and labeled 1 otherwise.
+Each row in the TSV file corresponds to one measurement. The descriptions of the columns can be found at
+[README.md](README.md).
+
+The learning task is binary classification, specifically, to decide if a depth measurement is correct or not.
+The label is 0 if it is wrong (or corrupted), and 1 if it is accurate.
+The human annotators put the label "9999" in the fifth column (index 04), `sigd`, if they think the measurement is wrong,
+and put any other value otherwise.
+The program we provide gets the data label using a function of the form `lambda row: row[4] != "9999"`.
 
 ### Description of all columns
 
-Each line in the `.tsv` data files should contain 35 columns. The meaning of the columns are as follows.
+Each line in the `.tsv` data files should contain 37 columns. The meanings of the columns are as follows.
 
 ```
 index name                                      Example              Description
 00    lon                                	143.92639            longitude of the location
 01    lat                                	-43.99727            latitude of the location
 02    depth                              	-4637                the depth measured by the crew
 03    sigh                               	0                    not sure what it means
-04    sigd                               	-1                   state according to human editor: 9999 = bad (do not incorporate into atlas), all other values = Good (incorporate into atlas), 
+04    sigd                               	-1                   state according to human editor: 9999 = bad (do not incorporate into atlas), all other values = Good (incorporate into atlas),
 05    SID                                	10088                Cruise ID, should not be used as features
 06    pred                               	-4633                the predicted depth with the gravity model
 07    ID                                 	1                    not sure what it means
@@ -123,6 +131,7 @@ index name                                      Example              Description
 33    D-MED30m/STD30m                    	0.0102018
 34    year                               	2000                The year of the measurement
 35    kind                               	G                   Device type used for measurements
+36    PRED-ABS(VGG_5m)
 ```
 
 ## Program output

diff --git a/__init__.py b/__init__.py
@@ -1,4 +1 @@
-from .common import TRAINING_FILES_DESC
-from .common import VALIDATION_FILES_DESC
-from .common import TESTING_FILES_DESC
 
diff --git a/__main__.py b/__main__.py
@@ -8,16 +8,20 @@
 from .train import run_training
 from .train import run_training_all
 from .train import run_training_specific_file
+from .train import run_training_n_times
 from .test import get_all_data
 from .test import run_testing
 from .test import run_testing_specific_file
 
 
-regions = ['AGSO', 'JAMSTEC', 'JAMSTEC2', 'NGA', 'NGA2', 'NGDC', 'NOAA_geodas', 'SIO', 'US_multi']
+regions = ['AGSO', 'JAMSTEC', 'JAMSTEC2', 'NGA', 'NGA2', 'NGDC', 'NOAA_geodas',
+        'SIO', 'US_multi', 'US_multi2']
+#regions = ['NGDC','US_multi','US_multi2']
+#regions = ['TEST-ATL','TEST-PAC']
 param1 = ["tsv", "pickle"]
 param2 = ["train", "train-all", "test-self", "test-cross", "test-all",
-          "train-instances", "test-instances"]
-usage_msg = "Usage: ./lgb.py <{}> <{}> <config_path>".format("|".join(param1), "|".join(param2))
+          "train-instances", "test-instances", "train-random"]
+usage_msg = "Usage: python -m bathymetry <{}> <{}> <config_path>".format("|".join(param1), "|".join(param2))
 
 
 @ray.remote
@@ -70,6 +74,12 @@ def run_testing_instances(model_name, regions):
         run_testing_specific_file(model_name, [filename], test_region_name, config, logger)
     run_testing_specific_file(model_name, filenames, "all", config, logger)
 
+@ray.remote
+def run_training_random(regions):
+    logger = Logger()
+    logfile = os.path.join(config["base_dir"], "training_log_all.log")
+    logger.set_file_handle(logfile)
+    run_training_n_times(config, regions, is_read_text, logger)
 
 def get_data():
     logger = Logger()
@@ -98,13 +108,16 @@ def get_data():
     init_setup(config["base_dir"])
     task = sys.argv[2].lower()
 
+
     ray.init(num_cpus=10)
     result_ids = []
     if task == "train":
         for region in regions:
             result_ids.append(run_training_one_region.remote(region))
     elif task == "train-all":
         run_training_all_regions(regions)
+    elif task == "train-random":
+        result_ids.append(run_training_random.remote(regions))
     elif task == "test-cross":
         for region in regions:
             result_ids.append(run_test.remote(region, regions, task))
@@ -121,6 +134,10 @@ def get_data():
     elif task == "test-self":
         for region in regions:
             result_ids.append(run_test.remote(region, [region], task))
+    elif task == "test-usm2":
+        for region in regions:
+            #result_ids.append(run_test(region, ['US_multi2'], "test-cross"))
+            result_ids.append(run_test.remote(region, ['US_multi2'], "test-cross"))
     else:
         assert(False)
     results = ray.get(result_ids)
diff --git a/analysis/01_feature_removal/PRC-ROC.ipynb b/analysis/01_feature_removal/PRC-ROC.ipynb
diff --git a/analysis/01_feature_removal/__init__.py b/analysis/01_feature_removal/__init__.py
diff --git a/analysis/01_feature_removal/feature-importance.ipynb b/analysis/01_feature_removal/feature-importance.ipynb
-Original file line number
+Diff line change
@@ Expand Up / @@ -64,3 +64,6 @@ target/ @@
     # Editor
     .vscode
+    # other
+    analysis/show*ipynb