Update general logic.

kargibora · Mar 6, 2023 · da87464 · da87464
1 parent 9646657
commit da87464
Show file tree

Hide file tree

Showing 16 changed files with 805 additions and 212 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,4 +5,5 @@ wandb
 *.ipynb_checkpoints
 *.ipynb
 *.DS_Store
-*.pth
+*.pth
+regression_datasets/*
diff --git a/configs/toy_dataset.yaml b/configs/toy_dataset.yaml
@@ -1,19 +1,30 @@
-network: # basic MLP layer configuration
-  num_networks: 5
-  layer_sizes: [16,32,64]
+estimator: # uncertainty estimator configuration (estimator class, MLP model, optimizer)
+  class: 'ensemble'
+  model:
+    num_networks : 5
+    layer_sizes : [50]
+  optimizer:
+    class : 'Adam'
+    lr : 0.01
 dataset: # for now, just use toy-dataset
   class: 'toy'
   func: 'toy_function_complex'
-  bounds : [0, 4]
+  bounds: [-6, -4, 1, 4]
+  sigmas: [0, 0, 0]
+  imbalance_ratios: [0.1, 0.9, 0.7]
   batch_size : 128   
+  test_ratio : 0.1
+transforms:
+  x :
+    - class : standardize
+  y :
+    - class : standardize
 train:
-  num_iter : 5000
-  print_every : 250
-  weighted : False
-test:
-  batch_size : 32
-logger:
-  type: 'wandb'
-  project: 'uncertainty-estimation'
-  entity: 'kbora'
-  name: 'Toy Dataset Complex Weighted'
+  train_type : iter
+  num_iter : 100
+# logger:
+#   type: 'wandb'
+#   project: 'uncertainty-estimation'
+#   entity: 'kbora'
+#   name: 'Toy Dataset Complex Weighted'
+
diff --git a/configs/xls_dataset.yaml b/configs/xls_dataset.yaml
@@ -1,30 +1,41 @@
+# Estimator configurations
 estimator: # basic MLP layer configuration
   class: 'ensemble'
-  model:
-    num_networks : 5
-    layer_sizes : [100]
+  num_networks : 5
+  network:
+    estimator_network:
+      - fc1 : {class : Linear, in_features : 13, out_features : 32}
+      - fc2 : {class : Linear, in_features : 32, out_features : 64}
+      - projection : {class : LinearVarianceNetworkHead, in_features : 64, out_features : 1}
+    predictor_network: 
+      - fc1 : {class : Linear, in_features : 13, out_features : 32}
+      - fc2 : {class : Linear, in_features : 32, out_features : 64}
+      - projection : {class : Linear, in_features : 64, out_features : 1}
   optimizer:
     class : 'Adam'
     lr : 0.01
-dataset: # for now, just use toy-dataset
+
+# Dataset configurations
+dataset:
   class: 'xls'
-  path: "regression_datasets/year_prediction.csv"
-  batch_size : 128   
-  cv_split_num: 1
+  path: "regression_datasets/boston.csv"
+  batch_size : 512  
+  cv_split_num: 10
   test_ratio: 0.10
-  y_col : [0]
 transforms:
   x :
-    - class : standardize
+    - {class : standardize}
   y :
-    - class : standardize
+    - {class : standardize}
+
+# Training configurations
 train:
   train_type : epoch
   num_iter : 40
-  print_every : 5 
-  val_every: 10
-logger:
-  type: 'wandb'
-  project: 'uncertainty-estimation'
-  entity: 'kbora'
-  name: 'Toy Dataset Complex Weighted'
+
+# Logger configurations
+# logger:
+#   type: 'wandb'
+#   project: 'uncertainty-estimation'
+#   entity: 'kbora'
+#   name: 'Toy Dataset Complex Weighted'
diff --git a/datasets/toy_data.py b/datasets/toy_data.py
@@ -45,7 +45,9 @@ def complex(x):
 class ToyDataset(Dataset):
     def __init__(self, x, y):
         self.x = np.array(x)
-        self.y = np.array(y)  
+        self.y = None
+        if y is not None:
+            self.y = np.array(y)  
 
     def __len__(self):
         """
@@ -57,44 +59,46 @@ def __getitem__(self, idx):
         """
         Transform the data to torch.Tensor
         """
-        return self.x[idx], self.y[idx]
+        if self.y is not None:
+            return self.x[idx], self.y[idx]
+        else:
+            return self.x[idx]
 
 def make_toy_dataset(
     func : Callable, **kwargs) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
 
-    data_range = kwargs.get("data_range", 7)
     data_step = kwargs.get("data_step", 0.001)
-    bounds = kwargs.get("bounds", (-2, 2))
-    sigmas = kwargs.get("sigmas", (0.1, 0.5))
-    test_ratio = kwargs.get("test_ratio", 0.2)
+    bounds = kwargs.get("bounds")
+    sigmas = kwargs.get("sigmas")
+    imbalance_ratios = kwargs.get("imbalance_ratios")
 
-    bound1, bound2 = bounds
-    data_sigma1, data_sigma2 = sigmas
+    assert len(bounds) - 1 == len(sigmas)
+    assert len(bounds) - 1 == len(imbalance_ratios)
 
-    data_x1 = np.arange(-data_range, bound1 + data_step, data_step)
-    data_x2 = np.arange(bound2, data_range + data_step, data_step)
 
-    data_x = np.concatenate((data_x1, data_x2))
-    data_x = np.reshape(data_x, [data_x.shape[0], 1])
+    x,y = [], []
+    for i in range(len(sigmas)):
+        bound = bounds[i]
+        next_bound = bounds[i+1]
+        sigma = sigmas[i]
 
-    data_y = np.zeros([data_x.shape[0], 1])
+        data_x = np.arange(bound, next_bound + data_step, data_step)
+        data_y = func(data_x) + np.random.normal(0, sigma, data_x.shape)
 
-    for i in range(data_x.shape[0]):
-        if (data_x[i,0] < bound1): 
-            data_y[i, 0] = func(data_x[i,0]) + np.random.normal(0, data_sigma1)
-        else:
-            data_y[i, 0] = func(data_x[i,0]) + np.random.normal(0, data_sigma2)
+        imbalance_ratio = imbalance_ratios[i]
+        data_x, data_y = shuffle(data_x, data_y)
+
+        num_to_drop = int(data_x.shape[0] * imbalance_ratio)
 
-    data_x, data_y = shuffle(data_x, data_y)
-
-    num_train_data = int(data_x.shape[0] * (1 - test_ratio))
+        data_x = data_x[num_to_drop:] # arrays were shuffled above, so slicing off the first num_to_drop points drops a random subset
+        data_y = data_y[num_to_drop:]
 
-    train_x = data_x[:num_train_data, :]
-    train_y = data_y[:num_train_data, :]
-    test_x  = data_x[num_train_data:, :]
-    test_y  = data_y[num_train_data:, :]
+        x.extend(data_x)
+        y.extend(data_y)
 
-    return train_x, train_y, test_x, test_y
+    x = np.array(x, dtype=np.float32).reshape(-1,1)
+    y = np.array(y, dtype=np.float32).reshape(-1,1)
+    return x, y
 
 def create_toy_dataloader(**kwargs):
     """
@@ -112,13 +116,35 @@ def create_toy_dataloader(**kwargs):
         raise ValueError("Please provide a function to sample from.")
     func = TOY_FUNC_REGISTRY[func]()
 
+    cv_split_num = kwargs.pop("cv_split_num", 1)
     batch_size = kwargs.pop("batch_size", 32)
-    train_x, train_y, test_x, test_y = make_toy_dataset(func = func, **kwargs)
-    train_dataset = ToyDataset(train_x, train_y)
-    test_dataset = ToyDataset(test_x, test_y)
-    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
-    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
-    return train_loader, test_loader, train_dataset, test_dataset
+    test_ratio = kwargs.pop("test_ratio", 0.2)
+    x,y = make_toy_dataset(func = func, **kwargs)
+
+    if batch_size == -1:
+        batch_size = x.shape[0]
+
+    num_train_data = int(x.shape[0] * (1 - test_ratio))
+
+    for i in range(cv_split_num):
+        data_x, data_y = shuffle(x, y)
+        train_x = data_x[:num_train_data, :]
+        train_y = data_y[:num_train_data, :]
+        test_x  = data_x[num_train_data:, :]
+        test_y  = data_y[num_train_data:, :]
+
+        plt.figure()
+        plt.scatter(train_x, train_y, s=1, c="b")
+        plt.scatter(test_x, test_y, s=1, c="r")
+        plt.title("Toy Data")
+        plt.savefig("figures/toy_data.png")
+
+        train_dataset = ToyDataset(train_x, train_y)
+        val_dataset = ToyDataset(test_x, test_y)
+
+        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
+        yield train_loader, val_loader, train_dataset, val_dataset
 
 
 def sample_toy_data(func : Callable, start : float, end : float, step : float):

diff --git a/datasets/xls_data.py b/datasets/xls_data.py
@@ -80,6 +80,9 @@ def create_xls_dataloader(**kwargs):
     x, y = parser.parse(x_col, y_col)
     num_test = int(x.shape[0] * test_ratio)
 
+    if batch_size == -1:
+        batch_size = x.shape[0]
+
     # create train and test data for cross validation
     for i in range(cv_split_num):
         x, y = shuffle(x, y)

diff --git a/estimations/__init__.py b/estimations/__init__.py
@@ -10,7 +10,7 @@ def create_estimator(cfg, **kwargs):
     if estimator_name not in ESTIMATOR_REGISTRY:
         raise ValueError(f"Unknown estimator : {estimator_name}")
     estimator = ESTIMATOR_REGISTRY[estimator_name](
-        network_config=cfg.get("model"),
+        network_config=cfg.get("network"),
         optimizer_config = cfg.get("optimizer"),
         **kwargs)
     return estimator
diff --git a/estimations/base.py b/estimations/base.py
@@ -1,18 +1,29 @@
 import typing
 import torch
-import abc
-
 
+from models import create_model
 
 class UncertaintyEstimator(object):
     def init_estimator(self, **kwargs):
         raise NotImplementedError
 
+    def init_predictor(self, **kwargs):
+        raise NotImplementedError
+
     def train_estimator(self, **kwargs):
         raise NotImplementedError
 
     def test_estimator(self, **kwargs):
         raise NotImplementedError
+
+    def train_predictor(self, **kwargs):
+        raise NotImplementedError
+
+    def test_predictor(self, **kwargs):
+        raise NotImplementedError
+
+    def _build_network(self, network_build_config, network_name):
+        return create_model(network_build_config.get(network_name, None))
 
     def __repr__(self) -> str:
         return self.__class__.__name__ + '()'