Commit 6c128dc: code completed

mommermi committed Dec 31, 2019
1 parent 93e257c
Showing 22 changed files with 1,836 additions and 215 deletions.
70 changes: 70 additions & 0 deletions README.rst
@@ -0,0 +1,70 @@
Cloudynight - The All-sky Camera Cloud Detector
===============================================

This repository contains the code built for Mommert (2020): `Cloud Identification
from All-sky Camera Data with Machine Learning`, submitted.

The system consists of several parts:

* the ``cloudynight`` Python module, which contains tools for data handling and
preparation, feature extraction, model training and prediction
(see directory ``cloudynight/``);
* a Python Django web server application for database management, data
visualization, and manual labeling (see directory ``webapp/``);
* example data used in this work (see directory ``example_data/``);
* a number of scripts for testing the functionality on the example data
(see directory ``scripts/``).

Please note that ``cloudynight`` only utilizes the more efficient ``lightgbm``
classifier. The ResNet code is also included in ``scripts/`` for the sake of
completeness.

Use
---

``cloudynight`` contains all the parts necessary to build an automated cloud
detector, but it is not intended as plug-and-play software.

First, install the ``cloudynight`` module:

>>> cd /cloudynight
>>> python setup.py install

and run the provided example scripts to get familiar with the module.

Then, install the `web application <webapp/README.rst>`_.

To use the software for real-time cloud detection, write a script that
utilizes ``cloudynight.AllskyCamera.download_latest_data()`` to download data
from the camera computer. Then, utilize
``cloudynight.AllskyCamera.process_and_upload_data()`` to extract features and
upload them to the database. Both tasks can be automated with cron jobs.
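
A minimal sketch of such a script, assuming the configuration in
``cloudynight/__init__.py`` has been adapted to your site; the call sequence
is an assumption based on the module's docstrings, and any ``AllskyCamera``
constructor arguments are omitted::

    # hypothetical cron-driven acquisition script; not part of this
    # repository, so adapt the call sequence to your setup
    from datetime import datetime

    import cloudynight

    cam = cloudynight.AllskyCamera()

    # prepare the directory structure for the current night;
    # update_directories expects a "%Y%m%d" string
    cam.update_directories(datetime.utcnow().strftime('%Y%m%d'))

    # pull the newest FITS files from the camera host
    # (rsync, using conf.CAMHOST_NAME and conf.CAMHOST_BASEDIR)
    cam.download_latest_data()

    # extract per-subregion features and upload them to the database
    cam.process_and_upload_data()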

The ``label/`` task of the web application can be used for manual labeling
and training data generation. Once enough data are available, a modified
version of ``scripts/model_lightgbm.py`` can be used to tune model parameters
and fit a model (see the sketch below). Once a model is available, the
``check/`` task can be utilized for faster manual labeling.
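
A training run, closely following ``scripts/model_lightgbm.py``, might look
as follows; the search settings shown here are assumptions::

    from cloudynight import LightGBMModel

    model = LightGBMModel()

    # pull labeled feature data from the webapp database;
    # model.load_data(<feature file>) works on local files instead
    model.retrieve_training_data()

    # randomized cross-validation search over the parameter
    # distributions defined in the module configuration
    model.train_randomizedsearchcv(n_iter=100, cv=3)

    # persist the best estimator and report validation scores
    model.write_model()
    print('validation accuracy:', model.val_score)
    print('validation f1 score:', model.f1_score_val)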

Cloud coverage can be predicted for the latest image obtained from the camera
through the web API task ``predictLatestUnlabeled``, as sketched below.
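
A minimal sketch of such a query against a local test server; the endpoint
path, HTTP method, and response format are assumptions based on the default
configuration::

    import requests

    # hypothetical query; adapt URL and credentials to your deployment
    response = requests.get('http://127.0.0.1:8000/predictLatestUnlabeled',
                            auth=('writer', 'writecloud'))
    response.raise_for_status()

    # the response is assumed to contain the predicted per-subregion coverage
    print(response.text)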


Citing cloudynight
------------------

You can reference Mommert (2020): `Cloud Identification
from All-sky Camera Data with Machine Learning`, submitted.

Acknowledgements
----------------

The author would like to thank Ryan J. Kelly and the NAU/NASA Arizona Space Grant program
for enabling a case study for this project.

License
-------

This software is distributed under a `3-clause BSD license <LICENSE.rst>`_.


33 changes: 19 additions & 14 deletions cloudynight/__init__.py
@@ -21,13 +21,18 @@ def __init__(self):
# define base directory structure
# each directory gets `AllskyCamera.night` appended
# [here, directories are defined relative to the scripts/ directory]
self.DIR_BASE = '../'

self.DIR_BASE = os.path.join(
' ', *os.path.abspath(__file__).split('/')[:-2]).strip()
# location of module base (for example data)

self.DIR_RAW = os.path.join(self.DIR_BASE, 'example_data')

self.DIR_ARCHIVE = os.path.join(self.DIR_BASE, 'workbench')

# data directory location on host machine (where to pull FITS files from)
self.HOST_NAME = 'XXX'
self.HOST_BASEDIR = 'XXX'
self.CAMHOST_NAME = ''
self.CAMHOST_BASEDIR = ''

# FITS file prefix and suffix used by allsky images
self.FITS_PREFIX = ''
@@ -61,20 +66,18 @@ def __init__(self):
self.THUMBNAIL_SCALE = ZScaleInterval

# mask file
self.MASK_FILENAME = os.path.join(self.DIR_RAW, 'mask.fits')
self.MASK_FILENAME = os.path.abspath(os.path.join(self.DIR_RAW,
'mask.fits'))

# database URL and credentials
self.DB_URL = 'http://localhost/dct/'
#self.DB_URL = 'http://127.0.0.1:8000/dct/'
self.DB_URL = 'http://127.0.0.1:8000/'
self.DB_USER = 'writer'
self.DB_PWD = 'writelo-clouds'
self.DB_PWD = 'writecloud'

# url for retrieving training data
#self.TRAINDATA_URL = 'http://lo-clouds.lowell.edu/dct/get_trained'
self.TRAINDATA_URL = 'http://localhost/dct/get_trained'
self.UNTRAINDATA_URL = 'http://localhost/dct/get_untrained'
# self.UNTRAINDATA_URL = 'http://127.0.0.1:8000/dct/get_untrained'

self.TRAINDATA_URL = 'http://127.0.0.1:8000/getAllLabeled/'
self.UNTRAINDATA_URL = 'http://127.0.0.1:8000/getAllUnlabeled/'

self.LGBMODEL_PARAMETERS = {
'max_depth': 5,
'n_estimators': 500,
@@ -93,8 +96,10 @@ def __init__(self):
'min_child_samples': randint(low=10, high=190),
'reg_alpha': [1, 5, 10, 50, 100],
'reg_lambda': [1, 5, 10, 50, 100, 500, 1000]}



self.LGBMODEL_FILE = os.path.join(self.DIR_ARCHIVE,
'lightgbm.pickle')

def update_directories(self, night):
"""prepare directory structure for a given night, provided as string
in the form "%Y%m%d"""
Binary file modified cloudynight/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified cloudynight/__pycache__/cloudynight.cpython-36.pyc
Binary file not shown.
92 changes: 8 additions & 84 deletions cloudynight/cloudynight.py
@@ -287,7 +287,7 @@ def write_to_database(self):
return None

post_request = session.post(
conf.DB_URL+'data/Unlabeled/', ## XXX was Untrained
conf.DB_URL+'data/Unlabeled/',
headers=post_headers, auth=(conf.DB_USER, conf.DB_PWD),
json=data)

@@ -332,8 +332,8 @@ def download_latest_data(self):

# build rsync command
commandline = 'rsync -avr {}:{} {}'.format(
conf.HOST_NAME,
os.path.join(conf.HOST_BASEDIR, self.night,
conf.CAMHOST_NAME,
os.path.join(conf.CAMHOST_BASEDIR, self.night,
'*.{}'.format(conf.FITS_SUFFIX)), conf.DIR_RAW)

# download data
@@ -535,10 +535,6 @@ def generate_subregions(self):
arrays, each with the same dimensions as self.maskdata.
"""

conf.logger.info(
'creating subregions: {} rings and {} ring segments'.format(
conf.N_RINGS, conf.N_RINGSEGMENTS))

shape = np.array(self.maskdata.data.shape)
center_coo = shape/2
radius_borders = np.linspace(0, min(shape)/2,
@@ -601,8 +597,6 @@ def __init__(self):
self.val_score = None # model validation sample score
self.f1_score_val = None # model validation sample f1 score

conf.setupLogger()

def retrieve_training_data(self, size_limit=None):
"""Retrieves feature data from webapp database."""
n_subregions = conf.N_RINGS*conf.N_RINGSEGMENTS+1
@@ -611,7 +605,7 @@
if get.status_code != requests.codes.ok:
raise ServerError('could not retrieve training data from server')
raw = pd.DataFrame(get.json())

data = pd.DataFrame()
for j in range(len(raw['moonalt'])):
frame = pd.DataFrame(OrderedDict(
@@ -637,10 +631,6 @@
self.data_X = data.drop(['cloudy'], axis=1)
self.data_y = np.ravel(data.loc[:, ['cloudy']].values).astype(int)
self.data_X_featurenames = data.drop(['cloudy'], axis=1).columns.values

conf.logger.info(
'{} training subregion data points retrieved from database'.format(
len(self.data_y)))

# limit data set size to size_limit subregions
if size_limit is not None:
@@ -659,23 +649,14 @@ def load_data(self, filename):
self.data_y = np.ravel(data.loc[:, ['cloudy']].values).astype(int)
self.data_X_featurenames = data.drop(['cloudy'], axis=1).columns.values

conf.logger.info(
'{} training subregion data points retrieved from file'.format(
len(self.data_y)))

return len(self.data_y)

def train(self, parameters=conf.LGBMODEL_PARAMETERS, cv=5):
"""Train """

conf.logger.info('training model with parameters {}'.format(
parameters))

# split data into training and validation sample
X_cv, X_val, y_cv, y_val = train_test_split(
self.data_X, self.data_y, test_size=0.1, random_state=42)
conf.logger.info('splitting off validation sample: {}/{}'.format(
len(y_cv), len(y_val)))

# define model
lgb = LGBMClassifier(objective='binary', random_state=42,
@@ -692,12 +673,6 @@
self.parameters = parameters
self.val_score = self.model.score(X_val, y_val)
self.f1_score_val = f1_score(y_val, self.model.predict(X_val))
conf.logger.info('training sample accuracy: {}'.format(
self.train_score))
conf.logger.info('test sample accuracy: {}'.format(
self.test_score))
conf.logger.info('validation sample accuracy: {}'.format(
self.val_score))

return self.val_score

@@ -707,14 +682,10 @@ def train_randomizedsearchcv(self, n_iter=100,
cv=3, scoring="accuracy"):
"""Train the lightGBM model using a combined randomized
cross-validation search."""
conf.logger.info('running randomizedsearch_cv with distributions: {}'.format(
distributions))

# split data into training and validation sample
X_grid, X_val, y_grid, y_val = train_test_split(
self.data_X, self.data_y, test_size=0.1, random_state=42)
conf.logger.info('splitting off validation sample: {}/{}'.format(
len(y_grid), len(y_val)))

# initialize model
lgb = LGBMClassifier(objective='binary', random_state=42, n_jobs=-1)
@@ -725,79 +696,32 @@

# fit model
lgbrand.fit(X_grid, y_grid)
conf.logger.info('randomizedsearch_cv: {}'.format(lgbrand))

self.cv_results = lgbrand.cv_results_

self.model = lgbrand.best_estimator_
conf.logger.info('best estimator: {}'.format(self.model))

# derive scores
self.train_score = lgbrand.cv_results_['mean_train_score'][lgbrand.best_index_]
self.test_score = lgbrand.cv_results_['mean_test_score'][lgbrand.best_index_]
self.parameters = lgbrand.cv_results_['params'][lgbrand.best_index_]
self.val_score = self.model.score(X_val, y_val)
self.f1_score_val = f1_score(y_val, self.model.predict(X_val))
conf.logger.info('training sample accuracy: {}'.format(
self.train_score))
conf.logger.info('test sample accuracy: {}'.format(
self.test_score))
conf.logger.info('validation sample accuracy: {}'.format(
self.val_score))

return self.val_score

def write_model(self, filename):
def write_model(self,
                filename=os.path.join(conf.DIR_ARCHIVE, 'model.pickle')):
"""Write trained model to file."""
conf.logger.info('writing model to file {}'.format(filename))
self.filename = filename
dump(self.model, filename)

def read_model(self, filename):
def read_model(self,
               filename=os.path.join(conf.DIR_ARCHIVE, 'model.pickle')):
"""Read trained model from file."""
conf.logger.info('reading model from file {}'.format(filename))
self.filename = filename
self.model = load(filename)

def predict(self, X):
"""Predict cloud coverage for feature data."""
conf.logger.info('predict clouds for {} data point'.format(
len(X)))
return self.model.predict(X)

def upload(self):
"""Upload model parameters and filename to database."""
conf.logger.info('upload model parameters to database')

data = OrderedDict((
('filearchivepath', self.filename),
('n_train', self.data_X.shape[0]),
('train_score', self.train_score),
('test_score', self.test_score),
('learning_rate', self.model.learning_rate),
('max_depth', self.model.max_depth),
('min_child_samples', self.model.min_child_samples),
('min_child_weight', self.model.min_child_weight),
('min_split_gain', self.model.min_split_gain),
('n_estimators', self.model.n_estimators),
('num_leaves', self.model.num_leaves),
('reg_alpha', self.model.reg_alpha),
('reg_lambda', self.model.reg_lambda),
('subsample', self.model.subsample),
('subsample_for_bin', self.model.subsample_for_bin),
('subsample_freq', self.model.subsample_freq)))

session = requests.Session()
post_headers = {'Content-Type': 'application/json'}

post_request = session.post(
conf.DB_URL+'data/LGBModel/', headers=post_headers,
auth=(conf.DB_USER, conf.DB_PWD), json=data)

if not ((post_request.status_code == requests.codes.ok) or
(post_request.status_code == requests.codes.created)):
raise ServerError(
'could not upload model with error code {}.'.format(
post_request.status_code))

conf.logger.info('model upload succeeded')
19 changes: 19 additions & 0 deletions scripts/README.rst
@@ -0,0 +1,19 @@
Example Scripts
===============

The example scripts provided here are intended to give the interested user
some idea of how to use the ``cloudynight`` functionality.

The following example scripts are included here:

* ``generate_mask.py``: generate an image mask to mask the local horizon
* ``subregions.py``: create subregions and plot their locations on the sky
* ``extract_features.py``: read in images, mask them, and extract
per-subregion image features
* ``model_lightgbm.py``: load a feature file, train the `lightGBM` model, and
  predict cloud coverage for individual subregions
* ``resnet.py``: train the ResNet model and derive accuracy scores

All these example scripts are intended to work with the example data
provided. However, it should not be hard to modify the scripts to work with
data from other all-sky cameras.
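
After installing the module, the scripts can be run on the example data, for
example:

>>> cd scripts
>>> python generate_mask.py
>>> python subregions.py
>>> python extract_features.py
>>> python model_lightgbm.py
>>> python resnet.py

(``model_lightgbm.py`` reflects the renaming of ``lightgbm.py`` in this
commit; ``resnet.py`` may require additional deep-learning dependencies.)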
4 changes: 2 additions & 2 deletions scripts/lightgbm.py → scripts/model_lightgbm.py
@@ -8,10 +8,10 @@
(c) 2020, Michael Mommert (mommermiscience@gmail.com)
"""
import cloudynight
from cloudynight import LightGBMModel

# initialize model
model = cloudynight.LightGBMModel()
model = LightGBMModel()

# load feature example data
model.load_data('../example_data/features/fulltrainingsample_features.dat')
File renamed without changes.
3 changes: 2 additions & 1 deletion scripts/subregions.py
@@ -40,7 +40,7 @@
# # !!! this part of the script will only work if the webapp is setup properly
#
# # setup server credentials
# url = 'http://127.0.0.1:8000/cam/data/Subregions/' # for use with test server
# url = conf.HOST_NAME+conf.HOST_BASEDIR+'data/Subregion/' # for use with test server
# user = 'writer'
# pwd = '' # add password here
#
@@ -53,6 +53,7 @@
# print('uploading subregion', subi)
#
# # scale polygon coordinates to image size used in webapp
# # factors at the end are image sizes used in the webapp
# x = cam.polygons[subi][0]/cam.maskdata.data.shape[0]*460
# y = cam.polygons[subi][1]/cam.maskdata.data.shape[1]*465
#
