Commit

Merge branch 'feature/demo' into develop
Enrico Salvucci committed Feb 11, 2021
2 parents 400eff8 + a8e1d97 commit 47cdb86
Showing 8 changed files with 49,076 additions and 9 deletions.
85 changes: 85 additions & 0 deletions demo/README.md
@@ -0,0 +1,85 @@
# Demo

## Getting started

### Datasets
You can find the datasets used in this repository in [demo/datasets](/demo/datasets).

### Prerequisites
To run a demo of the whole system, first install:
* MLflow
* Kubeflow
* BentoML

Then set up a Google Cloud Build trigger and a Google Cloud Function following the instructions in the [doc](/doc).
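
For local experimentation, the Python tooling can be installed with pip. This is only a sketch: Kubeflow itself runs
on a Kubernetes cluster and is not installed this way; `kfp` is just the Kubeflow Pipelines SDK/CLI.

```
# Install the MLflow and BentoML packages plus the Kubeflow Pipelines SDK/CLI (kfp).
# Kubeflow itself must already be deployed on a cluster; the kfp package only talks to it.
pip install mlflow bentoml kfp
```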

### Set env variables
When your infrastructure is ready, run [set_environment_variables.sh](set_environment_variables.sh) to set the env
variables required to compile the two Kubeflow pipelines (and to run MLflow).

```
./set_environment_variables.sh
```
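
Note that executing the script as above runs it in a subshell, so the exported variables will not persist in your
current session. If you need them available for the commands below (for example `dsl-compile` or `mlflow run`),
source the script instead; checking one variable afterwards is a quick sanity test:

```
source ./set_environment_variables.sh
echo "$MLFLOW_TRACKING_URI"   # should print the value you configured
```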

## MLflow
If you want to run MLflow locally, make sure to set the URI of your MLflow server in set_environment_variables.sh.
Then move to the folder containing your MLflow code.

Start the MLflow UI
```
mlflow ui --backend-store-uri sqlite:///mlflow.db
```

and run your mlflow code
```
mlflow run . # followed by the required parameters
```

For example, in [/components/linear_regression_training](/components/linear_regression_training) you will run the
following command

```
mlflow run . -P dataset_path=/tmp/dataset.csv -P original_dataset_path=/tmp/it.csv
```
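
The local `mlflow ui` above uses the SQLite store in `mlflow.db`. If the run should instead be logged to the remote
tracking server used by the rest of the demo, export the tracking URI before running (the host below is a
placeholder):

```
export MLFLOW_TRACKING_URI=http://<MLFLOW_HOST>:5000   # placeholder, use your MLflow server URI
mlflow run . -P dataset_path=/tmp/dataset.csv -P original_dataset_path=/tmp/it.csv
```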

## Kubeflow pipelines
To compile one of the two pipelines, go into the pipeline directory and run the `dsl-compile` command

(from the root of this repository)
```
cd training_pipeline # or cd prediction_pipeline
dsl-compile --py main.py --output pipeline.tar.gz
```

You can manually upload your compiled pipeline to Kubeflow Pipelines, create a new Experiment (or use an existing one)
and run the pipeline.
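
If you prefer the command line, the compiled package can also be uploaded and run with the `kfp` CLI that ships with
the Kubeflow Pipelines SDK. This is only a sketch: the exact flags vary between kfp releases, so treat them as
assumptions and check `kfp --help` for your version.

```
# Upload the compiled pipeline and submit a run against your Kubeflow endpoint (flags may differ per kfp version)
kfp --endpoint $KUBEFLOW_HOST pipeline upload -p forecasting-example pipeline.tar.gz
kfp --endpoint $KUBEFLOW_HOST run submit -e "Forecast Example - Training" -r demo-run -f pipeline.tar.gz
```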

## Google Cloud Build
To set up a Cloud Build trigger, follow the instructions in [/doc/google_cloud_build](/doc/google_cloud_build).
A new build will appear in the "History" whenever you push to the branch selected in the trigger settings.

The whole Kubeflow pipeline will be rebuilt and run using the code you have just pushed.
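
The same build can also be started by hand, without pushing, by submitting the build config directly. A sketch: the
substitution values are assumptions and must match your setup, and the config also expects trigger-provided values
such as `$SHORT_SHA`, so a manual submit may need extra substitutions.

```
# Manually submit the training pipeline build; the _-prefixed substitutions mirror the trigger settings
gcloud builds submit --config training_pipeline/cloudbuild.yaml \
  --substitutions=_PROJECT_NAME=forecasting_example,_DOCKER_CONTAINER_REGISTRY_BASE_URL=gcr.io/<YOUR_PROJECT> .
```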

## Google Cloud Functions
To set up a Cloud Function, follow the instructions in [/doc/google_cloud_functions](/doc/google_cloud_functions).
A new execution is triggered whenever a file is added to the trigger bucket (see the deploy command below).

The Kubeflow (training) pipeline will be compiled and run using, as input, the data that has been added to the target
bucket.
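
Once the Function is deployed, the pipeline can therefore be triggered simply by copying a new dataset into the
watched bucket; a sketch, assuming `TRIGGER_BUCKET` is the bucket name used in the deploy command below:

```
# Uploading a file fires the google.storage.object.finalize event and starts the training pipeline
gsutil cp demo/datasets/it+1_day.csv gs://${TRIGGER_BUCKET}/
```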

To deploy a new Function, first delete the existing one

```
gcloud functions delete <function name>
```

(in this example, ```gcloud functions delete run_pipeline```)

then run

```
gcloud functions deploy run_pipeline --runtime python37 \
  --trigger-resource ${TRIGGER_BUCKET} \
  --trigger-event google.storage.object.finalize \
  --env-vars-file .env.yaml
```

(where `run_pipeline` is the name of the function, in your Python code, to be executed)
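
To check that the Function is actually firing, its recent executions can be inspected from the command line; a
minimal sketch:

```
# Show the latest log entries for the deployed Function
gcloud functions logs read run_pipeline --limit 50
```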
25 changes: 25 additions & 0 deletions demo/datasets/it+1_day.csv
@@ -0,0 +1,25 @@
start,end,load
2020-07-31 00:00:00+00:00,2020-07-31 01:00:00+00:00,31346.0
2020-07-31 01:00:00+00:00,2020-07-31 02:00:00+00:00,30501.0
2020-07-31 02:00:00+00:00,2020-07-31 03:00:00+00:00,30229.0
2020-07-31 03:00:00+00:00,2020-07-31 04:00:00+00:00,31037.0
2020-07-31 04:00:00+00:00,2020-07-31 05:00:00+00:00,33482.0
2020-07-31 05:00:00+00:00,2020-07-31 06:00:00+00:00,38265.0
2020-07-31 06:00:00+00:00,2020-07-31 07:00:00+00:00,43026.0
2020-07-31 07:00:00+00:00,2020-07-31 08:00:00+00:00,45914.0
2020-07-31 08:00:00+00:00,2020-07-31 09:00:00+00:00,47394.0
2020-07-31 09:00:00+00:00,2020-07-31 10:00:00+00:00,48363.0
2020-07-31 10:00:00+00:00,2020-07-31 11:00:00+00:00,48093.0
2020-07-31 11:00:00+00:00,2020-07-31 12:00:00+00:00,48353.0
2020-07-31 12:00:00+00:00,2020-07-31 13:00:00+00:00,49152.0
2020-07-31 13:00:00+00:00,2020-07-31 14:00:00+00:00,49418.0
2020-07-31 14:00:00+00:00,2020-07-31 15:00:00+00:00,49528.0
2020-07-31 15:00:00+00:00,2020-07-31 16:00:00+00:00,48434.0
2020-07-31 16:00:00+00:00,2020-07-31 17:00:00+00:00,47289.0
2020-07-31 17:00:00+00:00,2020-07-31 18:00:00+00:00,46327.0
2020-07-31 18:00:00+00:00,2020-07-31 19:00:00+00:00,45148.0
2020-07-31 19:00:00+00:00,2020-07-31 20:00:00+00:00,44005.0
2020-07-31 20:00:00+00:00,2020-07-31 21:00:00+00:00,40940.0
2020-07-31 21:00:00+00:00,2020-07-31 22:00:00+00:00,37607.0
2020-07-31 22:00:00+00:00,2020-07-31 23:00:00+00:00,34817.0
2020-07-31 23:00:00+00:00,2020-08-01 00:00:00+00:00,32453.0
48,913 changes: 48,913 additions & 0 deletions demo/datasets/it.csv

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions demo/set_environment_variables.sh
@@ -0,0 +1,14 @@
# Change the DOCKER_CONTAINER_REGISTRY_BASE_URL, KUBEFLOW_HOST and MLFLOW_TRACKING_URI variables according to your
# Docker container registry URL, Kubeflow endpoint URL and MLflow endpoint URL

# e.g. docker.io/repository_name or gcr.io/repository_name
export DOCKER_CONTAINER_REGISTRY_BASE_URL=<DOCKER_CONTAINER_REGISTRY>
export PROJECT_NAME='forecasting_example'
export DATA_INGESTION='data_ingestion'
export DATA_PREPARATION='data_preparation'
export BATCH_PREDICTION='scikit_learn_batch_prediction'
export INFERENCE_SERVICE='scikit_learn_inference_service'
export TAG='latest'
export MLFLOW_TRACKING_URI=<MLFLOW_TRACKING_URI> # e.g. http://34.91.32.10:5000
export KUBEFLOW_HOST=<KUBEFLOW_HOST>
export GOOGLE_APPLICATION_CREDENTIALS=<YOUR SERVICE ACCOUNT JSON FILE>
17 changes: 14 additions & 3 deletions prediction_pipeline/cloudbuild.yaml
@@ -5,7 +5,10 @@ steps:
# Args contain the arguments to be passed to the entry points.
args: ['build', '-t', '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_DATA_INGESTION:$SHORT_SHA', '.']
dir: $_COMPONENTS_FOLDER/$_DATA_INGESTION

# Build the component of the Kubeflow Pipeline
- name: 'gcr.io/cloud-builders/docker'
args: ['build', '-t', '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_DATA_PREPARATION:$SHORT_SHA', '.']
dir: $_COMPONENTS_FOLDER/$_DATA_PREPARATION
# Build the component of the Kubeflow Pipeline
- name: 'gcr.io/cloud-builders/docker'
args: ['build', '-t', '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_MODEL_LOADER:$SHORT_SHA', '.']
@@ -16,8 +19,10 @@ steps:
- name: 'gcr.io/cloud-builders/docker'
args: ['build', '-t', '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_INFERENCE_SERVICE:$SHORT_SHA', '.']
dir: $_COMPONENTS_FOLDER/$_INFERENCE_SERVICE
env:
- 'MLFLOW_TRACKING_URI=$_MLFLOW_TRACKING_URI'
# Build the component of the Kubeflow Pipeline
- name: 'gcr.io/cloud-builders/docker'
args: [ 'build', '-t', '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_BATCH_PREDICTION:$SHORT_SHA', '.' ]
dir: $_COMPONENTS_FOLDER/$_BATCH_PREDICTION

# Through the kfp-cli compile the pipeline
- name: 'gcr.io/$_PROJECT_ID/kfp-cli'
@@ -29,8 +34,10 @@ steps:
- 'HOST=$_ENDPOINT'
- 'PROJECT_NAME=$_PROJECT_NAME'
- 'DATA_INGESTION=$_DATA_INGESTION'
- 'DATA_PREPARATION=$_DATA_PREPARATION'
- 'MODEL_LOADER=$_MODEL_LOADER'
- 'INFERENCE_SERVICE=$_INFERENCE_SERVICE'
- 'BATCH_PREDICTION=$_BATCH_PREDICTION'
- 'MLFLOW_TRACKING_URI=$_MLFLOW_TRACKING_URI'
args:
- '-c'
@@ -74,3 +81,7 @@ steps:
images:
- '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_DATA_INGESTION:$SHORT_SHA'
- '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_MODEL_LOADER:$SHORT_SHA'
- '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_DATA_PREPARATION:$SHORT_SHA'
- '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_INFERENCE_SERVICE:$SHORT_SHA'
- '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_BATCH_PREDICTION:$SHORT_SHA'
timeout: 900s
3 changes: 2 additions & 1 deletion training_pipeline/.env.yaml
@@ -3,4 +3,5 @@ PROJECT_NAME: 'forecasting_example'
DATA_INGESTION: 'data_ingestion'
DATA_PREPARATION: 'data_preparation'
TAG: 'latest'
MLFLOW_TRACKING_URI: <MLFLOW_TRACKING_URI> # http://34.91.32.10:5000
MLFLOW_TRACKING_URI: <MLFLOW_TRACKING_URI> # e.g. http://34.91.32.10:5000
KUBEFLOW_HOST: <KUBEFLOW_HOST>
24 changes: 21 additions & 3 deletions training_pipeline/cloudbuild.yaml
@@ -12,10 +12,24 @@ steps:

# Build the component of the Kubeflow Pipeline
- name: 'gcr.io/cloud-builders/docker'
args: ['build', '-t', '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_MODEL_TRAINING:$SHORT_SHA', '.']
args: ['build', '-t', '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_RANDOM_FOREST_REGRESSOR_TRAINING:$SHORT_SHA', '.']
env:
- 'MLFLOW_TRACKING_URI=$_MLFLOW_TRACKING_URI'
dir: $_COMPONENTS_FOLDER/$_MODEL_TRAINING
dir: $_COMPONENTS_FOLDER/$_RANDOM_FOREST_REGRESSOR_TRAINING

# Build the component of the Kubeflow Pipeline
- name: 'gcr.io/cloud-builders/docker'
args: [ 'build', '-t', '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_LINEAR_REGRESSION_TRAINING:$SHORT_SHA', '.' ]
env:
- 'MLFLOW_TRACKING_URI=$_MLFLOW_TRACKING_URI'
dir: $_COMPONENTS_FOLDER/$_LINEAR_REGRESSION_TRAINING

# Build the component of the Kubeflow Pipeline
- name: 'gcr.io/cloud-builders/docker'
args: ['build', '-t', '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_PROMOTE_MODEL:$SHORT_SHA', '.']
env:
- 'MLFLOW_TRACKING_URI=$_MLFLOW_TRACKING_URI'
dir: $_COMPONENTS_FOLDER/$_PROMOTE_MODEL

# Through the kfp-cli compile the pipeline
- name: 'gcr.io/$_PROJECT_ID/kfp-cli'
@@ -30,6 +44,7 @@ steps:
- 'DATA_PREPARATION=$_DATA_PREPARATION'
- 'MODEL_TRAINING=$_MODEL_TRAINING'
- 'MLFLOW_TRACKING_URI=$_MLFLOW_TRACKING_URI'
- 'KUBEFLOW_HOST=$_ENDPOINT'
args:
- '-c'
# dsl-compile --py <PYTHON_FILE_NAME> --output <COMPILED_PIPELINE_FILENAME>
@@ -70,4 +85,7 @@ steps:
images:
- '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_DATA_INGESTION:$SHORT_SHA'
- '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_DATA_PREPARATION:$SHORT_SHA'
- '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_MODEL_TRAINING:$SHORT_SHA'
- '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_LINEAR_REGRESSION_TRAINING:$SHORT_SHA'
- '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_RANDOM_FOREST_REGRESSOR_TRAINING:$SHORT_SHA'
- '$_DOCKER_CONTAINER_REGISTRY_BASE_URL/$_PROJECT_NAME/$_PROMOTE_MODEL:$SHORT_SHA'
timeout: 900s
4 changes: 2 additions & 2 deletions training_pipeline/main.py
@@ -6,7 +6,7 @@
import logging
from kubernetes.client import V1EnvVar

HOST = 'https://34727f9010a6d1aa-dot-asia-east1.pipelines.googleusercontent.com'
HOST = os.environ['KUBEFLOW_HOST']
EXPERIMENT_NAME = 'Forecast Example - Training'


@@ -32,7 +32,7 @@ def run_pipeline(data, context):

@kfp.dsl.pipeline(name='Forecasting Example')
def pipeline(bucket_name: str = 'forecast-example'):
original_dataset_path = str(os.path.join('gs://', 'forecast-example', 'it.csv'))
original_dataset_path = 'gs://forecast-example/it.csv'

# Data Ingestion step
data_ingestion = __data_ingestion_step(bucket_name)
