From b3c5f9b6ff174abac0e53b956d0bf4004f1fddd1 Mon Sep 17 00:00:00 2001
From: Neylson Crepalde
Date: Tue, 22 Feb 2022 11:47:30 -0300
Subject: [PATCH] doc/aws-example (#10)

* doc/aws-example

Include AWS example with secret generation and spark code configuration to access S3

* doc/aws-example

change inner README

* update chart version

Co-authored-by: Neylson Crepalde
---
 README.md                          | 60 ++++++++++++++++++++++++++++--
 charts/pyspark-notebook/Chart.yaml |  2 +-
 charts/pyspark-notebook/README.md  | 60 ++++++++++++++++++++++++++++--
 3 files changed, 115 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index ef8f0b6..54da6d8 100644
--- a/README.md
+++ b/README.md
@@ -57,7 +57,7 @@ You should be able to access the frontend via http://localhost:8888.
 
 ```(shell)
 kubectl exec -it pod/pyspark-0 -- bash
-jupyter notebook list
+jupyter server list
 ```
 
 ## LoadBalancer
@@ -72,7 +72,7 @@ Create secret
 ```sh
 kubectl create secret generic gcs-credentials --from-file="./config/key.json"
 ```
-Alter values.yaml
+Alter `values.yaml`
 
 ```yaml
 env:
@@ -88,4 +88,58 @@ extraVolumeMounts:
   - name: secrets
     mountPath: "/mnt/secrets"
     readOnly: true
-```
\ No newline at end of file
+```
+
+
+## AWS Example
+
+Create a secret from a `key.json` file:
+```sh
+kubectl create secret generic aws-credentials --from-file="./config/key.json"
+```
+
+Or you can create the secret directly in the terminal:
+```sh
+kubectl create secret generic aws-credentials --from-literal=aws_access_key_id=<your-access-key-id> --from-literal=aws_secret_access_key=<your-secret-access-key>
+```
+
+Alter `values.yaml` to set your AWS credentials as environment variables:
+```yaml
+# Allows you to load environment variables from a Kubernetes secret
+secret:
+  - envName: AWS_ACCESS_KEY_ID
+    secretName: aws-credentials
+    secretKey: aws_access_key_id
+  - envName: AWS_SECRET_ACCESS_KEY
+    secretName: aws-credentials
+    secretKey: aws_secret_access_key
+```
+
+Then deploy the Helm chart with the `helm install` command shown above.
+
+For the notebook to connect to AWS S3, you have to set up the correct Spark configuration in your `.py` file. An example:
+```python
+from pyspark import SparkConf, SparkContext
+from pyspark.sql import functions as f
+from pyspark.sql import SparkSession
+
+# Spark configuration: S3A filesystem with credentials read from the environment
+conf = (
+    SparkConf().set('spark.executor.extraJavaOptions', '-Dcom.amazonaws.services.s3.enableV4=true')
+    .set('spark.driver.extraJavaOptions', '-Dcom.amazonaws.services.s3.enableV4=true')
+    .set('spark.hadoop.fs.s3a.fast.upload', 'true')
+    .set('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
+    .set('spark.jars.packages', 'software.amazon.awssdk:s3:2.17.133,org.apache.hadoop:hadoop-aws:3.2.0')
+    .set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider')
+)
+sc = SparkContext.getOrCreate(conf=conf)
+
+spark = SparkSession(sc)
+
+df = spark.read.parquet("s3a://<bucket>/<path-to-data>")
+
+df.printSchema()
+```
+
+Make sure the credentials you passed as environment variables have access to the S3 bucket.
+
diff --git a/charts/pyspark-notebook/Chart.yaml b/charts/pyspark-notebook/Chart.yaml
index c5c85a0..78f6681 100644
--- a/charts/pyspark-notebook/Chart.yaml
+++ b/charts/pyspark-notebook/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.2.0
+version: 0.2.1
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
diff --git a/charts/pyspark-notebook/README.md b/charts/pyspark-notebook/README.md
index ef8f0b6..54da6d8 100644
--- a/charts/pyspark-notebook/README.md
+++ b/charts/pyspark-notebook/README.md
@@ -57,7 +57,7 @@ You should be able to access the frontend via http://localhost:8888.
 
 ```(shell)
 kubectl exec -it pod/pyspark-0 -- bash
-jupyter notebook list
+jupyter server list
 ```
 
 ## LoadBalancer
@@ -72,7 +72,7 @@ Create secret
 ```sh
 kubectl create secret generic gcs-credentials --from-file="./config/key.json"
 ```
-Alter values.yaml
+Alter `values.yaml`
 
 ```yaml
 env:
@@ -88,4 +88,58 @@ extraVolumeMounts:
   - name: secrets
     mountPath: "/mnt/secrets"
     readOnly: true
-```
\ No newline at end of file
+```
+
+
+## AWS Example
+
+Create a secret from a `key.json` file:
+```sh
+kubectl create secret generic aws-credentials --from-file="./config/key.json"
+```
+
+Or you can create the secret directly in the terminal:
+```sh
+kubectl create secret generic aws-credentials --from-literal=aws_access_key_id=<your-access-key-id> --from-literal=aws_secret_access_key=<your-secret-access-key>
+```
+
+Alter `values.yaml` to set your AWS credentials as environment variables:
+```yaml
+# Allows you to load environment variables from a Kubernetes secret
+secret:
+  - envName: AWS_ACCESS_KEY_ID
+    secretName: aws-credentials
+    secretKey: aws_access_key_id
+  - envName: AWS_SECRET_ACCESS_KEY
+    secretName: aws-credentials
+    secretKey: aws_secret_access_key
+```
+
+Then deploy the Helm chart with the `helm install` command shown above.
+
+For the notebook to connect to AWS S3, you have to set up the correct Spark configuration in your `.py` file. An example:
+```python
+from pyspark import SparkConf, SparkContext
+from pyspark.sql import functions as f
+from pyspark.sql import SparkSession
+
+# Spark configuration: S3A filesystem with credentials read from the environment
+conf = (
+    SparkConf().set('spark.executor.extraJavaOptions', '-Dcom.amazonaws.services.s3.enableV4=true')
+    .set('spark.driver.extraJavaOptions', '-Dcom.amazonaws.services.s3.enableV4=true')
+    .set('spark.hadoop.fs.s3a.fast.upload', 'true')
+    .set('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
+    .set('spark.jars.packages', 'software.amazon.awssdk:s3:2.17.133,org.apache.hadoop:hadoop-aws:3.2.0')
+    .set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider')
+)
+sc = SparkContext.getOrCreate(conf=conf)
+
+spark = SparkSession(sc)
+
+df = spark.read.parquet("s3a://<bucket>/<path-to-data>")
+
+df.printSchema()
+```
+
+Make sure the credentials you passed as environment variables have access to the S3 bucket.
+
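A quick way to confirm the wiring added by this patch end to end is to run a short check from inside the notebook once the chart is deployed. The snippet below is a minimal sketch rather than part of the committed README: it assumes the `aws-credentials` secret and the `values.yaml` mapping shown above, uses `SparkSession.builder` as an equivalent to the `SparkContext` plus `SparkSession(sc)` pair in the example with a pared-down configuration, and `my-bucket/some/prefix` is a hypothetical location to replace with data your credentials can actually read.

```python
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession

# Confirm the secret reached the pod environment before touching Spark.
for var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
    if not os.environ.get(var):
        raise RuntimeError(f"{var} is not set; check the secret section of values.yaml")

# S3A filesystem settings in the spirit of the README example, built on SparkSession.builder.
conf = (
    SparkConf()
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.0")
    .set("spark.hadoop.fs.s3a.aws.credentials.provider",
         "com.amazonaws.auth.EnvironmentVariableCredentialsProvider")
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Hypothetical path: replace with a parquet dataset the credentials can read.
df = spark.read.parquet("s3a://my-bucket/some/prefix/")
print(df.count())
```

If the read fails with an access-denied error, the credentials are reaching Spark but lack permission on the bucket, which is the check the last line of the README asks for.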