From b3c5f9b6ff174abac0e53b956d0bf4004f1fddd1 Mon Sep 17 00:00:00 2001
From: Neylson Crepalde
Date: Tue, 22 Feb 2022 11:47:30 -0300
Subject: [PATCH] doc/aws-example (#10)

* doc/aws-example

Include AWS example with secret generation and spark code configuration to access S3

* doc/aws-example

change inner README

* update chart version

Co-authored-by: Neylson Crepalde
---
 README.md                          | 60 ++++++++++++++++++++++++++++--
 charts/pyspark-notebook/Chart.yaml |  2 +-
 charts/pyspark-notebook/README.md  | 60 ++++++++++++++++++++++++++++--
 3 files changed, 115 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index ef8f0b6..54da6d8 100644
--- a/README.md
+++ b/README.md
@@ -57,7 +57,7 @@ You should be able to access the frontend via http://localhost:8888.
 
 ```(shell)
 kubectl exec -it pod/pyspark-0 -- bash
-jupyter notebook list
+jupyter server list
 ```
 
 ## LoadBalancer
@@ -72,7 +72,7 @@ Create secret
 ```sh
 kubectl create secret generic gcs-credentials --from-file="./config/key.json"
 ```
-Alter values.yaml
+Alter `values.yaml`
 
 ```yaml
 env:
@@ -88,4 +88,58 @@ extraVolumeMounts:
   - name: secrets
     mountPath: "/mnt/secrets"
     readOnly: true
-```
\ No newline at end of file
+```
+
+
+## AWS Example
+
+Create a secret from a `key.json` file:
+```sh
+kubectl create secret generic aws-credentials --from-file="./config/key.json"
+```
+
+Or you can create the secret directly in the terminal:
+```sh
+kubectl create secret generic aws-credentials --from-literal=aws_access_key_id=<your-access-key-id> --from-literal=aws_secret_access_key=<your-secret-access-key>
+```
+
+Alter `values.yaml` to set your AWS credentials as environment variables:
+```yaml
+# Allows you to load environment variables from a Kubernetes secret
+secret:
+  - envName: AWS_ACCESS_KEY_ID
+    secretName: aws-credentials
+    secretKey: aws_access_key_id
+  - envName: AWS_SECRET_ACCESS_KEY
+    secretName: aws-credentials
+    secretKey: aws_secret_access_key
+```
+
+Then deploy the Helm chart with the `helm install` command shown above.
+
+For the notebook to connect to AWS S3, you have to set up the correct Spark configuration in your `.py` file. An example:
+```python
+from pyspark import SparkConf, SparkContext
+from pyspark.sql import functions as f
+from pyspark.sql import SparkSession
+
+# Spark configuration: S3A filesystem with credentials read from the environment
+conf = (
+    SparkConf().set('spark.executor.extraJavaOptions', '-Dcom.amazonaws.services.s3.enableV4=true')
+    .set('spark.driver.extraJavaOptions', '-Dcom.amazonaws.services.s3.enableV4=true')
+    .set('spark.hadoop.fs.s3a.fast.upload', 'true')
+    .set('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
+    .set('spark.jars.packages', 'software.amazon.awssdk:s3:2.17.133,org.apache.hadoop:hadoop-aws:3.2.0')
+    .set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider')
+)
+sc = SparkContext.getOrCreate(conf=conf)
+
+spark = SparkSession(sc)
+
+df = spark.read.parquet("s3a://<bucket>/<path-to-data>")
+
+df.printSchema()
+```
+
+Make sure the credentials you passed as environment variables have access to the S3 bucket.
+
diff --git a/charts/pyspark-notebook/Chart.yaml b/charts/pyspark-notebook/Chart.yaml
index c5c85a0..78f6681 100644
--- a/charts/pyspark-notebook/Chart.yaml
+++ b/charts/pyspark-notebook/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.2.0
+version: 0.2.1
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
diff --git a/charts/pyspark-notebook/README.md b/charts/pyspark-notebook/README.md
index ef8f0b6..54da6d8 100644
--- a/charts/pyspark-notebook/README.md
+++ b/charts/pyspark-notebook/README.md
@@ -57,7 +57,7 @@ You should be able to access the frontend via http://localhost:8888.
 
 ```(shell)
 kubectl exec -it pod/pyspark-0 -- bash
-jupyter notebook list
+jupyter server list
 ```
 
 ## LoadBalancer
@@ -72,7 +72,7 @@ Create secret
 ```sh
 kubectl create secret generic gcs-credentials --from-file="./config/key.json"
 ```
-Alter values.yaml
+Alter `values.yaml`
 
 ```yaml
 env:
@@ -88,4 +88,58 @@ extraVolumeMounts:
   - name: secrets
     mountPath: "/mnt/secrets"
     readOnly: true
-```
\ No newline at end of file
+```
+
+
+## AWS Example
+
+Create a secret from a `key.json` file:
+```sh
+kubectl create secret generic aws-credentials --from-file="./config/key.json"
+```
+
+Or you can create the secret directly in the terminal:
+```sh
+kubectl create secret generic aws-credentials --from-literal=aws_access_key_id=<your-access-key-id> --from-literal=aws_secret_access_key=<your-secret-access-key>
+```
+
+Alter `values.yaml` to set your AWS credentials as environment variables:
+```yaml
+# Allows you to load environment variables from a Kubernetes secret
+secret:
+  - envName: AWS_ACCESS_KEY_ID
+    secretName: aws-credentials
+    secretKey: aws_access_key_id
+  - envName: AWS_SECRET_ACCESS_KEY
+    secretName: aws-credentials
+    secretKey: aws_secret_access_key
+```
+
+Then deploy the Helm chart with the `helm install` command shown above.
+
+For the notebook to connect to AWS S3, you have to set up the correct Spark configuration in your `.py` file. An example:
+```python
+from pyspark import SparkConf, SparkContext
+from pyspark.sql import functions as f
+from pyspark.sql import SparkSession
+
+# Spark configuration: S3A filesystem with credentials read from the environment
+conf = (
+    SparkConf().set('spark.executor.extraJavaOptions', '-Dcom.amazonaws.services.s3.enableV4=true')
+    .set('spark.driver.extraJavaOptions', '-Dcom.amazonaws.services.s3.enableV4=true')
+    .set('spark.hadoop.fs.s3a.fast.upload', 'true')
+    .set('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
+    .set('spark.jars.packages', 'software.amazon.awssdk:s3:2.17.133,org.apache.hadoop:hadoop-aws:3.2.0')
+    .set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider')
+)
+sc = SparkContext.getOrCreate(conf=conf)
+
+spark = SparkSession(sc)
+
+df = spark.read.parquet("s3a://<bucket>/<path-to-data>")
+
+df.printSchema()
+```
+
+Make sure the credentials you passed as environment variables have access to the S3 bucket.
+
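A quick way to confirm the wiring added by this patch end to end is to run a short check from inside the notebook once the chart is deployed. The snippet below is a minimal sketch rather than part of the committed README: it assumes the `aws-credentials` secret and the `values.yaml` mapping shown above, uses `SparkSession.builder` as an equivalent to the `SparkContext` plus `SparkSession(sc)` pair in the example with a pared-down configuration, and `my-bucket/some/prefix` is a hypothetical location to replace with data your credentials can actually read.

```python
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession

# Confirm the secret reached the pod environment before touching Spark.
for var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
    if not os.environ.get(var):
        raise RuntimeError(f"{var} is not set; check the secret section of values.yaml")

# S3A filesystem settings in the spirit of the README example, built on SparkSession.builder.
conf = (
    SparkConf()
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.0")
    .set("spark.hadoop.fs.s3a.aws.credentials.provider",
         "com.amazonaws.auth.EnvironmentVariableCredentialsProvider")
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Hypothetical path: replace with a parquet dataset the credentials can read.
df = spark.read.parquet("s3a://my-bucket/some/prefix/")
print(df.count())
```

If the read fails with an access-denied error, the credentials are reaching Spark but lack permission on the bucket, which is the check the last line of the README asks for.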