From 70f0aa1937aa72d564fd82982951a46c944279eb Mon Sep 17 00:00:00 2001
From: hiboyang <14280154+hiboyang@users.noreply.github.com>
Date: Fri, 8 Sep 2023 17:22:55 -0700
Subject: [PATCH] Add Quick Start Guide for users to use this repo as a library

---
 README.md      |   6 ++-
 quick-start.md | 103 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 quick-start.md

diff --git a/README.md b/README.md
index 092fbe9..8b15743 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ not be used in any production setting. In addition, the PMC of the Apache Spark
 project reserves the right to withdraw and abandon the development of this
 project if it is not sustainable.
 
-### Getting started
+## Getting started
 ```
 git clone https://github.com/apache/spark-connect-go.git
 git submodule update --init --recursive
@@ -21,6 +21,10 @@ make gen && make test
 ```
 > Ensure you have installed `buf CLI`; [more info](https://buf.build/docs/installation/)
 
+## How to write a Spark Connect Go Application in your own project
+
+See [Quick Start Guide](quick-start.md)
+
 ## Spark Connect Go Application Example
 
 A very simple example in Go looks like following:
diff --git a/quick-start.md b/quick-start.md
new file mode 100644
index 0000000..e2cbfe6
--- /dev/null
+++ b/quick-start.md
@@ -0,0 +1,103 @@
+# Quick Start Guide to Write a Spark Connect Client Application in Go
+
+## Add Reference to `spark-connect-go` Library
+
+In your Go project `go.mod` file, add the `spark-connect-go` library:
+```
+require (
+	github.com/apache/spark-connect-go/v34 master
+)
+```
+
+In your Go project, run `go mod tidy` to download the library on your local machine.
+
+## Write Spark Connect Client Application
+
+Create a `main.go` file with the following code:
+```
+package main
+
+import (
+	"flag"
+	"log"
+
+	"github.com/apache/spark-connect-go/v34/client/sql"
+)
+
+var (
+	remote = flag.String("remote", "sc://localhost:15002",
+		"the remote address of Spark Connect server to connect to")
+)
+
+func main() {
+	flag.Parse()
+	spark, err := sql.SparkSession.Builder.Remote(*remote).Build()
+	if err != nil {
+		log.Fatalf("Failed: %s", err.Error())
+	}
+	defer spark.Stop()
+
+	df, err := spark.Sql("select 'apple' as word, 123 as count union all select 'orange' as word, 456 as count")
+	if err != nil {
+		log.Fatalf("Failed: %s", err.Error())
+	}
+
+	log.Printf("DataFrame from sql: select 'apple' as word, 123 as count union all select 'orange' as word, 456 as count")
+	err = df.Show(100, false)
+	if err != nil {
+		log.Fatalf("Failed: %s", err.Error())
+	}
+
+	rows, err := df.Collect()
+	if err != nil {
+		log.Fatalf("Failed: %s", err.Error())
+	}
+
+	for _, row := range rows {
+		log.Printf("Row: %v", row)
+	}
+
+	err = df.Write().Mode("overwrite").
+		Format("parquet").
+		Save("file:///tmp/spark-connect-write-example-output.parquet")
+	if err != nil {
+		log.Fatalf("Failed: %s", err.Error())
+	}
+
+	df, err = spark.Read().Format("parquet").
+		Load("file:///tmp/spark-connect-write-example-output.parquet")
+	if err != nil {
+		log.Fatalf("Failed: %s", err.Error())
+	}
+
+	log.Printf("DataFrame from reading parquet")
+	df.Show(100, false)
+
+	err = df.CreateTempView("view1", true, false)
+	if err != nil {
+		log.Fatalf("Failed: %s", err.Error())
+	}
+
+	df, err = spark.Sql("select count, word from view1 order by count")
+	if err != nil {
+		log.Fatalf("Failed: %s", err.Error())
+	}
+
+	log.Printf("DataFrame from sql: select count, word from view1 order by count")
+	df.Show(100, false)
+}
+```
+
+## Start Spark Connect Server (Driver)
+
+Download a Spark distribution (3.4.0+), unzip the folder, and run the command:
+```
+sbin/start-connect-server.sh --packages org.apache.spark:spark-connect_2.12:3.4.0
+```
+
+## Run Spark Connect Client Application
+```
+go run main.go
+```
+
+You will see the client application connect to the Spark Connect server and print out the output from your application.