From d04e4c3e3c352144a53c3422a220f71d7a339744 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Tue, 27 Aug 2024 13:28:32 +0200 Subject: [PATCH 1/5] docs: Doucment how to switch to uif wrapper (#91) --- stacks/end-to-end-security/trino-policies.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/stacks/end-to-end-security/trino-policies.yaml b/stacks/end-to-end-security/trino-policies.yaml index 4edac1d1..bb5df220 100644 --- a/stacks/end-to-end-security/trino-policies.yaml +++ b/stacks/end-to-end-security/trino-policies.yaml @@ -174,6 +174,7 @@ data: # TODO: Once 24.11 is out, switch to https://github.com/stackabletech/opa-operator/pull/580 instead of doing the # http call itself + # extra_groups := data.stackable.opa.userinfo.v1.userInfoByUsername(input.context.identity.user).groups extra_groups := groups if { request := { "method": "POST", From 9bf5ba88eba661d07b74e2e72f4291d8fa33ed37 Mon Sep 17 00:00:00 2001 From: Techassi Date: Thu, 5 Sep 2024 16:07:05 +0200 Subject: [PATCH 2/5] ci: Add pre-commit workflow (#92) * ci: Add pre-commit workflow * chore: Remove unused env vars from workflow --- .github/workflows/pr_pre-commit.yml | 19 ++++++++++++++++ .markdownlint.yaml | 20 +++++++++++++++++ .pre-commit-config.yaml | 34 +++++++++++++++++++++++++++++ .yamllint.yaml | 9 ++++++++ 4 files changed, 82 insertions(+) create mode 100644 .github/workflows/pr_pre-commit.yml create mode 100644 .markdownlint.yaml create mode 100644 .pre-commit-config.yaml create mode 100644 .yamllint.yaml diff --git a/.github/workflows/pr_pre-commit.yml b/.github/workflows/pr_pre-commit.yml new file mode 100644 index 00000000..aeff8806 --- /dev/null +++ b/.github/workflows/pr_pre-commit.yml @@ -0,0 +1,19 @@ +--- +name: pre-commit + +on: + pull_request: + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + fetch-depth: 0 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: '3.12' + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + with: + extra_args: "--from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }}" diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 00000000..dbfa4558 --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,20 @@ +--- +# All defaults or options can be checked here: +# https://github.com/DavidAnson/markdownlint/blob/main/schema/.markdownlint.yaml + +# Default state for all rules +default: true + +# MD013/line-length - Line length +MD013: + # Number of characters + line_length: 9999 + # Number of characters for headings + heading_line_length: 9999 + # Number of characters for code blocks + code_block_line_length: 9999 + +# MD024/no-duplicate-heading/no-duplicate-header - Multiple headings with the same content +MD024: + # Only check sibling headings + siblings_only: true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..15287ec7 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,34 @@ +--- +default_language_version: + node: system + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: 2c9f875913ee60ca25ce70243dc24d5b6415598c # 4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: detect-aws-credentials + args: ["--allow-missing-credentials"] + - id: detect-private-key + + - repo: https://github.com/adrienverge/yamllint + rev: 
81e9f98ffd059efe8aa9c1b1a42e5cce61b640c6 # 1.35.1 + hooks: + - id: yamllint + + - repo: https://github.com/igorshubovych/markdownlint-cli + rev: f295829140d25717bc79368d3f966fc1f67a824f # 0.41.0 + hooks: + - id: markdownlint + + - repo: https://github.com/koalaman/shellcheck-precommit + rev: 2491238703a5d3415bb2b7ff11388bf775372f29 # 0.10.0 + hooks: + - id: shellcheck + args: ["--severity=info"] + + - repo: https://github.com/rhysd/actionlint + rev: 62dc61a45fc95efe8c800af7a557ab0b9165d63b # 1.7.1 + hooks: + - id: actionlint diff --git a/.yamllint.yaml b/.yamllint.yaml new file mode 100644 index 00000000..a944498c --- /dev/null +++ b/.yamllint.yaml @@ -0,0 +1,9 @@ +--- +extends: default + +rules: + line-length: disable + truthy: + check-keys: false + comments: + min-spaces-from-content: 1 # Needed due to https://github.com/adrienverge/yamllint/issues/443 From bffebdd4e59e463bc8bd477d8d97ed2bae99f112 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Tue, 10 Sep 2024 10:36:51 +0100 Subject: [PATCH 3/5] Remove the trino-subsea-data demo again (#94) --- demos/demos-v2.yaml | 19 ---- .../create-table-in-trino.yaml | 83 --------------- demos/trino-subsea-data/load-test-data.yaml | 21 ---- demos/trino-subsea-data/setup-superset.yaml | 95 ------------------ demos/trino-subsea-data/superset-assets.zip | Bin 6544 -> 0 bytes stacks/trino-superset-s3/superset.yaml | 5 - 6 files changed, 223 deletions(-) delete mode 100644 demos/trino-subsea-data/create-table-in-trino.yaml delete mode 100644 demos/trino-subsea-data/load-test-data.yaml delete mode 100644 demos/trino-subsea-data/setup-superset.yaml delete mode 100644 demos/trino-subsea-data/superset-assets.zip diff --git a/demos/demos-v2.yaml b/demos/demos-v2.yaml index e41560d3..4e04efb6 100644 --- a/demos/demos-v2.yaml +++ b/demos/demos-v2.yaml @@ -147,25 +147,6 @@ demos: cpu: 6800m memory: 15822Mi pvc: 28Gi - trino-subsea-data: - description: Demo loading ca. 600m^2 of ocean floor in a surface plot to visualize the irregularities of the ocean floor. - # documentation: -- Currently not documented - stackableStack: trino-superset-s3 - labels: - - trino - - superset - - minio - - s3 - - parquet - manifests: - - plainYaml: https://raw.githubusercontent.com/stackabletech/demos/main/demos/trino-subsea-data/load-test-data.yaml - - plainYaml: https://raw.githubusercontent.com/stackabletech/demos/main/demos/trino-subsea-data/create-table-in-trino.yaml - - plainYaml: https://raw.githubusercontent.com/stackabletech/demos/main/demos/trino-subsea-data/setup-superset.yaml - supportedNamespaces: [] - resourceRequests: - cpu: 6800m - memory: 15822Mi - pvc: 28Gi data-lakehouse-iceberg-trino-spark: description: Data lakehouse using Iceberg lakehouse on S3, Trino as query engine, Spark for streaming ingest and Superset for data visualization. Multiple datasources like taxi data, water levels in Germany, earthquakes, e-charging stations and more are loaded. 
documentation: https://docs.stackable.tech/stackablectl/stable/demos/data-lakehouse-iceberg-trino-spark.html diff --git a/demos/trino-subsea-data/create-table-in-trino.yaml b/demos/trino-subsea-data/create-table-in-trino.yaml deleted file mode 100644 index 1c5dec46..00000000 --- a/demos/trino-subsea-data/create-table-in-trino.yaml +++ /dev/null @@ -1,83 +0,0 @@ ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: create-subsea-multibeam-table-in-trino -spec: - template: - spec: - containers: - - name: create-subsea-multibeam-table-in-trino - image: docker.stackable.tech/stackable/testing-tools:0.2.0-stackable24.7.0 - command: ["bash", "-c", "python -u /tmp/script/script.py"] - volumeMounts: - - name: script - mountPath: /tmp/script - - name: trino-users - mountPath: /trino-users - volumes: - - name: script - configMap: - name: create-subsea-multibeam-table-in-trino-script - - name: trino-users - secret: - secretName: trino-users - restartPolicy: OnFailure - backoffLimit: 50 ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: create-subsea-multibeam-table-in-trino-script -data: - script.py: | - import sys - import trino - - if not sys.warnoptions: - import warnings - warnings.simplefilter("ignore") - - def get_connection(): - connection = trino.dbapi.connect( - host="trino-coordinator", - port=8443, - user="admin", - http_scheme='https', - auth=trino.auth.BasicAuthentication("admin", open("/trino-users/admin").read()), - ) - connection._http_session.verify = False - return connection - - def run_query(connection, query): - print(f"[DEBUG] Executing query {query}") - cursor = connection.cursor() - cursor.execute(query) - return cursor.fetchall() - - connection = get_connection() - - run_query(connection, "CREATE SCHEMA IF NOT EXISTS hive.demo WITH (location = 's3a://demo/')") - run_query(connection, """ - CREATE TABLE IF NOT EXISTS hive.demo.subsea ( - footprint_x DOUBLE, - footprint_y DOUBLE, - water_depth DOUBLE, - data_point_density DOUBLE, - geometry VARBINARY - ) WITH ( - external_location = 's3a://demo/subsea/', - format = 'parquet' - ) - """) - - loaded_rows = run_query(connection, "SELECT COUNT(*) FROM hive.demo.subsea")[0][0] - print(f"Loaded {loaded_rows} rows") - assert loaded_rows > 0 - - print("Analyzing table subsea") - analyze_rows = run_query(connection, """ANALYZE hive.demo.subsea""")[0][0] - assert analyze_rows == loaded_rows - stats = run_query(connection, """show stats for hive.demo.subsea""") - print("Produced the following stats:") - print(*stats, sep="\n") diff --git a/demos/trino-subsea-data/load-test-data.yaml b/demos/trino-subsea-data/load-test-data.yaml deleted file mode 100644 index 8971b06c..00000000 --- a/demos/trino-subsea-data/load-test-data.yaml +++ /dev/null @@ -1,21 +0,0 @@ ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: load-subsea-multibeam-data -spec: - template: - spec: - containers: - - name: load-subsea-multibeam-data - image: "bitnami/minio:2024-debian-12" - command: ["bash", "-c", "cd /tmp && curl -O https://repo.stackable.tech/repository/misc/marispace/multibeam_data_point_density_example.parquet && mc --insecure alias set minio http://minio:9000/ $(cat /minio-s3-credentials/accessKey) $(cat /minio-s3-credentials/secretKey) && mc cp multibeam_data_point_density_example.parquet minio/demo/subsea"] - volumeMounts: - - name: minio-s3-credentials - mountPath: /minio-s3-credentials - volumes: - - name: minio-s3-credentials - secret: - secretName: minio-s3-credentials - restartPolicy: OnFailure - backoffLimit: 50 diff --git 
a/demos/trino-subsea-data/setup-superset.yaml b/demos/trino-subsea-data/setup-superset.yaml deleted file mode 100644 index 51fa8b5d..00000000 --- a/demos/trino-subsea-data/setup-superset.yaml +++ /dev/null @@ -1,95 +0,0 @@ ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: setup-superset -spec: - template: - spec: - containers: - - name: setup-superset - image: docker.stackable.tech/stackable/testing-tools:0.2.0-stackable24.7.0 - command: ["bash", "-c", "curl -o superset-assets.zip https://raw.githubusercontent.com/stackabletech/demos/main/demos/trino-subsea-data/superset-assets.zip && python -u /tmp/script/script.py"] - volumeMounts: - - name: script - mountPath: /tmp/script - - name: trino-users - mountPath: /trino-users - - name: superset-credentials - mountPath: /superset-credentials - volumes: - - name: script - configMap: - name: setup-superset-script - - name: superset-credentials - secret: - secretName: superset-credentials - - name: trino-users - secret: - secretName: trino-users - restartPolicy: OnFailure - backoffLimit: 50 ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: setup-superset-script -data: - script.py: | - import logging - import requests - - base_url = "http://superset-external:8088" # For local testing / developing replace it, afterwards change back to http://superset-external:8088 - superset_username = open("/superset-credentials/adminUser.username").read() - superset_password = open("/superset-credentials/adminUser.password").read() - trino_username = "admin" - trino_password = open("/trino-users/admin").read() - - logging.basicConfig(level=logging.INFO) - logging.info("Starting setup of Superset") - - logging.info("Getting access token from /api/v1/security/login") - session = requests.session() - access_token = session.post(f"{base_url}/api/v1/security/login", json={"username": superset_username, "password": superset_password, "provider": "db", "refresh": True}).json()['access_token'] - # print(f"access_token: {access_token}") - - logging.info("Getting csrf token from /api/v1/security/csrf_token") - csrf_token = session.get(f"{base_url}/api/v1/security/csrf_token", headers={"Authorization": f"Bearer {access_token}"}).json()["result"] - # print(f"csrf_token: {csrf_token}") - - headers = { - "accept": "application/json", - "Authorization": f"Bearer {access_token}", - "X-CSRFToken": csrf_token, - } - - # To retrieve all of the assets (datasources, datasets, charts and dashboards) run the following commands - # logging.info("Exporting all assets") - # result = session.get(f"{base_url}/api/v1/assets/export", headers=headers) - # assert result.status_code == 200 - # with open("superset-assets.zip", "wb") as f: - # f.write(result.content) - - - ######################### - # IMPORTANT - ######################### - # The exported zip file had to be modified, otherwise we get: - # - # {"errors": [{"message": "Error importing assets", "error_type": "GENERIC_COMMAND_ERROR", "level": "warning", "extra": {"databases/Trino.yaml": {"extra": {"disable_data_preview": ["Unknown field."]}}, "issue_codes": [{"code": 1010, "message": "Issue 1010 - Superset encountered an error while running a command."}]}}]} - # - # The file databases/Trino.yaml was modified and the attribute "extra.disable_data_preview" was removed - ######################### - logging.info("Importing all assets") - files = { - "bundle": ("superset-assets.zip", open("superset-assets.zip", "rb")), - } - data = { - "passwords": '{"databases/Trino.yaml": "' + trino_password + '"}' - } - result = 
session.post(f"{base_url}/api/v1/assets/import", headers=headers, files=files, data=data) - print(result) - print(result.text) - assert result.status_code == 200 - - logging.info("Finished setup of Superset") diff --git a/demos/trino-subsea-data/superset-assets.zip b/demos/trino-subsea-data/superset-assets.zip deleted file mode 100644 index 60a02552b51e712549fd728628993760d7a6ba33..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6544
diff --git a/stacks/trino-superset-s3/superset.yaml b/stacks/trino-superset-s3/superset.yaml index b8363fae..24a8cfd4 100644 --- a/stacks/trino-superset-s3/superset.yaml +++ b/stacks/trino-superset-s3/superset.yaml @@ -14,11 +14,6 @@ spec: roleGroups: default: replicas: 1 - configOverrides: - superset_config.py: - # Needed by trino-subsea-data demo - ROW_LIMIT: "200000" - SQL_MAX_ROW: "200000" --- apiVersion: v1 kind: Secret From 994b3338767a49dc3892f78433ac39d17f3ecb04 Mon Sep 17 00:00:00 2001 From: Felix Hennig Date: Mon, 16 Sep 2024 11:11:12 +0100 Subject: [PATCH 4/5] Add descriptions and fix formatting (#97) * Add descriptions and fix formatting * Add descriptions and fix formatting * fix formatting and HBase spelling * Update docs/modules/demos/pages/hbase-hdfs-load-cycling-data.adoc Co-authored-by: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> * Update docs/modules/demos/pages/hbase-hdfs-load-cycling-data.adoc Co-authored-by: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> * Update docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc Co-authored-by: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> * Update docs/modules/demos/pages/nifi-kafka-druid-earthquake-data.adoc Co-authored-by: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> * Update docs/modules/demos/pages/spark-k8s-anomaly-detection-taxi-data.adoc Co-authored-by: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> --------- Co-authored-by: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> --- .../demos/pages/airflow-scheduled-job.adoc | 6 +- .../data-lakehouse-iceberg-trino-spark.adoc | 207 +++++++++--------- .../demos/pages/end-to-end-security.adoc | 5 +- .../pages/hbase-hdfs-load-cycling-data.adoc | 46 ++-- docs/modules/demos/pages/index.adoc | 37 ++-- ...park-hdfs-anomaly-detection-taxi-data.adoc | 84 +++---- docs/modules/demos/pages/logging.adoc | 1 + .../nifi-kafka-druid-earthquake-data.adoc | 165 +++++++------- .../nifi-kafka-druid-water-level-data.adoc | 3 +- .../demos/pages/signal-processing.adoc | 31 +-- ...spark-k8s-anomaly-detection-taxi-data.adoc | 40 ++-- docs/modules/demos/pages/trino-iceberg.adoc | 115 +++++----- docs/modules/demos/pages/trino-taxi-data.adoc | 14 +- stacks/stacks-v2.yaml | 7 +- 14 files changed, 368 insertions(+), 393 deletions(-) diff --git a/docs/modules/demos/pages/airflow-scheduled-job.adoc b/docs/modules/demos/pages/airflow-scheduled-job.adoc index 1f7e6f0c..1ecf3d80 100644 --- a/docs/modules/demos/pages/airflow-scheduled-job.adoc +++ b/docs/modules/demos/pages/airflow-scheduled-job.adoc @@ -1,5 +1,6 @@ = airflow-scheduled-job :page-aliases: stable@stackablectl::demos/airflow-scheduled-job.adoc +:description: This demo installs Airflow with Postgres and Redis on Kubernetes, showcasing DAG scheduling, job runs, and status verification via the Airflow UI. Install this demo on an existing Kubernetes cluster: @@ -102,9 +103,10 @@ Click on the `run_every_minute` box in the centre of the page and then select `L [WARNING] ==== -In this demo, the logs are not available when the KubernetesExecutor is deployed. See the https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/executor/kubernetes.html#managing-dags-and-logs[Airflow Documentation] for more details. +In this demo, the logs are not available when the KubernetesExecutor is deployed. 
+See the https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/executor/kubernetes.html#managing-dags-and-logs[Airflow Documentation] for more details. -If you are interested in persisting the logs, please take a look at the xref:logging.adoc[] demo. +If you are interested in persisting the logs, take a look at the xref:logging.adoc[] demo. ==== image::airflow-scheduled-job/airflow_9.png[] diff --git a/docs/modules/demos/pages/data-lakehouse-iceberg-trino-spark.adoc b/docs/modules/demos/pages/data-lakehouse-iceberg-trino-spark.adoc index 0b007674..0432f6f6 100644 --- a/docs/modules/demos/pages/data-lakehouse-iceberg-trino-spark.adoc +++ b/docs/modules/demos/pages/data-lakehouse-iceberg-trino-spark.adoc @@ -1,5 +1,6 @@ = data-lakehouse-iceberg-trino-spark :page-aliases: stable@stackablectl::demos/data-lakehouse-iceberg-trino-spark.adoc +:description: This demo shows a data workload with real-world data volumes using Trino, Kafka, Spark, NiFi, Superset and OPA. :demo-code: https://github.com/stackabletech/demos/blob/main/demos/data-lakehouse-iceberg-trino-spark/create-spark-ingestion-job.yaml :iceberg-table-maintenance: https://iceberg.apache.org/docs/latest/spark-procedures/#metadata-management @@ -11,17 +12,17 @@ [IMPORTANT] ==== -This demo shows a data workload with real-world data volumes and uses significant resources to ensure acceptable -response times. It will likely not run on your workstation. +This demo shows a data workload with real-world data volumes and uses significant resources to ensure acceptable response times. +It will likely not run on your workstation. -There is also the smaller xref:trino-iceberg.adoc[] demo focusing on the abilities a lakehouse using Apache -Iceberg offers. The `trino-iceberg` demo has no streaming data and can be executed on a local workstation. +There is also the smaller xref:trino-iceberg.adoc[] demo focusing on the abilities a lakehouse using Apache Iceberg offers. +The `trino-iceberg` demo has no streaming data and can be executed on a local workstation. ==== [CAUTION] ==== -This demo only runs in the `default` namespace, as a `ServiceAccount` will be created. Additionally, we have to use the -FQDN service names (including the namespace), so that the used TLS certificates are valid. +This demo only runs in the `default` namespace, as a `ServiceAccount` will be created. +Additionally, we have to use the FQDN service names (including the namespace), so that the used TLS certificates are valid. ==== Install this demo on an existing Kubernetes cluster: @@ -37,9 +38,9 @@ $ stackablectl demo install data-lakehouse-iceberg-trino-spark The demo was developed and tested on a kubernetes cluster with about 12 nodes (4 cores with hyperthreading/SMT, 20GiB RAM and 30GB HDD). Instance types that loosely correspond to this on the Hyperscalers are: -- *Google*: `e2-standard-8` -- *Azure*: `Standard_D4_v2` -- *AWS*: `m5.2xlarge` +* *Google*: `e2-standard-8` +* *Azure*: `Standard_D4_v2` +* *AWS*: `m5.2xlarge` In addition to these nodes the operators will request multiple persistent volumes with a total capacity of about 300Gi. @@ -49,26 +50,26 @@ This demo will * Install the required Stackable operators. * Spin up the following data products: -** *Trino*: A fast distributed SQL query engine for big data analytics that helps you explore your data universe. This - demo uses it to enable SQL access to the data. -** *Apache Spark*: A multi-language engine for executing data engineering, data science, and machine learning. 
This demo - uses it to stream data from Kafka into the lakehouse. -** *MinIO*: S3 compatible object store. This demo uses it as persistent storage to store all the data used -** *Apache Kafka*: A distributed event streaming platform for high-performance data pipelines, streaming analytics and - data integration. This demo uses it as an event streaming platform to stream the data in near real-time. -** *Apache NiFi*: An easy-to-use, robust system to process and distribute data. This demo uses it to fetch multiple - online real-time data sources and ingest it into Kafka. -** *Apache Hive metastore*: A service that stores metadata related to Apache Hive and other services. This demo uses it - as metadata storage for Trino and Spark. -** *Open policy agent* (OPA): An open-source, general-purpose policy engine unifying policy enforcement across the - stack. This demo uses it as the authorizer for Trino, which decides which user can query which data. -** *Apache Superset*: A modern data exploration and visualization platform. This demo utilizes Superset to retrieve data - from Trino via SQL queries and build dashboards on top of that data. +** *Trino*: A fast distributed SQL query engine for big data analytics that helps you explore your data universe. + This demo uses it to enable SQL access to the data. +** *Apache Spark*: A multi-language engine for executing data engineering, data science, and machine learning. + This demo uses it to stream data from Kafka into the lakehouse. +** *MinIO*: S3 compatible object store. + This demo uses it as persistent storage to store all the data used +** *Apache Kafka*: A distributed event streaming platform for high-performance data pipelines, streaming analytics and data integration. + This demo uses it as an event streaming platform to stream the data in near real-time. +** *Apache NiFi*: An easy-to-use, robust system to process and distribute data. + This demo uses it to fetch multiple online real-time data sources and ingest it into Kafka. +** *Apache Hive metastore*: A service that stores metadata related to Apache Hive and other services. + This demo uses it as metadata storage for Trino and Spark. +** *Open policy agent* (OPA): An open-source, general-purpose policy engine unifying policy enforcement across the stack. + This demo uses it as the authorizer for Trino, which decides which user can query which data. +** *Apache Superset*: A modern data exploration and visualization platform. + This demo utilizes Superset to retrieve data from Trino via SQL queries and build dashboards on top of that data. * Copy multiple data sources in CSV and Parquet format into the S3 staging area. -* Let Trino copy the data from the staging area into the lakehouse area. During the copy, transformations such as - validating, casting, parsing timestamps and enriching the data by joining lookup tables are done. -* Simultaneously, start a NiFi workflow, which fetches datasets in real-time via the internet and ingests the data as - JSON records into Kafka. +* Let Trino copy the data from the staging area into the lakehouse area. + During the copy, transformations such as validating, casting, parsing timestamps and enriching the data by joining lookup tables are done. +* Simultaneously, start a NiFi workflow, which fetches datasets in real-time via the internet and ingests the data as JSON records into Kafka. * Spark structured streaming job is started, which streams the data out of Kafka into the lakehouse. 
* Create Superset dashboards for visualization of the different datasets. @@ -83,9 +84,8 @@ As Apache Iceberg states on their https://iceberg.apache.org/docs/latest/[websit > Apache Iceberg is an open table format for huge analytic datasets. Iceberg adds tables to compute engines including Spark, Trino, PrestoDB, Flink, Hive and Impala using a high-performance table format that works just like a SQL table. -This demo uses Iceberg, which plays nicely with object storage and has integrations for Trino and Spark. It also -provides the following benefits among other things, instead of putting https://parquet.apache.org/[Apache Parquet] files -directly into S3 using the https://trino.io/docs/current/connector/hive.html[Hive connector]: +This demo uses Iceberg, which plays nicely with object storage and has integrations for Trino and Spark. +It also provides the following benefits among other things, instead of putting https://parquet.apache.org/[Apache Parquet] files directly into S3 using the https://trino.io/docs/current/connector/hive.html[Hive connector]: * *Standardized table storage:* Using this standardized specification, multiple tools such as Trino, Spark and Flink can read and write Iceberg tables. @@ -101,11 +101,11 @@ directly into S3 using the https://trino.io/docs/current/connector/hive.html[Hiv column `day` is not needed anymore, and the query `select count(\*) where ts > now() - interval 1 day` would use partition pruning as expected to read only one the partitions from today and yesterday. * *Branching and tagging:* Iceberg enables git-like semantics on your lakehouse. You can create tags pointing to a -specific snapshot of your data and branches. For details, please read +specific snapshot of your data and branches. For details, read https://www.dremio.com/blog/exploring-branch-tags-in-apache-iceberg-using-spark/[this excellent blog post]. Currently, this is only supported in Spark. Trino is https://github.com/trinodb/trino/issues/12844[working on support]. -If you want to read more about the motivation and the working principles of Iceberg, please have a read on their +If you want to read more about the motivation and the working principles of Iceberg, have a read on their https://iceberg.apache.org[website] or https://github.com/apache/iceberg/[GitHub repository]. == List the deployed Stackable services @@ -226,18 +226,15 @@ On the right side are three strands, that . Fetch the current shared bike station status . Fetch the current shared bike status -For details on the NiFi workflow ingesting water-level data, please read the -xref:nifi-kafka-druid-water-level-data.adoc#_nifi[nifi-kafka-druid-water-level-data documentation on NiFi]. +For details on the NiFi workflow ingesting water-level data, read the xref:nifi-kafka-druid-water-level-data.adoc#_nifi[nifi-kafka-druid-water-level-data documentation on NiFi]. == Spark -https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html[Spark Structured Streaming] is used to -stream data from Kafka into the lakehouse. +https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html[Spark Structured Streaming] is used to stream data from Kafka into the lakehouse. === Accessing the web interface -To have access to the Spark web interface you need to run the following command to forward port 4040 to your local -machine. +To have access to the Spark web interface you need to run the following command to forward port 4040 to your local machine. 
[source,console] ---- @@ -250,23 +247,27 @@ image::data-lakehouse-iceberg-trino-spark/spark_1.png[] === Listing the running Structured Streaming jobs -The UI displays the last job runs. Each running Structured Streaming job creates lots of Spark jobs internally. Click on -the `Structured Streaming` tab to see the running streaming jobs. +The UI displays the last job runs. +Each running Structured Streaming job creates lots of Spark jobs internally. +Click on the `Structured Streaming` tab to see the running streaming jobs. image::data-lakehouse-iceberg-trino-spark/spark_2.png[] -Five streaming jobs are currently running. You can also click on a streaming job to get more details. For the job -`ingest smart_city shared_bikes_station_status` click, on the `Run ID` highlighted in blue to open them up. +Five streaming jobs are currently running. +You can also click on a streaming job to get more details. +For the job `ingest smart_city shared_bikes_station_status` click, on the `Run ID` highlighted in blue to open them up. image::data-lakehouse-iceberg-trino-spark/spark_3.png[] === How the Structured Streaming jobs work -The demo has started all the running streaming jobs. Look at the {demo-code}[demo code] to see the actual code -submitted to Spark. This document will explain one specific ingestion job - `ingest water_level measurements`. +The demo has started all the running streaming jobs. Look at the {demo-code}[demo code] to see the actual code submitted to Spark. +This document will explain one specific ingestion job - `ingest water_level measurements`. -The streaming job is written in Python using `pyspark`. First off, the schema used to parse the JSON coming from Kafka -is defined. Nested structures or arrays are supported as well. The schema differs from job to job. +The streaming job is written in Python using `pyspark`. +First off, the schema used to parse the JSON coming from Kafka is defined. +Nested structures or arrays are supported as well. +The schema differs from job to job. [source,python] ---- @@ -277,16 +278,13 @@ schema = StructType([ \ ]) ---- -Afterwards, a streaming read from Kafka is started. It reads from our Kafka at `kafka:9093` with the topic -`water_levels_measurements`. When starting up, the job will ready all the existing messages in Kafka (read from -earliest) and will process 50000000 records as a maximum in a single batch. As Kafka has retention set up, Kafka records -might alter out of the topic before Spark has read the records, which can be the case when the Spark application wasn't -running or crashed for too long. In the case of this demo, the streaming job should not error out. For a production job, -`failOnDataLoss` should be set to `true` so that missing data does not go unnoticed - and Kafka offsets need to be -adjusted manually, as well as some post-loading of data. +Afterwards, a streaming read from Kafka is started. It reads from our Kafka at `kafka:9093` with the topic `water_levels_measurements`. +When starting up, the job will ready all the existing messages in Kafka (read from earliest) and will process 50000000 records as a maximum in a single batch. +As Kafka has retention set up, Kafka records might alter out of the topic before Spark has read the records, which can be the case when the Spark application wasn't running or crashed for too long. +In the case of this demo, the streaming job should not error out. 
+For a production job, `failOnDataLoss` should be set to `true` so that missing data does not go unnoticed - and Kafka offsets need to be adjusted manually, as well as some post-loading of data. -*Note:* The following Python snippets belong to a single Python statement but are split into separate blocks for better -explanation. +*Note:* The following Python snippets belong to a single Python statement but are split into separate blocks for better explanation. [source,python] ---- @@ -301,8 +299,7 @@ spark \ .load() \ ---- -So far we have a `readStream` reading from Kafka. Records on Kafka are simply a byte-stream, so they must be converted -to strings and the json needs to be parsed. +So far we have a `readStream` reading from Kafka. Records on Kafka are simply a byte-stream, so they must be converted to strings and the json needs to be parsed. [source,python] ---- @@ -310,19 +307,20 @@ to strings and the json needs to be parsed. .withColumn("json", from_json(col("value"), schema)) \ ---- -Afterwards, we only select the needed fields (coming from JSON). We are not interested in all the other fields, such as -`key`, `value`, `topic` or `offset`. The metadata of the Kafka records, such as `topic`, `timestamp`, `partition` and -`offset`, are also available. Please have a look at the {spark-streaming-docs}[Spark streaming documentation on Kafka]. +Afterwards, we only select the needed fields (coming from JSON). +We are not interested in all the other fields, such as `key`, `value`, `topic` or `offset`. +The metadata of the Kafka records, such as `topic`, `timestamp`, `partition` and `offset`, are also available. +Have a look at the {spark-streaming-docs}[Spark streaming documentation on Kafka]. [source,python] ---- .select("json.station_uuid", "json.timestamp", "json.value") \ ---- -After all these transformations, we need to specify the sink of the stream, in this case, the Iceberg lakehouse. We are -writing in the `iceberg` format using the `update` mode rather than the "normal" `append` mode. Spark will aim for a -micro-batch every 2 minutes and save its checkpoints (its current offsets on the Kafka topic) in the specified S3 -location. Afterwards, the streaming job will be started by calling `.start()`. +After all these transformations, we need to specify the sink of the stream, in this case, the Iceberg lakehouse. +We are writing in the `iceberg` format using the `update` mode rather than the "normal" `append` mode. +Spark will aim for a micro-batch every 2 minutes and save its checkpoints (its current offsets on the Kafka topic) in the specified S3 location. +Afterwards, the streaming job will be started by calling `.start()`. [source,python] ---- @@ -345,10 +343,9 @@ One important part was skipped during the walkthrough: .foreachBatch(upsertWaterLevelsMeasurements) \ ---- -`upsertWaterLevelsMeasurements` is a Python function that describes inserting the records from Kafka into the lakehouse -table. This specific streaming job removes all duplicate records that can occur because of how the PegelOnline API works -and gets called. As we don't want duplicate rows in our lakehouse tables, we need to filter the duplicates out as -follows. +`upsertWaterLevelsMeasurements` is a Python function that describes inserting the records from Kafka into the lakehouse table. +This specific streaming job removes all duplicate records that can occur because of how the PegelOnline API works and gets called. 
+As we don't want duplicate rows in our lakehouse tables, we need to filter the duplicates out as follows. [source,python] ---- @@ -363,17 +360,16 @@ def upsertWaterLevelsMeasurements(microBatchOutputDF, batchId): """) ---- -First, the data frame containing the upserts (records from Kafka) will be registered as a temporary view so that they -can be accessed via Spark SQL. Afterwards, the `MERGE INTO` statement adds the new records to the lakehouse table. +First, the data frame containing the upserts (records from Kafka) will be registered as a temporary view so that they can be accessed via Spark SQL. +Afterwards, the `MERGE INTO` statement adds the new records to the lakehouse table. -The incoming records are first de-duplicated (using `SELECT DISTINCT * FROM waterLevelsMeasurementsUpserts`) so that the -data from Kafka does not contain duplicates. Afterwards, the - now duplication-free - records get added to the -`lakehouse.water_levels.measurements`, but *only* if they still need to be present. +The incoming records are first de-duplicated (using `SELECT DISTINCT * FROM waterLevelsMeasurementsUpserts`) so that the data from Kafka does not contain duplicates. +Afterwards, the - now duplication-free - records get added to the `lakehouse.water_levels.measurements`, but *only* if they still need to be present. === The Upsert mechanism -The `MERGE INTO` statement can be used for de-duplicating data and updating existing rows in the lakehouse table. The -`ingest water_level stations` streaming job uses the following `MERGE INTO` statement: +The `MERGE INTO` statement can be used for de-duplicating data and updating existing rows in the lakehouse table. +The `ingest water_level stations` streaming job uses the following `MERGE INTO` statement: [source,sql] ---- @@ -389,25 +385,25 @@ WHEN MATCHED THEN UPDATE SET * WHEN NOT MATCHED THEN INSERT * ---- -First, the data within a batch is de-deduplicated as well. The record containing the station update with the highest -Kafka timestamp is the newest and will be used during Upsert. +First, the data within a batch is de-deduplicated as well. +The record containing the station update with the highest Kafka timestamp is the newest and will be used during Upsert. -If a record for a station (detected by the same `station_uuid`) already exists, its contents will be updated. If the -station is yet to be discovered, it will be inserted. The `MERGE INTO` also supports updating subsets of fields and more -complex calculations, e.g. incrementing a counter. For details, have a look at the -{iceberg-merge-docs}[Iceberg MERGE INTO documentation]. +If a record for a station (detected by the same `station_uuid`) already exists, its contents will be updated. +If the station is yet to be discovered, it will be inserted. +The `MERGE INTO` also supports updating subsets of fields and more complex calculations, e.g. incrementing a counter. +For details, have a look at the {iceberg-merge-docs}[Iceberg MERGE INTO documentation]. === The Delete mechanism -The `MERGE INTO` statement can de-duplicate data and update existing lakehouse table rows. For details have a look at -the {iceberg-merge-docs}[Iceberg MERGE INTO documentation]. +The `MERGE INTO` statement can de-duplicate data and update existing lakehouse table rows. +For details have a look at the {iceberg-merge-docs}[Iceberg MERGE INTO documentation]. === Table maintenance As mentioned, Iceberg supports out-of-the-box {iceberg-table-maintenance}[table maintenance] such as compaction. 
-This demo executes some maintenance functions in a rudimentary Python loop with timeouts in between. When running in -production, the maintenance can be scheduled using Kubernetes {k8s-cronjobs}[CronJobs] or {airflow}[Apache Airflow], +This demo executes some maintenance functions in a rudimentary Python loop with timeouts in between. +When running in production, the maintenance can be scheduled using Kubernetes {k8s-cronjobs}[CronJobs] or {airflow}[Apache Airflow], which the Stackable Data Platform also supports. [source,python] @@ -439,7 +435,8 @@ while True: time.sleep(25 * 60) # Assuming compaction takes 5 min run every 30 minutes ---- -The scripts have a dictionary of all the tables to run maintenance on. The following procedures are run: +The scripts have a dictionary of all the tables to run maintenance on. +The following procedures are run: ==== https://iceberg.apache.org/docs/latest/spark-procedures/#expire_snapshots[expire_snapshots] @@ -459,8 +456,7 @@ data files causes an unnecessary amount of metadata and less efficient queries f data files in parallel using Spark with the rewriteDataFiles action. This will combine small files into larger files to reduce metadata overhead and runtime file open cost. -Some tables will also be sorted during rewrite, please have a look at the -{iceberg-rewrite}[documentation on rewrite_data_files]. +Some tables will also be sorted during rewrite, have a look at the {iceberg-rewrite}[documentation on rewrite_data_files]. == Trino @@ -479,8 +475,8 @@ image::data-lakehouse-iceberg-trino-spark/trino_2.png[] === Connect to Trino -Please have a look at the xref:home:trino:usage-guide/connect_to_trino.adoc[trino-operator documentation on how to -connect to Trino]. This demo recommends to use DBeaver, as Trino consists of many schemas and tables you can explore. +Have a look at the xref:home:trino:usage-guide/connect_to_trino.adoc[trino-operator documentation on how to connect to Trino]. +This demo recommends to use DBeaver, as Trino consists of many schemas and tables you can explore. image::data-lakehouse-iceberg-trino-spark/dbeaver_1.png[] @@ -496,8 +492,8 @@ Here you can see all the available Trino catalogs. == Superset -Superset provides the ability to execute SQL queries and build dashboards. Open the Superset endpoint -`external-http` in your browser (http://87.106.122.58:32452 in this case). +Superset provides the ability to execute SQL queries and build dashboards. +Open the Superset endpoint `external-http` in your browser (http://87.106.122.58:32452 in this case). image::data-lakehouse-iceberg-trino-spark/superset_1.png[] @@ -507,13 +503,13 @@ image::data-lakehouse-iceberg-trino-spark/superset_2.png[] === Viewing the Dashboard -The demo has created dashboards to visualize the different data sources. Select the `Dashboards` tab at the top to view -these dashboards. +The demo has created dashboards to visualize the different data sources. +Select the `Dashboards` tab at the top to view these dashboards. image::data-lakehouse-iceberg-trino-spark/superset_3.png[] -Click on the dashboard called `House sales`. It might take some time until the dashboards renders all the included -charts. +Click on the dashboard called `House sales`. +It might take some time until the dashboards renders all the included charts. image::data-lakehouse-iceberg-trino-spark/superset_4.png[] @@ -529,27 +525,28 @@ There are multiple other dashboards you can explore on your own. === Viewing Charts -The dashboards consist of multiple charts. 
To list the charts, select the `Charts` tab at the top. +The dashboards consist of multiple charts. +To list the charts, select the `Charts` tab at the top. === Executing arbitrary SQL statements -Within Superset, you can create dashboards and run arbitrary SQL statements. On the top click on the tab `SQL` -> -`SQL Lab`. +Within Superset, you can create dashboards and run arbitrary SQL statements. +On the top click on the tab `SQL` -> `SQL Lab`. image::data-lakehouse-iceberg-trino-spark/superset_7.png[] -On the left, select the database `Trino lakehouse`, the schema `house_sales`, and set `See table schema` to -`house_sales`. +On the left, select the database `Trino lakehouse`, the schema `house_sales`, and set `See table schema` to `house_sales`. [IMPORTANT] ==== -The older screenshot below shows how the table preview would look like. Currently, there is an https://github.com/apache/superset/issues/25307[open issue] -with previewing trino tables using the Iceberg connector. This doesn't affect the execution the following execution of the SQL statement. +The older screenshot below shows how the table preview would look like. Currently, there is an https://github.com/apache/superset/issues/25307[open issue] with previewing trino tables using the Iceberg connector. +This doesn't affect the execution the following execution of the SQL statement. ==== image::data-lakehouse-iceberg-trino-spark/superset_8.png[] -In the right textbox, you can enter the desired SQL statement. If you want to avoid making one up, use the following: +In the right textbox, you can enter the desired SQL statement. +If you want to avoid making one up, use the following: [source,sql] ---- diff --git a/docs/modules/demos/pages/end-to-end-security.adoc b/docs/modules/demos/pages/end-to-end-security.adoc index 2fc6a724..2f956aec 100644 --- a/docs/modules/demos/pages/end-to-end-security.adoc +++ b/docs/modules/demos/pages/end-to-end-security.adoc @@ -1,6 +1,6 @@ = end-to-end-security - :k8s-cpu: https://kubernetes.io/docs/tasks/debug/debug-cluster/resource-metrics-pipeline/#cpu +:description: This demo showcases end-to-end security in Stackable Data Platform with OPA, featuring row/column access control, OIDC, Kerberos, and flexible group policies. This is a demo to showcase what can be done with Open Policy Agent around authorization in the Stackable Data Platform. It covers the following aspects of security: @@ -55,8 +55,7 @@ You can see the deployed products and their relationship in the following diagra image::end-to-end-security/overview.png[Architectural overview] -Please note the different types of arrows used to connect the technologies in here, which symbolize -how authentication happens along that route and if impersonation is used for queries executed. +Note the different types of arrows used to connect the technologies in here, which symbolize how authentication happens along that route and if impersonation is used for queries executed. The Trino schema (with schemas, tables and views) is shown below. diff --git a/docs/modules/demos/pages/hbase-hdfs-load-cycling-data.adoc b/docs/modules/demos/pages/hbase-hdfs-load-cycling-data.adoc index 9eaec607..5ef8d790 100644 --- a/docs/modules/demos/pages/hbase-hdfs-load-cycling-data.adoc +++ b/docs/modules/demos/pages/hbase-hdfs-load-cycling-data.adoc @@ -1,5 +1,6 @@ = hbase-hdfs-cycling-data :page-aliases: stable@stackablectl::demos/hbase-hdfs-load-cycling-data.adoc +:description: Load cyclist data from HDFS to HBase on Kubernetes using Stackable's demo. 
Install, copy data, create HFiles, and query efficiently. :kaggle: https://www.kaggle.com/datasets/timgid/cyclistic-dataset-google-certificate-capstone?select=Divvy_Trips_2020_Q1.csv :k8s-cpu: https://kubernetes.io/docs/tasks/debug/debug-cluster/resource-metrics-pipeline/#cpu @@ -14,10 +15,7 @@ Install this demo on an existing Kubernetes cluster: $ stackablectl demo install hbase-hdfs-load-cycling-data ---- -[WARNING] -==== -This demo should not be run alongside other demos. -==== +WARNING: This demo should not be run alongside other demos. [#system-requirements] == System requirements @@ -34,11 +32,11 @@ This demo will * Install the required Stackable operators. * Spin up the following data products: -** *Hbase:* An open source distributed, scalable, big data store. This demo uses it to store the +** *HBase:* An open source distributed, scalable, big data store. This demo uses it to store the {kaggle}[cyclist dataset] and enable access. -** *HDFS:* A distributed file system used to intermediately store the dataset before importing it into Hbase +** *HDFS:* A distributed file system used to intermediately store the dataset before importing it into HBase * Use {distcp}[distcp] to copy a {kaggle}[cyclist dataset] from an S3 bucket into HDFS. -* Create HFiles, a File format for hbase consisting of sorted key/value pairs. Both keys and values are byte arrays. +* Create HFiles, a File format for hBase consisting of sorted key/value pairs. Both keys and values are byte arrays. * Load Hfiles into an existing table via the `Importtsv` utility, which will load data in `TSV` or `CSV` format into HBase. * Query data via the `hbase` shell, which is an interactive shell to execute commands on the created table @@ -86,10 +84,9 @@ This demo will run two jobs to automatically load data. === distcp-cycling-data -{distcp}[DistCp] (distributed copy) is used for large inter/intra-cluster copying. It uses MapReduce to effect its -distribution, error handling, recovery, and reporting. It expands a list of files and directories into input to map -tasks, each of which will copy a partition of the files specified in the source list. Therefore, the first Job uses -DistCp to copy data from a S3 bucket into HDFS. Below, you'll see parts from the logs. +{distcp}[DistCp] (distributed copy) efficiently transfers large amounts of data from one location to another. +Therefore, the first Job uses DistCp to copy data from a S3 bucket into HDFS. +Below, you'll see parts from the logs. [source] ---- @@ -110,11 +107,12 @@ Copying s3a://public-backup-nyc-tlc/cycling-tripdata/demo-cycling-tripdata.csv.g The second Job consists of 2 steps. -First, we use `org.apache.hadoop.hbase.mapreduce.ImportTsv` (see {importtsv}[ImportTsv Docs]) to create a table and -Hfiles. Hfile is an Hbase dedicated file format which is performance optimized for hbase. It stores meta-information -about the data and thus increases the performance of hbase. When connecting to the hbase master, opening a hbase shell -and executing `list`, you will see the created table. However, it'll contain 0 rows at this point. You can connect to -the shell via: +First, we use `org.apache.hadoop.hbase.mapreduce.ImportTsv` (see {importtsv}[ImportTsv Docs]) to create a table and Hfiles. +Hfile is an HBase dedicated file format which is performance optimized for HBase. +It stores meta-information about the data and thus increases the performance of HBase. +When connecting to the HBase master, opening a HBase shell and executing `list`, you will see the created table. 
+However, it'll contain 0 rows at this point. +You can connect to the shell via: [source,console] ---- @@ -135,7 +133,7 @@ cycling-tripdata ---- Secondly, we'll use `org.apache.hadoop.hbase.tool.LoadIncrementalHFiles` (see {bulkload}[bulk load docs]) to import -the Hfiles into the table and ingest rows. +the Hfiles into the table and ingest rows. Now we will see how many rows are in the `cycling-tripdata` table: @@ -162,7 +160,7 @@ Took 13.4666 seconds == Inspecting the Table -You can now use the table and the data. You can use all available hbase shell commands. +You can now use the table and the data. You can use all available HBase shell commands. [source,sql] ---- @@ -190,15 +188,15 @@ COLUMN FAMILIES DESCRIPTION {NAME => 'started_at', BLOOMFILTER => 'ROW', IN_MEMORY => 'false', VERSIONS => '1', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', COMPRESSION => 'NONE', TTL => 'FOREVER', MIN_VERSIONS => '0', BLOCKCACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'} ---- -== Accessing the Hbase web interface +== Accessing the HBase web interface [TIP] ==== Run `stackablectl stacklet list` to get the address of the _ui-http_ endpoint. -If the UI is unavailable, please do a port-forward `kubectl port-forward hbase-master-default-0 16010`. +If the UI is unavailable, do a port-forward `kubectl port-forward hbase-master-default-0 16010`. ==== -The Hbase web UI will give you information on the status and metrics of your Hbase cluster. See below for the start page. +The HBase web UI will give you information on the status and metrics of your HBase cluster. See below for the start page. image::hbase-hdfs-load-cycling-data/hbase-ui-start-page.png[] @@ -208,8 +206,7 @@ image::hbase-hdfs-load-cycling-data/hbase-table-ui.png[] == Accessing the HDFS web interface -You can also see HDFS details via a UI by running `stackablectl stacklet list` and following the link next to one of -the namenodes. +You can also see HDFS details via a UI by running `stackablectl stacklet list` and following the link next to one of the namenodes. Below you will see the overview of your HDFS cluster. @@ -223,7 +220,8 @@ You can also browse the file system by clicking on the `Utilities` tab and selec image::hbase-hdfs-load-cycling-data/hdfs-data.png[] -Navigate in the file system to the folder `data` and then the `raw` folder. Here you can find the raw data from the distcp job. +Navigate in the file system to the folder `data` and then the `raw` folder. +Here you can find the raw data from the distcp job. image::hbase-hdfs-load-cycling-data/hdfs-data-raw.png[] diff --git a/docs/modules/demos/pages/index.adoc b/docs/modules/demos/pages/index.adoc index 7aa7ba4e..65ada08a 100644 --- a/docs/modules/demos/pages/index.adoc +++ b/docs/modules/demos/pages/index.adoc @@ -1,33 +1,30 @@ = Demos :page-aliases: stable@stackablectl::demos/index.adoc +:description: Explore Stackable demos showcasing data platform architectures. Includes external components for evaluation. -The pages below this section guide you on how to use the demos provided by Stackable. To install a demo please follow -the xref:management:stackablectl:quickstart.adoc[quickstart guide] or have a look at the -xref:management:stackablectl:commands/demo.adoc[demo command]. We currently offer the following list of demos: +The pages in this section guide you on how to use the demos provided by Stackable. 
+To install a demo follow the xref:management:stackablectl:quickstart.adoc[quickstart guide] or have a look at the xref:management:stackablectl:commands/demo.adoc[demo command]. +These are the available demos: include::partial$demos.adoc[] [IMPORTANT] .External Components in these demos ==== -These demos are provided by Stackable as showcases to demonstrate potential architectures that could be built with the -Stackable Data Platform. As such they may include components that are not supported by Stackable as part of our -commercial offering. +These demos are provided by Stackable as showcases to demonstrate potential architectures that could be built with the Stackable Data Platform. +As such they may include components that are not supported by Stackable as part of our commercial offering. -If you are evaluating one or more of these demos with the intention of purchasing a subscription, please make sure to -double-check the list of supported operators, anything that is not mentioned on there is not part of our commercial -offering. +If you are evaluating one or more of these demos with the intention of purchasing a subscription, make sure to double-check the list of supported operators; anything that is not mentioned on there is not part of our commercial offering. -Below you can find a list of components that are currently contained in one or more of the demos for reference, if -something is missing from this list and also not mentioned on our operators list, then this component is not supported: +Below you can find a list of components that are currently contained in one or more of the demos for reference, if something is missing from this list and also not mentioned on our operators list, then this component is not supported: -- Grafana -- JupyterHub -- MinIO -- OpenLDAP -- OpenSearch -- OpenSearch Dashboards -- PostgreSQL -- Prometheus -- Redis +* Grafana +* JupyterHub +* MinIO +* OpenLDAP +* OpenSearch +* OpenSearch Dashboards +* PostgreSQL +* Prometheus +* Redis ==== diff --git a/docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc b/docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc index cdf63df4..5391cae6 100644 --- a/docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc +++ b/docs/modules/demos/pages/jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data.adoc @@ -14,13 +14,11 @@ :hadoop: https://hadoop.apache.org/ :jupyter: https://jupyter.org -This demo showcases the integration between {jupyter}[Jupyter] and {hadoop}[Apache Hadoop] deployed on the Stackable -Data Platform (SDP) Kubernetes cluster. {jupyterlab}[JupyterLab] is deployed using the -{jupyterhub-k8s}[pyspark-notebook stack] provided by the Jupyter community. The SDP makes this integration easy by -publishing a discovery `ConfigMap` for the HDFS cluster. This `ConfigMap` is then mounted in all `Pods` running -{pyspark}[PySpark] notebooks so that these have access to HDFS data. For this demo, the HDFS cluster is provisioned with -a small sample of the {nyc-taxi}[NYC taxi trip dataset], which is analyzed with a notebook that is provisioned -automatically in the JupyterLab interface. +This demo showcases the integration between {jupyter}[Jupyter] and {hadoop}[Apache Hadoop] deployed on the Stackable Data Platform (SDP) Kubernetes cluster. +{jupyterlab}[JupyterLab] is deployed using the {jupyterhub-k8s}[pyspark-notebook stack] provided by the Jupyter community. 
+The SDP makes this integration easy by publishing a discovery ConfigMap for the HDFS cluster. +This ConfigMap is then mounted in all Pods running {pyspark}[PySpark] notebooks so that these have access to HDFS data. +For this demo, the HDFS cluster is provisioned with a small sample of the {nyc-taxi}[NYC taxi trip dataset], which is analyzed with a notebook that is provisioned automatically in the JupyterLab interface. Install this demo on an existing Kubernetes cluster: @@ -29,10 +27,7 @@ Install this demo on an existing Kubernetes cluster: $ stackablectl demo install jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data ---- -[WARNING] -==== -This demo should not be run alongside other demos. -==== +WARNING: This demo should not be run alongside other demos. [#system-requirements] == System requirements @@ -45,34 +40,33 @@ To run this demo, your system needs at least: == Aim / Context -This demo does not use the Stackable spark-k8s-operator but rather delegates the creation of executor pods to -JupyterHub. The intention is to demonstrate how to interact with SDP components when designing and testing Spark jobs: -the resulting script and Spark job definition can then be transferred with a Stackable `SparkApplication` resource. When -logging in to JupyterHub (described below), a pod will be created with the username as a suffix, e.g. `jupyter-admin`. -Doing so runs a container hosting a Jupyter Notebook with pre-installed Spark, Java and Python. When the user creates a -`SparkSession`, temporary spark executors are constructed that are persisted until the notebook kernel is shut down or -restarted. The notebook can thus be used as a sandbox for writing, testing and benchmarking Spark jobs before they are -moved into production. +This demo does not use the Stackable operator for Spark but rather delegates the creation of executor pods to JupyterHub. +The intention is to demonstrate how to interact with SDP components when designing and testing Spark jobs: +the resulting script and Spark job definition can then be transferred with a Stackable SparkApplication resource. +When logging in to JupyterHub (described below), a pod will be created with the username as a suffix, e.g. `jupyter-admin`. +Doing so runs a container hosting a Jupyter Notebook with pre-installed Spark, Java and Python. +When the user creates a SparkSession, temporary spark executors are constructed that are persisted until the notebook kernel is shut down or restarted. +The notebook can thus be used as a sandbox for writing, testing and benchmarking Spark jobs before they are moved into production. == Overview This demo will: -* Install the required Stackable Data Platform operators -* Spin up the following data products +* Install the required Stackable Data Platform operators. +* Spin up the following data products: ** *JupyterHub*: A multi-user server for Jupyter notebooks ** *Apache HDFS*: A distributed file system used to store the taxi dataset -* Download a sample of the NY taxi dataset into HDFS -* Install Jupyter notebook -* Train an anomaly detection model using PySpark on the data available in HDFS -* Perform some predictions and visualize anomalies +* Download a sample of the NY taxi dataset into HDFS. +* Install Jupyter notebook. +* Train an anomaly detection model using PySpark on the data available in HDFS. +* Perform some predictions and visualize anomalies. == HDFS -The Stackable Operator for Apache HDFS will spin up an HDFS cluster to store the taxi dataset in -{parquet}[Apache Parquet] format. 
This dataset will be read and processed via PySpark. +The Stackable Operator for Apache HDFS will spin up an HDFS cluster to store the taxi dataset in {parquet}[Apache Parquet] format. +This dataset will be read and processed via PySpark. Before trying out the notebook example in Jupyter, check if the taxi data was loaded to HDFS successfully: @@ -103,7 +97,8 @@ proxy-7bf49bb844-mhx66 1/1 Running 0 5m36s zookeeper-server-default-0 1/1 Running 0 5m12s ---- -JupyterHub will create a Pod for each active user. In order to reach the JupyterHub web interface, create a port-forward: +JupyterHub will create a Pod for each active user. +In order to reach the JupyterHub web interface, create a port-forward: [source,console] ---- @@ -145,25 +140,21 @@ Click on the double arrow (⏩️) to execute the Python scripts. image::jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data/jupyter_hub_run_notebook.png[] -You can also inspect the `hdfs` folder where the `core-site.xml` and `hdfs-site.xml` from -the discovery `ConfigMap` of the HDFS cluster are located. +You can also inspect the `hdfs` folder where the `core-site.xml` and `hdfs-site.xml` from the discovery ConfigMap of the HDFS cluster are located. [NOTE] ==== -The image defined for the spark job must contain all dependencies needed for that job to run. For pyspark jobs, this -will mean that Python libraries either need to be baked into the image (this demo contains a Dockerfile that was used to -generate an image containing scikit-learn, pandas and their dependencies) or {spark-pkg}[packaged in some other way]. +The image defined for the spark job must contain all dependencies needed for that job to run. +For pyspark jobs, this will mean that Python libraries either need to be baked into the image (this demo contains a Dockerfile that was used to generate an image containing scikit-learn, pandas and their dependencies) or {spark-pkg}[packaged in some other way]. ==== == Model details -The job uses an implementation of the Isolation Forest {forest-algo}[algorithm] provided by the scikit-learn -{scikit-lib}[library]: the model is trained and then invoked by a user-defined function (see {forest-article}[this -article] for how to call the sklearn library with a pyspark UDF), all of which is run using the Spark executors spun up -in the current SparkSession. This type of model attempts to isolate each data point by continually partitioning the -data. Data closely packed together will require more partitions to separate data points. In contrast, any outliers will -require less: the number of partitions needed for a particular data point is thus inversely proportional to the anomaly -"score". +The job uses an implementation of the Isolation Forest {forest-algo}[algorithm] provided by the scikit-learn {scikit-lib}[library]: +the model is trained and then invoked by a user-defined function (see {forest-article}[this article] for how to call the sklearn library with a pyspark UDF), all of which is run using the Spark executors spun up in the current SparkSession. +This type of model attempts to isolate each data point by continually partitioning the data. +Data closely packed together will require more partitions to separate data points. +In contrast, any outliers will require less: the number of partitions needed for a particular data point is thus inversely proportional to the anomaly "score". == Visualization @@ -171,15 +162,12 @@ The notebook shows how to plot the outliers against a particular metric (e.g. 
"n image::jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data/jupyter_hub_graph.png[] -However, this is mainly for convenience - the anomaly score is derived from the *_entire_* feature space, i.e., it -considers all dimensions (or features/columns) when scoring data, meaning that not only are the results challenging to -visualize (how can multidimensional data be represented in only 3-D dimensional space?), but that a root cause analysis -has to be a separate process. It would be tempting to look at just one metric and assume causal effects, but the model -"sees" all features as a set of numerical values and derives patterns accordingly. +However, this is mainly for convenience - the anomaly score is derived from the *_entire_* feature space, i.e., it considers all dimensions (or features/columns) when scoring data, meaning that not only are the results challenging to visualize (how can multidimensional data be represented in only 3-D dimensional space?), but that a root cause analysis has to be a separate process. +It would be tempting to look at just one metric and assume causal effects, but the model "sees" all features as a set of numerical values and derives patterns accordingly. -We can tackle the first of these issues by collapsing - or projecting - our data into a manageable number of dimensions -that can be plotted. Once the script has finished successfully, plots should be displayed on the bottom that show the -same data in 2D and 3D representation. The 3D plot should look like this: +We can tackle the first of these issues by collapsing - or projecting - our data into a manageable number of dimensions that can be plotted. +Once the script has finished successfully, plots should be displayed on the bottom that show the same data in 2D and 3D representation. +The 3D plot should look like this: image::jupyterhub-pyspark-hdfs-anomaly-detection-taxi-data/jupyter_hub_3d_isolation_forest.png[] diff --git a/docs/modules/demos/pages/logging.adoc b/docs/modules/demos/pages/logging.adoc index ddb87631..0466d0eb 100644 --- a/docs/modules/demos/pages/logging.adoc +++ b/docs/modules/demos/pages/logging.adoc @@ -1,5 +1,6 @@ = logging :page-aliases: stable@stackablectl::demos/logging.adoc +:description: Deploy a logging stack with OpenSearch, Vector, and Zookeeper for log data analysis using OpenSearch Dashboards in Kubernetes. :k8s-cpu: https://kubernetes.io/docs/tasks/debug/debug-cluster/resource-metrics-pipeline/#cpu diff --git a/docs/modules/demos/pages/nifi-kafka-druid-earthquake-data.adoc b/docs/modules/demos/pages/nifi-kafka-druid-earthquake-data.adoc index 847fc05b..8f0d495f 100644 --- a/docs/modules/demos/pages/nifi-kafka-druid-earthquake-data.adoc +++ b/docs/modules/demos/pages/nifi-kafka-druid-earthquake-data.adoc @@ -1,5 +1,6 @@ = nifi-kafka-druid-earthquake-data :page-aliases: stable@stackablectl::demos/nifi-kafka-druid-earthquake-data.adoc +:description: Install this demo for a showcase of using Kafka, Druid and Superset to view the global earthquake distribution. :superset-docs: https://superset.apache.org/docs/using-superset/creating-your-first-dashboard/#creating-charts-in-explore-view :druid-tutorial: https://druid.apache.org/docs/latest/tutorials/tutorial-kafka.html#loading-data-with-the-data-loader @@ -17,8 +18,8 @@ $ stackablectl demo install nifi-kafka-druid-earthquake-data [CAUTION] ==== -This demo only runs in the `default` namespace, as a `ServiceAccount` will be created. 
Additionally, we have to use the -FQDN service names (including the namespace), so that the used TLS certificates are valid. +This demo only runs in the `default` namespace, as a ServiceAccount will be created. +Additionally, we have to use the FQDN service names (including the namespace), so that the used TLS certificates are valid. ==== [#system-requirements] @@ -94,16 +95,15 @@ include::partial$instance-hint.adoc[] == Inspect the data in Kafka -Kafka is an event streaming platform to stream the data in near real-time. All the messages put in and read from Kafka -are structured in dedicated queues called topics. The test data will be put into a topic called earthquakes. The records -are produced (put in) by the test data generator and consumed (read) by Druid afterwards in the same order they were -created. +Kafka is an event streaming platform to stream the data in near real-time. +All the messages put in and read from Kafka are structured in dedicated queues called topics. +The test data will be put into a topic called earthquakes. +The records are produced (written) by the test data generator and consumed (read) by Druid afterwards in the same order they were created. -As Kafka has no web interface, you must use a Kafka client like {kcat}[kcat]. Kafka uses mutual TLS, so clients -wanting to connect to Kafka must present a valid TLS certificate. The easiest way to obtain this is to shell into the -`kafka-broker-default-0` Pod, as we will do in the following section for demonstration purposes. For a production setup, -you should spin up a dedicated Pod provisioned with a certificate acting as a Kafka client instead of shell-ing into the -Kafka Pod. +As Kafka has no web interface, you must use a Kafka client like {kcat}[kcat]. +Kafka uses mutual TLS, so clients wanting to connect to Kafka must present a valid TLS certificate. +The easiest way to obtain this is to shell into the `kafka-broker-default-0` Pod, as we will do in the following section for demonstration purposes. +For a production setup, you should spin up a dedicated Pod provisioned with a certificate acting as a Kafka client instead of shell-ing into the Kafka Pod. === List the available Topics @@ -169,8 +169,8 @@ Below is an example of the output of one record: } ---- -If you are interested in how many records have been produced to the Kafka topic so far, use the following command. It -will print the last record produced to the topic partition, formatted with the pattern specified in the `-f` parameter. +If you are interested in how many records have been produced to the Kafka topic so far, use the following command. +It will print the last record produced to the topic partition, formatted with the pattern specified in the `-f` parameter. The given pattern will print some metadata of the record. [source,console] @@ -186,23 +186,23 @@ Topic earthquakes / Partition 0 / Offset: 385017 / Timestamp: 1680607795571 Topic earthquakes / Partition 0 / Offset: 385018 / Timestamp: 1680607795571 ---- -If you calculate `385,011` records * `8` partitions, you end up with ~ 3,080,088 records. The output also shows that the -last measurement record was produced at the timestamp `1680607795568`, which translates to `Di 4. Apr 13:29:55 CEST 2023` +If you calculate `385,011` records * `8` partitions, you end up with ~ 3,080,088 records. +The output also shows that the last measurement record was produced at the timestamp `1680607795568`, which translates to `Di 4. Apr 13:29:55 CEST 2023` (using the command `date -d @1680607795`). 
== NiFi -NiFi is used to fetch earthquake data from the internet and ingest it into Kafka. This demo includes a workflow -("process group") that downloads a large CSV file, converts it to individual JSON records and produces the records into -Kafka. +NiFi is used to fetch earthquake data from the internet and ingest it into Kafka. +This demo includes a workflow ("process group") that downloads a large CSV file, converts it to individual JSON records and produces the records into Kafka. === View the testdata-generation Job You can have a look at the ingestion job running in NiFi by opening the endpoint `https` from your -`stackablectl stacklet list` command output. In this case, it is https://172.18.0.2:30596. Open it with your favourite -browser. Suppose you get a warning regarding the self-signed certificate generated by the -xref:home:secret-operator:index.adoc[Secret Operator] (e.g. Warning: Potential Security Risk Ahead). In that case, you must -tell your browser to trust the website and continue. +`stackablectl stacklet list` command output. +In this case, it is https://172.18.0.2:30596. +Open it with your favourite browser. +Suppose you get a warning regarding the self-signed certificate generated by the xref:home:secret-operator:index.adoc[Secret Operator] (e.g. Warning: Potential Security Risk Ahead). +In that case, you must tell your browser to trust the website and continue. image::nifi-kafka-druid-earthquake-data/nifi_1.png[] @@ -210,9 +210,9 @@ Log in with the username `admin` and password `adminadmin`. image::nifi-kafka-druid-earthquake-data/nifi_2.png[] -You can see the started ProcessGroup consisting of two processors. The first one - `InvokeHTTP`, fetches the CSV file -from the Internet and puts it into the queue of the next processor. The second processor - `PublishKafkaRecord_2_6`, -parses the CSV file, converts it to JSON records and writes them out into Kafka. +You can see the started ProcessGroup consisting of two processors. +The first one - `InvokeHTTP`, fetches the CSV file from the Internet and puts it into the queue of the next processor. +The second processor - `PublishKafkaRecord_2_6`, parses the CSV file, converts it to JSON records and writes them out into Kafka. Double-click on the `InvokeHTTP` processor to show the processor details. @@ -222,25 +222,24 @@ Head over to the tab `PROPERTIES`. image::nifi-kafka-druid-earthquake-data/nifi_4.png[] -Here, you can see the setting `HTTP URL`, which specifies the download URL from where the CSV file is retrieved. Close -the processor details popup by clicking `OK`. Afterwards, double-click on the processor `PublishKafkaRecord_2_6`. +Here, you can see the setting `HTTP URL`, which specifies the download URL from where the CSV file is retrieved. +Close the processor details popup by clicking `OK`. +Afterwards, double-click on the processor `PublishKafkaRecord_2_6`. image::nifi-kafka-druid-earthquake-data/nifi_5.png[] -The Kafka connection details within this processor - like broker addresses and topic names - are specified. It uses the -`CSVReader` to parse the downloaded CSV and the `JsonRecordSetWriter` to split it into individual JSON records before -writing it out. +The Kafka connection details within this processor - like broker addresses and topic names - are specified. +It uses the `CSVReader` to parse the downloaded CSV and the `JsonRecordSetWriter` to split it into individual JSON records before writing it out. == Druid -Druid is used to ingest the near real-time data from Kafka, store it and enable SQL access. 
The demo has started an -ingestion job reading earthquake records from the Kafka topic earthquakes and saving them into Druid's deep storage. The -Druid deep storage is based on the S3 store provided by MinIO. +Druid is used to ingest the near real-time data from Kafka, store it and enable SQL access. +The demo has started an ingestion job reading earthquake records from the Kafka topic earthquakes and saving them into Druid's deep storage. +The Druid deep storage is based on the S3 store provided by MinIO. === View the Ingestion job -You can have a look at the ingestion job running in Druid by opening the endpoint `router-https` from your -`stackablectl stacklet list` command output (http://172.18.0.2:31642 in this case). +You can have a look at the ingestion job running in Druid by opening the endpoint `router-https` from your `stackablectl stacklet list` command output (http://172.18.0.2:31642 in this case). image::nifi-kafka-druid-earthquake-data/druid_1.png[] @@ -248,30 +247,28 @@ By clicking on `Supervisors` at the top, you can see the running ingestion jobs. image::nifi-kafka-druid-earthquake-data/druid_2.png[] -You can see additional information after clicking on the magnification glass to the right side of the `RUNNING` -supervisor. On the tab `Statistics` on the left, you can see the number of processed records as well as the number of -errors. +You can see additional information after clicking on the magnification glass to the right side of the `RUNNING` supervisor. +On the tab `Statistics` on the left, you can see the number of processed records as well as the number of errors. image::nifi-kafka-druid-earthquake-data/druid_3.png[] -The statistics show that Druid ingested `5074` records during the last minute and has ingested 3 million records already. All -entries have been consumed successfully, indicated by having no `processWithError`, `thrownAway` or `unparseable` -records. +The statistics show that Druid ingested `5074` records during the last minute and has ingested 3 million records already. +All entries have been consumed successfully, indicated by having no `processWithError`, `thrownAway` or `unparseable` records. === Query the Data Source -The ingestion job has automatically created the Druid data source `earthquakes`. You can see the available data sources -by clicking on `Datasources` at the top. +The ingestion job has automatically created the Druid data source `earthquakes`. +You can see the available data sources by clicking on `Datasources` at the top. image::nifi-kafka-druid-earthquake-data/druid_4.png[] -You can see the data source's segments by clicking on `segments` under `Availability` for the `earthquake` data source. In this case, the `earthquake` data -source is partitioned by the year of the earthquakes, resulting in 73 segments. +You can see the data source's segments by clicking on `segments` under `Availability` for the `earthquake` data source. +In this case, the `earthquake` data source is partitioned by the year of the earthquakes, resulting in 73 segments. image::nifi-kafka-druid-earthquake-data/druid_5.png[] -Druid offers a web-based way of querying the data sources via SQL. To achieve this, you must first click on `Query` at -the top. +Druid offers a web-based way of querying the data sources via SQL. +To achieve this, you must first click on `Query` at the top. 
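+
+If you want to try a statement of your own at this point, a simple row count against the `earthquakes` data source is a quick sanity check.
+The query below is only an illustrative example (it is not part of the demo itself), and the exact count depends on how long the ingestion has been running:
+
+[source,sql]
+----
+SELECT COUNT(*) FROM "earthquakes"
+----
+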
image::nifi-kafka-druid-earthquake-data/druid_6.png[] @@ -300,8 +297,8 @@ image::nifi-kafka-druid-earthquake-data/druid_8.png[] == Superset -Superset provides the ability to execute SQL queries and build dashboards. Open the endpoint `external-http` in your -browser (http://172.18.0.2:32569 in this case). +Superset provides the ability to execute SQL queries and build dashboards. +Open the endpoint `external-http` in your browser (http://172.18.0.2:32569 in this case). image::nifi-kafka-druid-earthquake-data/superset_1.png[] @@ -311,22 +308,25 @@ image::nifi-kafka-druid-earthquake-data/superset_2.png[] === View the dashboard -The demo has created a Dashboard to visualize the earthquake data. To open it, click on the tab `Dashboards` at the top. +The demo has created a Dashboard to visualize the earthquake data. +To open it, click on the tab `Dashboards` at the top. image::nifi-kafka-druid-earthquake-data/superset_3.png[] -Click on the dashboard called `Earthquakes`. It might take some time until the dashboard renders all included charts. +Click on the dashboard called `Earthquakes`. +It might take some time until the dashboard renders all included charts. image::nifi-kafka-druid-earthquake-data/superset_4.png[] === View the charts -The dashboard `Earthquakes` consists of multiple charts. To list the charts, click on the tab `Charts` at the top. +The dashboard `Earthquakes` consists of multiple charts. +To list the charts, click on the tab `Charts` at the top. image::nifi-kafka-druid-earthquake-data/superset_5.png[] -Click on the Chart `Number of earthquakes by magnitude`. On the left side you can modify the chart and click on `Update Chart` to -see the effect. +Click on the Chart `Number of earthquakes by magnitude`. +On the left side you can modify the chart and click on `Update Chart` to see the effect. image::nifi-kafka-druid-earthquake-data/superset_6.png[] @@ -337,23 +337,23 @@ Afterwards click on the chart `Earthquake distribution`. image::nifi-kafka-druid-earthquake-data/superset_7.png[] -The distribution of the earthquakes matches the continental plate margins. This is the expected distribution from the -{wikipedia}[Wikipedia article on Earthquakes]. +The distribution of the earthquakes matches the continental plate margins. +This is the expected distribution from the {wikipedia}[Wikipedia article on Earthquakes]. -You can move and zoom the map with your mouse to interactively explore the map. You can e.g. have a detailed look at the -detected earthquakes in Germany. +You can move and zoom the map with your mouse to interactively explore the map. +You can e.g. have a detailed look at the detected earthquakes in Germany. image::nifi-kafka-druid-earthquake-data/superset_8.png[] -You can also click on the magnitudes in the legend on the top right side to enable/disable printing the earthquakes of -that magnitude. By only enabling magnitudes greater or equal to 8 you can plot only the most severe earthquakes. +You can also click on the magnitudes in the legend on the top right side to enable/disable printing the earthquakes of that magnitude. +By only enabling magnitudes greater or equal to 8 you can plot only the most severe earthquakes. image::nifi-kafka-druid-earthquake-data/superset_9.png[] === Execute arbitrary SQL statements -Within Superset you can not only create dashboards but also run arbitrary SQL statements. On the top click on the tab -`SQL` -> `SQL Lab`. +Within Superset you can not only create dashboards but also run arbitrary SQL statements. 
+On the top click on the tab `SQL` -> `SQL Lab`. image::nifi-kafka-druid-earthquake-data/superset_10.png[] @@ -361,7 +361,8 @@ On the left select the database `druid`, the schema `druid` and set `See table s image::nifi-kafka-druid-earthquake-data/superset_11.png[] -On the right textbox enter the desired SQL statement. If you do not want to make one up, you can use the following: +On the right textbox enter the desired SQL statement. +If you do not want to make one up, you can use the following: [source,sql] ---- @@ -377,8 +378,8 @@ image::nifi-kafka-druid-earthquake-data/superset_12.png[] == MinIO -The S3 provided by MinIO is used as a persistent deep storage for Druid to store all the data used. Open the `minio` -endpoint `http` in your browser (http://172.18.0.2:30902 in this case). +The S3 provided by MinIO is used as a persistent deep storage for Druid to store all the data used. +Open the `minio` endpoint `http` in your browser (http://172.18.0.2:30902 in this case). image::nifi-kafka-druid-earthquake-data/minio_1.png[] @@ -390,26 +391,26 @@ Click on the bucket `demo` and open the folders `data` -> `earthquakes`. image::nifi-kafka-druid-earthquake-data/minio_3.png[] -As you can see Druid saved 201.5 MiB of data within 73 prefixes (folders). One prefix corresponds to one segment which in -turn contains all the data of a year. If you don't see any folders or files, the reason is that Druid has not saved its -data from memory to the deep storage yet. After waiting for roughly an hour, the data should have been flushed to S3 and -show up. +As you can see Druid saved 201.5 MiB of data within 73 prefixes (folders). +One prefix corresponds to one segment which in turn contains all the data of a year. +If you don't see any folders or files, the reason is that Druid has not saved its data from memory to the deep storage yet. +After waiting for roughly an hour, the data should have been flushed to S3 and show up. image::nifi-kafka-druid-earthquake-data/minio_4.png[] -If you open up a prefix for a specific year you can see that Druid has placed a file containing the data of that year -there. +If you open up a prefix for a specific year you can see that Druid has placed a file containing the data of that year there. == Summary The demo streamed 10,000 earthquake records/s for a total of ~3 million earthquakes into a Kafka steaming pipeline. -Druid ingested the data near real-time into its data source and enabled SQL access to it. Superset was used as a -web-based frontend to execute SQL statements and build dashboards. +Druid ingested the data near real-time into its data source and enabled SQL access to it. +Superset was used as a web-based frontend to execute SQL statements and build dashboards. == Where to go from here -There are multiple paths to go from here. The following sections give you some ideas on what to explore next. You can -find the description of the earthquake data {earthquake}[on the United States Geological Survey website]. +There are multiple paths to go from here. +The following sections give you some ideas on what to explore next. +You can find the description of the earthquake data {earthquake}[on the United States Geological Survey website]. === Execute arbitrary SQL statements @@ -417,13 +418,13 @@ Within Superset (or the Druid web interface), you can execute arbitrary SQL stat === Create additional dashboards -You also can create additional charts and bundle them together in a Dashboard. Have a look at -{superset-docs}[the Superset documentation] on how to do that. 
+You also can create additional charts and bundle them together in a Dashboard.
+Have a look at {superset-docs}[the Superset documentation] on how to do that.
 
 === Load additional data
 
-You can use the NiFi web interface to collect arbitrary data and write it to Kafka (it's recommended to use new Kafka
-topics for that). Alternatively, you can use a Kafka client like {kcat}[kcat] to create new topics and ingest data.
-Using the Druid web interface, you can start an ingestion job that consumes and stores the data in an internal data
-source. There is an excellent {druid-tutorial}[tutorial] from Druid on how to do this. Afterwards, the data source can
-be analyzed within Druid and Superset, like the earthquake data.
+You can use the NiFi web interface to collect arbitrary data and write it to Kafka (it's recommended to use new Kafka topics for that).
+Alternatively, you can use a Kafka client like {kcat}[kcat] to create new topics and ingest data.
+Using the Druid web interface, you can start an ingestion job that consumes and stores the data in an internal data source.
+There is an excellent {druid-tutorial}[tutorial] from Druid on how to do this.
+Afterwards, the data source can be analyzed within Druid and Superset, like the earthquake data.
diff --git a/docs/modules/demos/pages/nifi-kafka-druid-water-level-data.adoc b/docs/modules/demos/pages/nifi-kafka-druid-water-level-data.adoc
index 54e2924b..aaee353b 100644
--- a/docs/modules/demos/pages/nifi-kafka-druid-water-level-data.adoc
+++ b/docs/modules/demos/pages/nifi-kafka-druid-water-level-data.adoc
@@ -1,5 +1,6 @@
 = nifi-kafka-druid-water-level-data
 :page-aliases: stable@stackablectl::demos/nifi-kafka-druid-water-level-data.adoc
+:description: Install this demo for a showcase of using Kafka, Druid and Superset to visualize water levels across Germany.
 
 :superset: https://superset.apache.org/docs/using-superset/creating-your-first-dashboard/#creating-charts-in-explore-view
 :druid-tutorial: https://druid.apache.org/docs/latest/tutorials/tutorial-kafka.html#loading-data-with-the-data-loader
@@ -528,7 +529,7 @@ What might also be interesting is the average and current measurement of the sta
 
 [source,sql]
 ----
-select 
+select
   stations.longname as station,
   avg("value") as avg_measurement,
   latest_by("value", measurements."__time") as current_measurement,
diff --git a/docs/modules/demos/pages/signal-processing.adoc b/docs/modules/demos/pages/signal-processing.adoc
index 4da8589c..987ccd9d 100644
--- a/docs/modules/demos/pages/signal-processing.adoc
+++ b/docs/modules/demos/pages/signal-processing.adoc
@@ -1,4 +1,5 @@
 = signal-processing
+:description: Install a Kubernetes demo to process time-series data using Apache NiFi, TimescaleDB, Grafana, and JupyterHub for real-time insights.
 
 :k8s-cpu: https://kubernetes.io/docs/tasks/debug/debug-cluster/resource-metrics-pipeline/#cpu
 
@@ -9,10 +10,7 @@ Install this demo on an existing Kubernetes cluster:
 ----
 $ stackablectl demo install signal-processing
 ----
 
-[WARNING]
-====
-This demo should not be run alongside other demos.
-====
+WARNING: This demo should not be run alongside other demos.
 
 [#system-requirements]
 == System Requirements
@@ -42,10 +40,10 @@ image::signal-processing/overview.png[]
 
 == Data ingestion
 
-The data used in this demo is a set of gas sensor measurements*. 
-The dataset provides resistance values (r-values hereafter) for each of 14 gas sensors. 
-In order to simulate near-real-time ingestion of this data, it is downloaded and batch-inserted into a Timescale table. 
-It's then updated in-place retaining the same time offsets but shifting the timestamps such that the notebook code can "move through" the data using windows as if it were being streamed. 
+The data used in this demo is a set of gas sensor measurements*.
+The dataset provides resistance values (r-values hereafter) for each of 14 gas sensors.
+In order to simulate near-real-time ingestion of this data, it is downloaded and batch-inserted into a Timescale table.
+It's then updated in-place retaining the same time offsets but shifting the timestamps such that the notebook code can "move through" the data using windows as if it were being streamed.
 The Nifi flow that does this can easily be extended to process other sources of (actually streamed) data.
 
 == JupyterHub
@@ -71,9 +69,9 @@ The notebook reads the measurement data in windowed batches using a loop, comput
 
 The enriched data is calculated using an online, unsupervised https://docs.seldon.io/projects/alibi-detect/en/stable/od/methods/sr.html[model] that uses a technique called http://www.houxiaodi.com/assets/papers/cvpr07.pdf[Spectral Residuals].
 
-- `online`: in this context this means that the model does not require a training phase and can apply the algorithm to a batch in isolation. However, a light-weight training phase can be introduced if an estimated threshold value is required. This would take the form of a single batch "up front" used to determine some baselines. This is not implemented in this demo.
-- `unsupervised`: the data is not labelled and the algorithm seeks to extract useful information (alerts etc.) from the raw data alone.
-- `spectral residuals`: this algorithm executes a double Fourier transformation to reconstruct the original data, with the reconstruction error being used as a measure of outlier-ness.
+* `online`: in this context this means that the model does not require a training phase and can apply the algorithm to a batch in isolation. However, a light-weight training phase can be introduced if an estimated threshold value is required. This would take the form of a single batch "up front" used to determine some baselines. This is not implemented in this demo.
+* `unsupervised`: the data is not labelled and the algorithm seeks to extract useful information (alerts etc.) from the raw data alone.
+* `spectral residuals`: this algorithm executes a double Fourier transformation to reconstruct the original data, with the reconstruction error being used as a measure of outlier-ness.
 
 == Visualization
 
@@ -95,17 +93,22 @@ $ stackablectl stacklet list
 ----
 
-Log in to Grafana with username `admin` and password `adminadmin` and navigate to the dashboards. There are two located in the "Stackable Data Platform" folder.
+Log in to Grafana with username `admin` and password `adminadmin` and navigate to the dashboards.
+There are two located in the "Stackable Data Platform" folder.
 
 === Measurements
 
-This is the original data. The first graph plots two measurments (`r1`, `r2`), together with the model scores (`r1_score`, `r2_score`, `r1_score_lttb`). These are superimposed on each other for ease of comparison.
+This is the original data. The first graph plots two measurements (`r1`, `r2`), together with the model scores (`r1_score`, `r2_score`, `r1_score_lttb`).
+These are superimposed on each other for ease of comparison.
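+
+If you would rather look at the raw numbers than the rendered graphs, the same data can also be queried directly from the Timescale database, for example via Grafana's Explore view.
+The snippet below is only a sketch: the table name `measurements` and the timestamp column `time` are assumptions for illustration (check which table the NiFi flow writes to in your installation), while `r1` and `r1_score` correspond to the series plotted in the dashboard:
+
+[source,sql]
+----
+-- Illustrative only: the table and timestamp column names are assumptions,
+-- r1 and r1_score match the series shown in the Measurements dashboard.
+SELECT time, r1, r1_score
+FROM measurements
+ORDER BY time DESC
+LIMIT 100;
+----
+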
image::signal-processing/measurements.png[] === Predictions -In this second dashboard the predictions for all r-values are plotted: the top graph takes an average across all measurements, with a threshold marked as a red line across the top. This can be used for triggering email alerts. Underneath the individual r-values are plotted, firstly as raw data and then the same using downsampling. Downsampling uses a built-in Timescale extension to significantly reduce the number of data plotted while retaining the same overall shape. +In this second dashboard the predictions for all r-values are plotted: the top graph takes an average across all measurements, with a threshold marked as a red line across the top. +This can be used for triggering email alerts. +Underneath the individual r-values are plotted, firstly as raw data and then the same using downsampling. +Downsampling uses a built-in Timescale extension to significantly reduce the number of data plotted while retaining the same overall shape. image::signal-processing/predictions.png[] diff --git a/docs/modules/demos/pages/spark-k8s-anomaly-detection-taxi-data.adoc b/docs/modules/demos/pages/spark-k8s-anomaly-detection-taxi-data.adoc index 6308a3a7..319c523e 100644 --- a/docs/modules/demos/pages/spark-k8s-anomaly-detection-taxi-data.adoc +++ b/docs/modules/demos/pages/spark-k8s-anomaly-detection-taxi-data.adoc @@ -1,5 +1,6 @@ = spark-k8s-anomaly-detection-taxi-data :page-aliases: stable@stackablectl::demos/spark-k8s-anomaly-detection-taxi-data.adoc +:description: Deploy a Kubernetes-based Spark demo for anomaly detection using the popular New York taxi dataset, featuring Trino, Spark, MinIO, and Superset. :scikit-lib: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html :k8s-cpu: https://kubernetes.io/docs/tasks/debug/debug-cluster/resource-metrics-pipeline/#cpu @@ -87,8 +88,8 @@ include::partial$instance-hint.adoc[] === List Buckets -The S3 provided by MinIO is used as persistent storage to store all the data used. Open the endpoint `http` -retrieved by `stackablectl stacklet list` in your browser (http://172.18.0.2:32276 in this case). +The S3 provided by MinIO is used as persistent storage to store all the data used. +Open the endpoint `http` retrieved by `stackablectl stacklet list` in your browser (http://172.18.0.2:32276 in this case). image::spark-k8s-anomaly-detection-taxi-data/minio_0.png[] @@ -109,10 +110,9 @@ Click on the bucket `demo` and then on `ny-taxi-data` and `raw` respectively. image::spark-k8s-anomaly-detection-taxi-data/minio_3.png[] -This folder (called prefixes in S3) contains a dataset of similarly structured data files. The data is partitioned by month -and contains several hundred MBs, which may seem small for a dataset. Still, the model is a time-series model where the -data has decreasing relevance the "older" it is, especially when the data is subject to multiple external factors, many -of which are unknown and fluctuating in scope and effect. +This folder (called prefixes in S3) contains a dataset of similarly structured data files. +The data is partitioned by month and contains several hundred MBs, which may seem small for a dataset. +Still, the model is a time-series model where the data has decreasing relevance the "older" it is, especially when the data is subject to multiple external factors, many of which are unknown and fluctuating in scope and effect. 
The second bucket prediction contains the output from the model scoring process under `prediction/anomaly-detection/iforest/data`: @@ -122,15 +122,12 @@ This is a much smaller file, as it only contains scores for each aggregated peri == Spark -The Spark job ingests the raw data and performs straightforward data wrangling and feature engineering. Any windowing -features designed to capture the time-series nature of the data - such as lags or rolling averages - need to use evenly -distributed partitions so that Spark can execute these tasks in parallel. The job uses an implementation of the -Isolation Forest {forest-algo}[algorithm] provided by the scikit-learn {scikit-lib}[library]: the model is trained in a -single task but is then distributed to each executor from where a user-defined function invokes it (see -{forest-article}[this article] for how to call the sklearn library with a pyspark UDF). The Isolation Forest algorithm -is used for unsupervised model training, meaning that a labelled set of data - against which the model is trained - is -unnecessary. This makes model preparation easier as we do not have to divide the data set into training and validation -datasets. +The Spark job ingests the raw data and performs straightforward data wrangling and feature engineering. +Any windowing features designed to capture the time-series nature of the data - such as lags or rolling averages - need to use evenly distributed partitions so that Spark can execute these tasks in parallel. +The job uses an implementation of the Isolation Forest {forest-algo}[algorithm] provided by the scikit-learn {scikit-lib}[library]: +the model is trained in a single task but is then distributed to each executor from where a user-defined function invokes it (see {forest-article}[this article] for how to call the sklearn library with a pyspark UDF). +The Isolation Forest algorithm is used for unsupervised model training, meaning that a labelled set of data - against which the model is trained - is unnecessary. +This makes model preparation easier as we do not have to divide the data set into training and validation datasets. You can inspect a running Spark job by forwarding the port used by the Spark-UI: @@ -145,14 +142,13 @@ image::spark-k8s-anomaly-detection-taxi-data/spark_job.png[] == Dashboard -Open the `external-http` Superset endpoint found in the output of the `stackablectl stacklet list` command. The anomaly detection -dashboard is pre-defined and accessible under the `Dashboards` tab when logged in to Superset using the username `admin` -password `adminadmin`: +Open the `external-http` Superset endpoint found in the output of the `stackablectl stacklet list` command. +The anomaly detection dashboard is pre-defined and accessible under the `Dashboards` tab when logged in to Superset using the username `admin` password `adminadmin`: image::spark-k8s-anomaly-detection-taxi-data/superset_anomaly_scores.png[] -The model does not yield data that can be used directly for a root cause analysis. An isolation forest is a type of -random forest that measures how many branches are needed in its underlying decision trees to isolate each data point: +The model does not yield data that can be used directly for a root cause analysis. 
+An isolation forest is a type of random forest that measures how many branches are needed in its underlying decision trees to isolate each data point: the more abnormal the data, the easier this will be - a clear outlier may only need a single partition to isolate it, -whereas tightly clustered data will require significantly more. The number of partitions to isolate is, therefore, in -inverse proportion to the anomaly-ness of the data. +whereas tightly clustered data will require significantly more. +The number of partitions to isolate is, therefore, in inverse proportion to the anomaly-ness of the data. diff --git a/docs/modules/demos/pages/trino-iceberg.adoc b/docs/modules/demos/pages/trino-iceberg.adoc index ef9601b6..ef6439f5 100644 --- a/docs/modules/demos/pages/trino-iceberg.adoc +++ b/docs/modules/demos/pages/trino-iceberg.adoc @@ -1,5 +1,6 @@ = trino-iceberg :page-aliases: stable@stackablectl::demos/trino-iceberg.adoc +:description: Install and explore Trino with Apache Iceberg for efficient SQL queries and scalable data management in a demo environment. :k8s-cpu: https://kubernetes.io/docs/tasks/debug/debug-cluster/resource-metrics-pipeline/#cpu :tcph-spec: https://www.tpc.org/tpc_documents_current_versions/pdf/tpc-h_v3.0.1.pdf @@ -11,10 +12,7 @@ Install this demo on an existing Kubernetes cluster: $ stackablectl demo install trino-iceberg ---- -[WARNING] -==== -This demo should not be run alongside other demos. -==== +WARNING: This demo should not be run alongside other demos. [#system-requirements] == System requirements @@ -27,10 +25,9 @@ To run this demo, your system needs at least: [NOTE] ==== -This demo is a condensed form of the xref:data-lakehouse-iceberg-trino-spark.adoc[] demo focusing on using the -lakehouse to store and modify data. It focuses on the Trino and Iceberg integration and should run on your local -workstation. If you want a more complex lakehouse setup, please look at the -xref:data-lakehouse-iceberg-trino-spark.adoc[] demo. +This demo is a condensed form of the xref:data-lakehouse-iceberg-trino-spark.adoc[] demo focusing on using the lakehouse to store and modify data. +It focuses on the Trino and Iceberg integration and should run on your local workstation. +If you want a more complex lakehouse setup, look at the xref:data-lakehouse-iceberg-trino-spark.adoc[] demo. ==== == Overview @@ -71,22 +68,21 @@ include::partial$instance-hint.adoc[] == MinIO -You can view the available buckets and objects (think of files) described in the -xref:data-lakehouse-iceberg-trino-spark.adoc#_minio[data-lakehouse-iceberg-trino-spark] demo. Currently, the bucket `lakehouse` -is still empty, but will be filled during the next steps. +You can view the available buckets and objects (think of files) described in the xref:data-lakehouse-iceberg-trino-spark.adoc#_minio[data-lakehouse-iceberg-trino-spark] demo. +Currently, the bucket `lakehouse` is still empty, but will be filled during the next steps. == Connect to Trino -Have a look at the xref:trino:usage-guide/connect_to_trino#_connect_with_dbeaver[documentation] on how to -connect with DBeaver. As an alternative, you can use https://trino.io/download.html[trino-cli] by running: +Have a look at the xref:trino:usage-guide/connect_to_trino#_connect_with_dbeaver[documentation] on how to connect with DBeaver. 
+As an alternative, you can use https://trino.io/download.html[trino-cli] by running: [source,console] ---- $ java -jar ~/Downloads/trino-cli-451-executable.jar --user admin --insecure --password --server https://172.18.0.2:30856 ---- -Make sure to replace the server endpoint with the endpoint listed in the `stackablectl stacklet list` output. When -prompted, enter the password `adminadmin`. +Make sure to replace the server endpoint with the endpoint listed in the `stackablectl stacklet list` output. +When prompted, enter the password `adminadmin`. == Create test data @@ -99,8 +95,7 @@ First, you must create a schema in the lakehouse to store the test data: CREATE SCHEMA lakehouse.tpch WITH (location = 's3a://lakehouse/tpch'); ---- -Afterwards, you can set the context to the freshly created schema so that you don't need to write out every table as -`lakehouse.tpch.` but instead can use `` directly. +Afterwards, you can set the context to the freshly created schema so that you don't need to write out every table as `lakehouse.tpch.` but instead can use `` directly. [source,sql] ---- @@ -109,9 +104,8 @@ use lakehouse.tpch; === Create the tables -You can use the https://www.tpc.org/tpch/[TPC-H dataset] to have some test data to work with. Trino offers a special -https://trino.io/docs/current/connector/tpch.html[TPCH connector] that generates the test data deterministically on the -fly. +You can use the https://www.tpc.org/tpch/[TPC-H dataset] to have some test data to work with. +Trino offers a special https://trino.io/docs/current/connector/tpch.html[TPCH connector] that generates the test data deterministically on the fly. You can list the tables that are part of the dataset using: @@ -132,14 +126,14 @@ show tables in tpch.sf5; (8 rows) ---- -The dataset comes with different scale factors. This demo is intended to run on a laptop, so it starts with a scale -factor of 5 (hence the `sf5` in the above query). If you have a sufficiently large S3 and Trino deployed, you can easily -re-run the statements below with a different scale factor. This demo has been tested up to a scale factor of 10000, but -you can choose any scale in between or even more if desired. +The dataset comes with different scale factors. +This demo is intended to run on a laptop, so it starts with a scale factor of 5 (hence the `sf5` in the above query). +If you have a sufficiently large S3 and Trino deployed, you can easily re-run the statements below with a different scale factor. +This demo has been tested up to a scale factor of 10000, but you can choose any scale in between or even more if desired. -If you have decided on your scale factor, run the queries below to create tables in the lakehouse and propagate them -with test data. Depending on the scale factor, this can take considerable time (the queries are ordered by size -ascending). The progress of the query can be tracked in the web interface. +After you have decided on your scale factor, run the queries below to create tables in the lakehouse and propagate them with test data. +Depending on the scale factor, this can take considerable time (the queries are ordered by size ascending). +The progress of the query can be tracked in the web interface. [source,sql] ---- @@ -171,8 +165,7 @@ drop table if exists customers_to_prioritize; ---- ==== -Afterwards, your database overview in DBeaver should look like the following (you might need to refresh the contents -with `F5`). 
+Afterwards, your database overview in DBeaver should look like the following (you might need to refresh the contents with `F5`). image::trino-iceberg/dbeaver_1.png[] @@ -180,7 +173,7 @@ image::trino-iceberg/dbeaver_1.png[] === Basic table information -To create a view containing some basic information about the tables, please execute the statement below: +To create a view containing some basic information about the tables, execute the statement below: .Statement to create table_information view [%collapsible] @@ -256,8 +249,8 @@ select * from table_information order by records desc; === Query the data -You can now use standard SQL to analyze the data. The relation of the tables to each other is explained in the -{tcph-spec}[TPC-H specification] and looks as follows: +You can now use standard SQL to analyze the data. +The relation of the tables to each other is explained in the {tcph-spec}[TPC-H specification] and looks as follows: image::trino-iceberg/tpch_schema.png[] @@ -289,27 +282,26 @@ order by returnflag, linestatus; (4 rows) ---- -The query is inspired by the first query `Q1` of the {tcph-spec}[TPC-H benchmark]. The only difference is that the -`where shipdate <= date '1998-12-01' - interval '[DELTA]' day` clause was omitted to produce a full-table scan. +The query is inspired by the first query `Q1` of the {tcph-spec}[TPC-H benchmark]. +The only difference is that the `where shipdate <= date '1998-12-01' - interval '[DELTA]' day` clause was omitted to produce a full-table scan. === Row-level deletes -So far, the tables have been written once and have only been read afterwards. Trino - combined with Iceberg - can read -data and do row-level deletes (deleting single rows out of a table). They achieve this by writing so-called "delete -files", which mark rows for deletion. +So far, the tables have been written once and have only been read afterwards. +Trino - combined with Iceberg - can read data and do row-level deletes (deleting single rows out of a table). +They achieve this by writing so-called "delete files", which mark rows for deletion. -First, imagine a situation where some customers want all their data deleted. You track all the deletion requests in a -dedicated table and have a nightly job that deletes all your data about the user. Let's create a table -`customers_to_delete` containing a random sample of 1% of our user base. You can leave the command unchanged if you run -with a larger scale factor. +First, imagine a situation where some customers want all their data deleted. +You track all the deletion requests in a dedicated table and have a nightly job that deletes all your data about the user. +Let's create a table `customers_to_delete` containing a random sample of 1% of our user base. +You can leave the command unchanged if you run with a larger scale factor. [source,sql] ---- create table customers_to_delete as select custkey from customer tablesample bernoulli (1); ---- -If you want to add new users to be scheduled for deletion, you can insert new users into the `customers_to_delete` table -using the following query: +If you want to add new users to be scheduled for deletion, you can insert new users into the `customers_to_delete` table using the following query: .Statement to add new users to customers_to_delete [%collapsible] @@ -320,8 +312,7 @@ insert into customers_to_delete select custkey from customer tablesample bernoul ---- ==== -The next step is the actual deletion process. 
It starts with the `lineitem` table and deletes all items in the -customer's orders: +The next step is the actual deletion process. It starts with the `lineitem` table and deletes all items in the customer's orders: [source,sql] ---- @@ -344,7 +335,8 @@ As a last step the actual users get deleted: delete from customer where custkey in (select custkey from customers_to_delete); ---- -Let's check that we actually deleted the data. Both of the queries below should return `0`: +Let's check that we actually deleted the data. +Both of the queries below should return `0`: [source,sql] ---- @@ -354,8 +346,9 @@ select count(*) from orders where custkey in (select custkey from customers_to_d === Row Level Updates -Iceberg does not only offer row-level deletes but also updates. For example, a customer relocation requires an address -change. The customer is identified by its key `112501` and name `Customer#000112501`. +Iceberg does not only offer row-level deletes but also updates. +For example, a customer relocation requires an address change. +The customer is identified by its key `112501` and name `Customer#000112501`. First, let's see its current status: @@ -379,12 +372,12 @@ Afterwards, the records should look the same as before, with the difference that === The MERGE INTO Statement -Trino also offers a https://trino.io/docs/current/sql/merge.html[MERGE INTO] statement, which gives you great -flexibility. +Trino also offers a https://trino.io/docs/current/sql/merge.html[MERGE INTO] statement, which gives you great flexibility. -We want to pick some customers and give them VIP status to show their usage. We do this by giving all of their orders -maximum priority. We could easily do this with an `UPDATE` statement, but here, we want to add some additional -requirements and use the `MERGE INTO` statement. So, we need to track the previous priority. +We want to pick some customers and give them VIP status to show their usage. +We do this by giving all of their orders maximum priority. +We could easily do this with an `UPDATE` statement, but here, we want to add some additional requirements and use the `MERGE INTO` statement. +So, we need to track the previous priority. Let's inspect the `orders` table first: @@ -486,22 +479,22 @@ select orderpriority_prev, count(*) from orders where custkey in (select custkey == Scaling up to a larger amount of data -So far, we have executed all the queries against a dataset created from TPC-H with a scale factor of 5. The demo can -handle much larger data volumes. +So far, we have executed all the queries against a dataset created from TPC-H with a scale factor of 5. +The demo can handle much larger data volumes. -This section describes how to scale up your environment to drop and re-create the tables with a more significant scale -factor. After creating the tables, you should be able to execute the above queries again without changing anything. +This section describes how to scale up your environment to drop and re-create the tables with a more significant scale factor. +After creating the tables, you should be able to execute the above queries again without changing anything. [NOTE] ==== -Your Kubernetes cluster must be large enough to handle the scale-up. If you are running, e.g. on your local machine and -try to spin up 8 Trino workers with 16GB RAM each, chances are high that Pods will be stuck in `Pending` as the -resources required can't be fulfilled. +Your Kubernetes cluster must be large enough to handle the scale-up. +If you are running, e.g. 
on your local machine and try to spin up 8 Trino workers with 16GB RAM each, chances are high that Pods will be stuck in `Pending` as the resources required can't be fulfilled. ==== === Scale Trino -Run `kubectl edit trinocluster trino`. Modify the following settings to your needs: +Run `kubectl edit trinocluster trino`. +Modify the following settings to your needs: [source,yaml] ---- diff --git a/docs/modules/demos/pages/trino-taxi-data.adoc b/docs/modules/demos/pages/trino-taxi-data.adoc index c5d51ae3..e8e9328f 100644 --- a/docs/modules/demos/pages/trino-taxi-data.adoc +++ b/docs/modules/demos/pages/trino-taxi-data.adoc @@ -1,5 +1,6 @@ = trino-taxi-data :page-aliases: stable@stackablectl::demos/trino-taxi-data.adoc +:description: Install and demo Trino with NYC taxi data: Query with SQL, visualize with Superset, and explore data in MinIO and Trino on Kubernetes. :superset-docs: https://superset.apache.org/docs/creating-charts-dashboards/creating-your-first-dashboard#creating-charts-in-explore-view :nyc-website: https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf @@ -301,13 +302,12 @@ returns === Create Additional Dashboards -You also have the possibility to create additional charts and bundle them together in a Dashboard. Have a look at -{superset-docs}[the Superset documentation] on how to do that. +You also have the possibility to create additional charts and bundle them together in a Dashboard. Have a look at {superset-docs}[the Superset documentation] on how to do that. === Load Additional Data -You can use the MinIO webinterface to upload additional data. As an alternative you can use the S3 API with an S3 client -like https://s3tools.org/s3cmd[s3cmd]. It is recommended to put the data into a folder (prefix) in the `demo` bucket. +You can use the MinIO webinterface to upload additional data. As an alternative you can use the S3 API with an S3 client like https://s3tools.org/s3cmd[s3cmd]. +It is recommended to put the data into a folder (prefix) in the `demo` bucket. Have a look at the defined tables inside the `hive.demo` schema on how to inform Trino about the newly available data. @@ -340,8 +340,7 @@ CREATE TABLE hive.demo.ny_taxi_data_raw ( ) ---- -If you want to transform or filter your data in any way before using it e.g. in Superset you can create a view as -follows: +If you want to transform or filter your data in any way before using it e.g. in Superset you can create a view as follows: [source,sql] ---- @@ -372,5 +371,4 @@ AND tpep_pickup_datetime <= from_iso8601_timestamp('2022-05-31T00:00:00') === Connect to Trino via CLI, Python or DBeaver -If you prefer running your SQL statements via command-line, a Python script or a graphical database manager like DBeaver -please have a look at the {trino-client-docs}[the Trino documentation] on how to do that. +If you prefer running your SQL statements via command-line, a Python script or a graphical database manager like DBeaver, consult the {trino-client-docs}[the Trino documentation] on how to do that. diff --git a/stacks/stacks-v2.yaml b/stacks/stacks-v2.yaml index 757dd40f..12a55e9c 100644 --- a/stacks/stacks-v2.yaml +++ b/stacks/stacks-v2.yaml @@ -1,10 +1,11 @@ +--- stacks: monitoring: description: Stack containing Prometheus and Grafana stackableRelease: 24.7 stackableOperators: - - commons - - listener + - commons + - listener labels: - monitoring - prometheus @@ -538,7 +539,7 @@ stacks: description: >- A stack used to demonstrate an end-to-end security concept. 
- Please note that this stack is tightly coupled with the demo. + Note that this stack is tightly coupled with the demo. So if you install the stack you will get demo-specific parts (such as Keycloak users or regorules). stackableRelease: 24.7 stackableOperators: From 7f4565b4abf6b8f270862897136aa6bae73c6399 Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Wed, 18 Sep 2024 11:15:19 +0200 Subject: [PATCH 5/5] chore: Upgrade the Vector aggregator --- stacks/_templates/vector-aggregator.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stacks/_templates/vector-aggregator.yaml b/stacks/_templates/vector-aggregator.yaml index cdedc2ee..91bbdbd6 100644 --- a/stacks/_templates/vector-aggregator.yaml +++ b/stacks/_templates/vector-aggregator.yaml @@ -3,11 +3,11 @@ name: vector repo: name: vector url: https://helm.vector.dev -version: 0.35.0 # app version 0.40.0 +version: 0.36.1 # app version 0.41.1 options: commonLabels: stackable.tech/vendor: Stackable - podLabels: # Doesn't seem to work? + podLabels: stackable.tech/vendor: Stackable role: Aggregator customConfig: @@ -25,7 +25,7 @@ options: - https://opensearch-cluster-master.default.svc.cluster.local:9200 mode: bulk # The auto-detection of the API version does not work in Vector - # 0.39.0 for OpenSearch, so the version must be set explicitly + # 0.41.1 for OpenSearch, so the version must be set explicitly # (see https://github.com/vectordotdev/vector/issues/17690). api_version: v8 tls: