From 048def240657da883a29c6149b14434060439e89 Mon Sep 17 00:00:00 2001 From: Lukasz Staniszewski <59453698+lukasz-staniszewski@users.noreply.github.com> Date: Fri, 5 Jan 2024 01:50:39 +0100 Subject: [PATCH] feat(phase 2a): task 7 + task 8 solution (#20) --- notebooks/tpc-di-setup-dev.ipynb | 1096 +++++++++++++++++------------- tasks-phase2a.md | 270 ++++++-- 2 files changed, 826 insertions(+), 540 deletions(-) diff --git a/notebooks/tpc-di-setup-dev.ipynb b/notebooks/tpc-di-setup-dev.ipynb index 6653c32..1093c1e 100644 --- a/notebooks/tpc-di-setup-dev.ipynb +++ b/notebooks/tpc-di-setup-dev.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 32, + "execution_count": 1, "id": "90a0ed7a-a23a-4523-9171-f1d076700168", "metadata": { "pycharm": { @@ -28,45 +28,10 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "a0fbec0d-d410-4de9-8dfd-00b07fdb7b89", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: typer==0.9.0 in /usr/local/lib/python3.8/dist-packages (from typer[All]==0.9.0) (0.9.0)\n", - "Requirement already satisfied: google-cloud-storage==2.13.0 in /usr/local/lib/python3.8/dist-packages (2.13.0)\n", - "Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.8/dist-packages (from typer==0.9.0->typer[All]==0.9.0) (8.1.7)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.8/dist-packages (from typer==0.9.0->typer[All]==0.9.0) (4.9.0)\n", - "Requirement already satisfied: google-auth<3.0dev,>=2.23.3 in /usr/local/lib/python3.8/dist-packages (from google-cloud-storage==2.13.0) (2.26.1)\n", - "Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /usr/local/lib/python3.8/dist-packages (from google-cloud-storage==2.13.0) (2.15.0)\n", - "Requirement already satisfied: google-cloud-core<3.0dev,>=2.3.0 in /usr/local/lib/python3.8/dist-packages (from google-cloud-storage==2.13.0) (2.4.1)\n", - "Requirement already satisfied: google-resumable-media>=2.6.0 in /usr/local/lib/python3.8/dist-packages (from google-cloud-storage==2.13.0) (2.7.0)\n", - "Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /usr/local/lib/python3.8/dist-packages (from google-cloud-storage==2.13.0) (2.31.0)\n", - "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /usr/local/lib/python3.8/dist-packages (from google-cloud-storage==2.13.0) (1.5.0)\n", - "Requirement already satisfied: colorama<0.5.0,>=0.4.3 in /usr/local/lib/python3.8/dist-packages (from typer[All]==0.9.0) (0.4.6)\n", - "Requirement already satisfied: shellingham<2.0.0,>=1.3.0 in /usr/local/lib/python3.8/dist-packages (from typer[All]==0.9.0) (1.5.4)\n", - "Requirement already satisfied: rich<14.0.0,>=10.11.0 in /usr/local/lib/python3.8/dist-packages (from typer[All]==0.9.0) (13.7.0)\n", - "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /usr/local/lib/python3.8/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-storage==2.13.0) (1.62.0)\n", - "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0.dev0,>=3.19.5 in /usr/local/lib/python3.8/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-storage==2.13.0) (4.25.1)\n", - "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.8/dist-packages 
(from google-auth<3.0dev,>=2.23.3->google-cloud-storage==2.13.0) (5.3.2)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.8/dist-packages (from google-auth<3.0dev,>=2.23.3->google-cloud-storage==2.13.0) (0.3.0)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.8/dist-packages (from google-auth<3.0dev,>=2.23.3->google-cloud-storage==2.13.0) (4.9)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.8/dist-packages (from requests<3.0.0dev,>=2.18.0->google-cloud-storage==2.13.0) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0dev,>=2.18.0->google-cloud-storage==2.13.0) (2.8)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests<3.0.0dev,>=2.18.0->google-cloud-storage==2.13.0) (2.1.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests<3.0.0dev,>=2.18.0->google-cloud-storage==2.13.0) (2019.11.28)\n", - "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.8/dist-packages (from rich<14.0.0,>=10.11.0->typer[All]==0.9.0) (3.0.0)\n", - "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.8/dist-packages (from rich<14.0.0,>=10.11.0->typer[All]==0.9.0) (2.17.2)\n", - "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.8/dist-packages (from markdown-it-py>=2.2.0->rich<14.0.0,>=10.11.0->typer[All]==0.9.0) (0.1.2)\n", - "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.8/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.23.3->google-cloud-storage==2.13.0) (0.5.1)\n", - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], + "outputs": [], "source": [ "!pip3.8 install typer[All]==0.9.0 google-cloud-storage==2.13.0" ] @@ -81,75 +46,10 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "2a3cdb5b-4978-479e-85c7-c9dbac3e65fd", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " -+syyyyyyys:\n", - " `/yho:` -yd.\n", - " `/yh/` +m.\n", - " .oho. hy .`\n", - " .sh/` :N` `-/o` `+dyyo:.\n", - " .yh:` `M- `-/osysoym :hs` `-+sys: hhyssssssssy+\n", - " .sh:` `N: ms/-`` yy.yh- -hy. `.N-````````+N.\n", - " `od/` `N- -/oM- ddd+` `sd: hNNm -N:\n", - " :do` .M. dMMM- `ms. /d+` `NMMs `do\n", - " .yy- :N` ```mMMM. - -hy. /MMM: yh\n", - " `+d+` `:/oo/` `-/osyh/ossssssdNMM` .sh: yMMN` /m.\n", - " -dh- :ymNMMMMy `-/shmNm-`:N/-.`` `.sN /N- `NMMy .m/\n", - " `oNs` -hysosmMMMMydmNmds+-.:ohm : sd` :MMM/ yy\n", - " .hN+ /d: -MMMmhs/-.` .MMMh .ss+- `yy` sMMN` :N.\n", - " :mN/ `N/ `o/-` :MMMo +MMMN- .` `ds mMMh do\n", - " /NN/ `N+....--:/+oooosooo+:sMMM: hMMMM: `my .m+ -MMM+ :N.\n", - " /NMo -+ooooo+/:-....`...:+hNMN. `NMMMd` .MM/ -m: oMMN. hs\n", - " -NMd` :mm -MMMm- .s/ -MMm. /m- mMMd -N.\n", - " `mMM/ .- /MMh. -dMo -MMMy od. .MMMs..---yh\n", - " +MMM. sNo`.sNMM+ :MMMM/ sh`+MMMNmNm+++-\n", - " mMMM- /--ohmMMM+ :MMMMm. `hyymmmdddo\n", - " MMMMh. ```` `-+yy/`yMMM/ :MMMMMy -sm:.``..-:-.`\n", - " dMMMMmo-.``````..-:/osyhddddho. `+shdh+. 
hMMM: :MmMMMM/ ./yy/` `:sys+/+sh/\n", - " .dMMMMMMmdddddmmNMMMNNNNNMMMMMs sNdo- dMMM- `-/yd/MMMMm-:sy+. :hs- /N`\n", - " `/ymNNNNNNNmmdys+/::----/dMMm: +m- mMMM+ohmo/.` sMMMMdo- .om: `sh\n", - " `.-----+/.` `.-+hh/` `od. NMMNmds/ `mmy:` +mMy `:yy.\n", - " /moyso+//+ossso:. .yy` `dy+:` .. :MMMN+---/oys:\n", - " /+m: `.-:::-` /d+ +MMMMMMMNh:`\n", - " +MN/ -yh. `+hddhy+.\n", - " /MM+ .sh:\n", - " :NMo -sh/\n", - " -NMs `/yy:\n", - " .NMy `:sh+.\n", - " `mMm` ./yds-\n", - " `dMMMmyo:-.````.-:oymNy:`\n", - " +NMMMMMMMMMMMMMMMMms:`\n", - " -+shmNMMMNmdy+:`\n", - "\n", - "\n", - " Now attempting installation...\n", - "\n", - "\n", - "Looking for a previous installation of SDKMAN...\n", - "SDKMAN found.\n", - "\n", - "======================================================================================================\n", - " You already have SDKMAN installed.\n", - " SDKMAN was found at:\n", - "\n", - " /root/.sdkman\n", - "\n", - " Please consider running the following if you need to upgrade.\n", - "\n", - " $ sdk selfupdate force\n", - "\n", - "======================================================================================================\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "curl -s https://get.sdkman.io | bash" @@ -165,21 +65,10 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "275488e6-68e3-4f42-a3ca-060b286bade8", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\u001b[1;33mjava 8.0.392-amzn is already installed.\u001b[0m\n", - "\n", - "\u001b[1;32mUsing java version 8.0.392-amzn in this shell.\u001b[0m\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "source \"$HOME/.sdkman/bin/sdkman-init.sh\"\n", @@ -197,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 5, "id": "898835b7-c347-49dd-bc0f-ca845375e1e5", "metadata": {}, "outputs": [ @@ -227,10 +116,24 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 6, "id": "7481d445-4ea6-40cc-8320-5520fe8d4dd4", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "fatal: destination path 'tbd-tpc-di' already exists and is not an empty directory.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Already up to date.\n" + ] + }, { "name": "stderr", "output_type": "stream", @@ -242,6 +145,7 @@ "name": "stdout", "output_type": "stream", "text": [ + "D\ttests/fact_watches_dates_proper_relation.sql\n", "Your branch is up to date with 'origin/notebook'.\n" ] } @@ -249,9 +153,9 @@ "source": [ "%%bash\n", "mkdir -p git && cd git\n", - "# git clone https://github.com/thai-chicken/tbd-tpc-di.git\n", + "git clone https://github.com/thai-chicken/tbd-tpc-di.git\n", "cd tbd-tpc-di\n", - "# git pull\n", + "git pull\n", "git checkout notebook" ] }, @@ -280,18 +184,18 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "9a24c0aa-bff7-4831-869d-3183e7373a94", - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "id": "1f948b2f-3c22-46a3-988f-23e97fa00668", + "metadata": { + "tags": [] + }, "source": [ - "# Install and setup JVM 11" + "## Install and setup JVM 11" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 8, "id": "62e466e8-41aa-4afc-8392-5675d36adcf4", "metadata": {}, "outputs": [ @@ -323,314 +227,101 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 9, "id": "259b72bc-2efd-405b-883f-618a9772bf22", "metadata": {}, "outputs": [ + { 
+ "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = jar:file:/usr/local/lib/python3.8/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "Setting default log level to \"WARN\".\n", - "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + "Ivy Default Cache set to: /root/.ivy2/cache\n", + "The jars for the packages stored in: /root/.ivy2/jars\n", + "com.databricks#spark-xml_2.12 added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-8f25649b-c864-4cb5-8879-1409c34c6e00;1.0\n", + "\tconfs: [default]\n", + "\tfound com.databricks#spark-xml_2.12;0.17.0 in central\n", + "\tfound commons-io#commons-io;2.11.0 in central\n", + "\tfound org.glassfish.jaxb#txw2;3.0.2 in central\n", + "\tfound org.apache.ws.xmlschema#xmlschema-core;2.3.0 in central\n", + "\tfound org.scala-lang.modules#scala-collection-compat_2.12;2.9.0 in central\n", + ":: resolution report :: resolve 694ms :: artifacts dl 22ms\n", + "\t:: modules in use:\n", + "\tcom.databricks#spark-xml_2.12;0.17.0 from central in [default]\n", + "\tcommons-io#commons-io;2.11.0 from central in [default]\n", + "\torg.apache.ws.xmlschema#xmlschema-core;2.3.0 from central in [default]\n", + "\torg.glassfish.jaxb#txw2;3.0.2 from central in [default]\n", + "\torg.scala-lang.modules#scala-collection-compat_2.12;2.9.0 from central in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 5 | 0 | 0 | 0 || 5 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-8f25649b-c864-4cb5-8879-1409c34c6e00\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 5 already retrieved (0kB/19ms)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "24/01/04 13:06:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", - "24/01/04 13:06:53 WARN FileStreamSink: Assume no metadata directory. 
Error while looking for metadata directory in the path: gs://tbd-2023z-304098-data/tpc-di/Date.txt.\n", - "java.lang.RuntimeException: java.lang.ClassNotFoundException: Class com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem not found\n", - "\tat org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2688)\n", - "\tat org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3431)\n", - "\tat org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)\n", - "\tat org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)\n", - "\tat org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)\n", - "\tat org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)\n", - "\tat org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)\n", - "\tat org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)\n", - "\tat org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:53)\n", - "\tat org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:370)\n", - "\tat org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:228)\n", - "\tat org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:210)\n", - "\tat scala.Option.getOrElse(Option.scala:189)\n", - "\tat org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:210)\n", - "\tat org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:185)\n", - "\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n", - "\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n", - "\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n", - "\tat java.lang.reflect.Method.invoke(Method.java:498)\n", - "\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n", - "\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n", - "\tat py4j.Gateway.invoke(Gateway.java:282)\n", - "\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n", - "\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n", - "\tat py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)\n", - "\tat py4j.ClientServerConnection.run(ClientServerConnection.java:106)\n", - "\tat java.lang.Thread.run(Thread.java:750)\n", - "Caused by: java.lang.ClassNotFoundException: Class com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem not found\n", - "\tat org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2592)\n", - "\tat org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2686)\n", - "\t... 26 more\n" + "24/01/04 22:12:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[31m╭─\u001b[0m\u001b[31m────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m─────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2;33m/home/jupyter/git/tbd-tpc-di/\u001b[0m\u001b[1;33mtpcdi.py\u001b[0m:\u001b[94m139\u001b[0m in \u001b[92mprocess_files\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m136 \u001b[0m\u001b[2m│ │ │ \u001b[0mStructField(\u001b[33m\"\u001b[0m\u001b[33mFISCAL_QTR_DESC\u001b[0m\u001b[33m\"\u001b[0m, StringType(), \u001b[94mFalse\u001b[0m), \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m137 \u001b[0m\u001b[2m│ │ │ \u001b[0mStructField(\u001b[33m\"\u001b[0m\u001b[33mHOLIDAY_FLAG\u001b[0m\u001b[33m\"\u001b[0m, BooleanType(), \u001b[94mFalse\u001b[0m), \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m138 \u001b[0m\u001b[2m│ │ \u001b[0m]) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m139 \u001b[2m│ │ \u001b[0mload_csv(schema, con_file_name, \u001b[33m'\u001b[0m\u001b[33mdate\u001b[0m\u001b[33m'\u001b[0m) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m140 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m141 \u001b[0m\u001b[2m│ \u001b[0mcon_file_name = \u001b[33m'\u001b[0m\u001b[33mDailyMarket.txt\u001b[0m\u001b[33m'\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m142 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mif\u001b[0m file_name \u001b[95min\u001b[0m [\u001b[33m'\u001b[0m\u001b[33mall\u001b[0m\u001b[33m'\u001b[0m, con_file_name]: \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╭─\u001b[0m\u001b[33m────────────────────────────────\u001b[0m\u001b[33m locals \u001b[0m\u001b[33m────────────────────────────────\u001b[0m\u001b[33m─╮\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m batch = \u001b[94m1\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m con_file_name = \u001b[33m'Date.txt'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m file_name = \u001b[33m'all'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m get_stage_path = \u001b[1m<\u001b[0m\u001b[1;95mfunction\u001b[0m\u001b[39m process_files..get_stage_path at \u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94m0x7f55cfdf6430\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m load_csv = \u001b[1m<\u001b[0m\u001b[1;95mfunction\u001b[0m\u001b[39m process_files..load_csv at \u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94m0x7f55cfdf64c0\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m output_directory = \u001b[33m'/tmp/tpc-di'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m overwrite = \u001b[94mFalse\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m save_df = 
\u001b[1m<\u001b[0m\u001b[1;95mfunction\u001b[0m\u001b[39m process_files..save_df at \u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94m0x7f55cfdf6550\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m schema = \u001b[1;35mStructType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m[\u001b[0m\u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'SK_DATE_ID'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'DATE_VALUE'\u001b[0m, \u001b[1;35mDateType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'DATE_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_YEAR_ID'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_YEAR_DESC'\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_QTR_ID'\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_QTR_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_MONTH_ID'\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_MONTH_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_WEEK_ID'\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, 
\u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_WEEK_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'DAY_OF_WEEK_NUM'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'DAY_OF_WEEK_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'FISCAL_YEAR_ID'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'FISCAL_YEAR_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'FISCAL_QTR_ID'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'FISCAL_QTR_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'HOLIDAY_FLAG'\u001b[0m, \u001b[1;35mBooleanType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m\u001b[1m)\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m session = \u001b[1m<\u001b[0m\u001b[1;95mpyspark.sql.session.SparkSession\u001b[0m\u001b[39m object at \u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94m0x7f55cfeadf70\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m show = \u001b[94mFalse\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m skip_upload = \u001b[94mFalse\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m stage = \u001b[33m'tbd-2023z-304098-data'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m upload_files = \u001b[1m<\u001b[0m\u001b[1;95mfunction\u001b[0m\u001b[39m process_files..upload_files at \u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m 
\u001b[94m0x7f55cfdf65e0\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╰──────────────────────────────────────────────────────────────────────────╯\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2;33m/home/jupyter/git/tbd-tpc-di/\u001b[0m\u001b[1;33mtpcdi.py\u001b[0m:\u001b[94m106\u001b[0m in \u001b[92mload_csv\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m103 \u001b[0m\u001b[2m│ │ \u001b[0mdelimiter = upload_files(file_name, stage) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m104 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m105 \u001b[0m\u001b[2m│ │ \u001b[0mdf = ( \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m106 \u001b[2m│ │ │ \u001b[0msession \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m107 \u001b[0m\u001b[2m│ │ │ \u001b[0m.read \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m108 \u001b[0m\u001b[2m│ │ │ \u001b[0m.format(\u001b[33m\"\u001b[0m\u001b[33mcsv\u001b[0m\u001b[33m\"\u001b[0m) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m109 \u001b[0m\u001b[2m│ │ │ \u001b[0m.schema(schema) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╭─\u001b[0m\u001b[33m────────────────────────────────\u001b[0m\u001b[33m locals \u001b[0m\u001b[33m────────────────────────────────\u001b[0m\u001b[33m─╮\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m delimiter = \u001b[33m'|'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m file_name = \u001b[33m'Date.txt'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m get_stage_path = \u001b[1m<\u001b[0m\u001b[1;95mfunction\u001b[0m\u001b[39m process_files..get_stage_path at \u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94m0x7f55cfdf6430\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m save_df = \u001b[1m<\u001b[0m\u001b[1;95mfunction\u001b[0m\u001b[39m process_files..save_df at \u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94m0x7f55cfdf6550\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m schema = \u001b[1;35mStructType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m[\u001b[0m\u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'SK_DATE_ID'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'DATE_VALUE'\u001b[0m, \u001b[1;35mDateType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'DATE_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m 
\u001b[33m│\u001b[0m \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_YEAR_ID'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_YEAR_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_QTR_ID'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_QTR_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_MONTH_ID'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_MONTH_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_WEEK_ID'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'CALENDAR_WEEK_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'DAY_OF_WEEK_NUM'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'DAY_OF_WEEK_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'FISCAL_YEAR_ID'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'FISCAL_YEAR_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m 
\u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'FISCAL_QTR_ID'\u001b[0m, \u001b[1;35mIntegerType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'FISCAL_QTR_DESC'\u001b[0m, \u001b[1;35mStringType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m, \u001b[1;35mStructField\u001b[0m\u001b[1m(\u001b[0m\u001b[33m'HOLIDAY_FLAG'\u001b[0m, \u001b[1;35mBooleanType\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94mFalse\u001b[0m\u001b[1m)\u001b[0m\u001b[1m]\u001b[0m\u001b[1m)\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m session = \u001b[1m<\u001b[0m\u001b[1;95mpyspark.sql.session.SparkSession\u001b[0m\u001b[39m object at \u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94m0x7f55cfeadf70\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m stage = \u001b[33m'tbd-2023z-304098-data'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m stage_path = \u001b[33m'gs://tbd-2023z-304098-data/tpc-di/Date.txt'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m table_name = \u001b[33m'date'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m upload_files = \u001b[1m<\u001b[0m\u001b[1;95mfunction\u001b[0m\u001b[39m process_files..upload_files at \u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94m0x7f55cfdf65e0\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╰──────────────────────────────────────────────────────────────────────────╯\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.8/dist-packages/pyspark/sql/\u001b[0m\u001b[1;33mreadwriter.py\u001b[0m:\u001b[94m177\u001b[0m in \u001b[92mload\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m 174 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m.schema(schema) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m 175 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.options(**options) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m 176 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96misinstance\u001b[0m(path, \u001b[96mstr\u001b[0m): \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 177 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._df(\u001b[96mself\u001b[0m._jreader.load(path)) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m 178 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melif\u001b[0m path \u001b[95mis\u001b[0m \u001b[95mnot\u001b[0m \u001b[94mNone\u001b[0m: \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m 179 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mif\u001b[0m 
\u001b[96mtype\u001b[0m(path) != \u001b[96mlist\u001b[0m: \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m 180 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mpath = [path] \u001b[2m# type: ignore[list-item]\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╭─\u001b[0m\u001b[33m────────────────────────────────\u001b[0m\u001b[33m locals \u001b[0m\u001b[33m────────────────────────────────\u001b[0m\u001b[33m─╮\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m format = \u001b[94mNone\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m options = \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m path = \u001b[33m'gs://tbd-2023z-304098-data/tpc-di/Date.txt'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m schema = \u001b[94mNone\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m self = \u001b[1m<\u001b[0m\u001b[1;95mpyspark.sql.readwriter.DataFrameReader\u001b[0m\u001b[39m object at \u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[94m0x7f55cfdfaa00\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╰──────────────────────────────────────────────────────────────────────────╯\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.8/dist-packages/py4j/\u001b[0m\u001b[1;33mjava_gateway.py\u001b[0m:\u001b[94m1321\u001b[0m in \u001b[92m__call__\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m1318 \u001b[0m\u001b[2m│ │ │ \u001b[0mproto.END_COMMAND_PART \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m1319 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m1320 \u001b[0m\u001b[2m│ │ \u001b[0manswer = \u001b[96mself\u001b[0m.gateway_client.send_command(command) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1321 \u001b[2m│ │ \u001b[0mreturn_value = get_return_value( \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m1322 \u001b[0m\u001b[2m│ │ │ \u001b[0manswer, \u001b[96mself\u001b[0m.gateway_client, \u001b[96mself\u001b[0m.target_id, \u001b[96mself\u001b[0m.name) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m1323 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m1324 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mfor\u001b[0m temp_arg \u001b[95min\u001b[0m temp_args: \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╭─\u001b[0m\u001b[33m────────────────────────────────\u001b[0m\u001b[33m locals \u001b[0m\u001b[33m────────────────────────────────\u001b[0m\u001b[33m─╮\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m answer = \u001b[33m'xro46'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m args = \u001b[1m(\u001b[0m\u001b[33m'gs://tbd-2023z-304098-data/tpc-di/Date.txt'\u001b[0m,\u001b[1m)\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m args_command = 
\u001b[33m'sgs://tbd-2023z-304098-data/tpc-di/Date.txt\\n'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m command = \u001b[33m'c\\no45\\nload\\nsgs://tbd-2023z-304098-data/tpc-di/Date.t…\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m self = \u001b[1m<\u001b[0m\u001b[1;95mpy4j.java_gateway.JavaMember\u001b[0m\u001b[39m object at \u001b[0m\u001b[94m0x7f55cf52b2b0\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m temp_args = \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╰──────────────────────────────────────────────────────────────────────────╯\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.8/dist-packages/pyspark/sql/\u001b[0m\u001b[1;33mutils.py\u001b[0m:\u001b[94m190\u001b[0m in \u001b[92mdeco\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m187 \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mcapture_sql_exception\u001b[0m(f: Callable[..., Any]) -> Callable[..., Any] \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m188 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mdeco\u001b[0m(*a: Any, **kw: Any) -> Any: \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m189 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m190 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m f(*a, **kw) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m191 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mexcept\u001b[0m Py4JJavaError \u001b[94mas\u001b[0m e: \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m192 \u001b[0m\u001b[2m│ │ │ \u001b[0mconverted = convert_exception(e.java_exception) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m193 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m \u001b[96misinstance\u001b[0m(converted, UnknownException): \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╭─\u001b[0m\u001b[33m────────────────────────────────\u001b[0m\u001b[33m locals \u001b[0m\u001b[33m────────────────────────────────\u001b[0m\u001b[33m─╮\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m a = \u001b[1m(\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[2m│ \u001b[0m\u001b[33m'xro46'\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[2m│ \u001b[0m\u001b[1m<\u001b[0m\u001b[1;95mpy4j.clientserver.JavaClient\u001b[0m\u001b[39m object at \u001b[0m\u001b[94m0x7f55cfead4c0\u001b[0m\u001b[1m>\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[2m│ \u001b[0m\u001b[33m'o45'\u001b[0m, \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[2m│ \u001b[0m\u001b[33m'load'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m \u001b[1m)\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m converted = \u001b[1;35mUnknownException\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m 
\u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m f = \u001b[1m<\u001b[0m\u001b[1;95mfunction\u001b[0m\u001b[39m get_return_value at \u001b[0m\u001b[94m0x7f55d6c7a5e0\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m kw = \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╰──────────────────────────────────────────────────────────────────────────╯\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.8/dist-packages/py4j/\u001b[0m\u001b[1;33mprotocol.py\u001b[0m:\u001b[94m326\u001b[0m in \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[92mget_return_value\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m323 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[96mtype\u001b[0m = answer[\u001b[94m1\u001b[0m] \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m324 \u001b[0m\u001b[2m│ │ │ \u001b[0mvalue = OUTPUT_CONVERTER[\u001b[96mtype\u001b[0m](answer[\u001b[94m2\u001b[0m:], gateway_client) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m325 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mif\u001b[0m answer[\u001b[94m1\u001b[0m] == REFERENCE_TYPE: \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m326 \u001b[2m│ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m Py4JJavaError( \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m327 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[33m\"\u001b[0m\u001b[33mAn error occurred while calling \u001b[0m\u001b[33m{0}\u001b[0m\u001b[33m{1}\u001b[0m\u001b[33m{2}\u001b[0m\u001b[33m.\u001b[0m\u001b[33m\\n\u001b[0m\u001b[33m\"\u001b[0m. 
\u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m328 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[96mformat\u001b[0m(target_id, \u001b[33m\"\u001b[0m\u001b[33m.\u001b[0m\u001b[33m\"\u001b[0m, name), value) \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[2m329 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╭─\u001b[0m\u001b[33m────────────────────────────────\u001b[0m\u001b[33m locals \u001b[0m\u001b[33m────────────────────────────────\u001b[0m\u001b[33m─╮\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m answer = \u001b[33m'xro46'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m gateway_client = \u001b[1m<\u001b[0m\u001b[1;95mpy4j.clientserver.JavaClient\u001b[0m\u001b[39m object at \u001b[0m\u001b[94m0x7f55cfead4c0\u001b[0m\u001b[1m>\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m name = \u001b[33m'load'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m target_id = \u001b[33m'o45'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m type = \u001b[33m'r'\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m│\u001b[0m value = JavaObject \u001b[33mid\u001b[0m=\u001b[35mo46\u001b[0m \u001b[33m│\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m│\u001b[0m \u001b[33m╰──────────────────────────────────────────────────────────────────────────╯\u001b[0m \u001b[31m│\u001b[0m\n", - "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", - "\u001b[1;91mPy4JJavaError: \u001b[0mAn error occurred while calling o45.load.\n", - ": java.lang.RuntimeException: java.lang.ClassNotFoundException: Class \n", - "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem not found\n", - " at \n", - "\u001b[1;35morg.apache.hadoop.conf.Configuration.getClass\u001b[0m\u001b[1m(\u001b[0mConfiguration.jav\u001b[1;92ma:2688\u001b[0m\u001b[1m)\u001b[0m\n", - " at \n", - "\u001b[1;35morg.apache.hadoop.fs.FileSystem.getFileSystemClass\u001b[0m\u001b[1m(\u001b[0mFileSystem.jav\u001b[1;92ma:3431\u001b[0m\u001b[1m)\u001b[0m\n", - " at \n", - "\u001b[1;35morg.apache.hadoop.fs.FileSystem.createFileSystem\u001b[0m\u001b[1m(\u001b[0mFileSystem.jav\u001b[1;92ma:3466\u001b[0m\u001b[1m)\u001b[0m\n", - " at org.apache.hadoop.fs.FileSystem.access$\u001b[1;35m300\u001b[0m\u001b[1m(\u001b[0mFileSystem.jav\u001b[1;92ma:174\u001b[0m\u001b[1m)\u001b[0m\n", - " at \n", - "org.apache.hadoop.fs.FileSystem$\u001b[1;35mCache.getInternal\u001b[0m\u001b[1m(\u001b[0mFileSystem.jav\u001b[1;92ma:3574\u001b[0m\u001b[1m)\u001b[0m\n", - " at org.apache.hadoop.fs.FileSystem$\u001b[1;35mCache.get\u001b[0m\u001b[1m(\u001b[0mFileSystem.jav\u001b[1;92ma:3521\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35morg.apache.hadoop.fs.FileSystem.get\u001b[0m\u001b[1m(\u001b[0mFileSystem.jav\u001b[1;92ma:540\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35morg.apache.hadoop.fs.Path.getFileSystem\u001b[0m\u001b[1m(\u001b[0mPath.jav\u001b[1;92ma:365\u001b[0m\u001b[1m)\u001b[0m\n", - " at \n", - "org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPath\n", - "IfNecessary$\u001b[1;35m1\u001b[0m\u001b[1m(\u001b[0mDataSource.scal\u001b[1;92ma:752\u001b[0m\u001b[1m)\u001b[0m\n", - " at 
\u001b[1;35mscala.collection.immutable.List.map\u001b[0m\u001b[1m(\u001b[0mList.scal\u001b[1;92ma:293\u001b[0m\u001b[1m)\u001b[0m\n", - " at \n", - "org.apache.spark.sql.execution.datasources.DataSource$\u001b[1;35m.checkAndGlobPathIfNecessa\u001b[0m\n", - "\u001b[1;35mry\u001b[0m\u001b[1m(\u001b[0mDataSource.scal\u001b[1;92ma:750\u001b[0m\u001b[1m)\u001b[0m\n", - " at \n", - "\u001b[1;35morg.apache.spark.sql.execution.datasources.DataSource.checkAndGlobPathIfNecessar\u001b[0m\n", - "\u001b[1;35my\u001b[0m\u001b[1m(\u001b[0mDataSource.scal\u001b[1;92ma:579\u001b[0m\u001b[1m)\u001b[0m\n", - " at \n", - "\u001b[1;35morg.apache.spark.sql.execution.datasources.DataSource.resolveRelation\u001b[0m\u001b[1m(\u001b[0mDataSource\n", - ".scal\u001b[1;92ma:408\u001b[0m\u001b[1m)\u001b[0m\n", - " at \n", - "\u001b[1;35morg.apache.spark.sql.DataFrameReader.loadV1Source\u001b[0m\u001b[1m(\u001b[0mDataFrameReader.scal\u001b[1;92ma:228\u001b[0m\u001b[1m)\u001b[0m\n", - " at \n", - "org.apache.spark.sql.DataFrameReader.$anonfun$load$\u001b[1;35m2\u001b[0m\u001b[1m(\u001b[0mDataFrameReader.scal\u001b[1;92ma:210\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35mscala.Option.getOrElse\u001b[0m\u001b[1m(\u001b[0mOption.scal\u001b[1;92ma:189\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35morg.apache.spark.sql.DataFrameReader.load\u001b[0m\u001b[1m(\u001b[0mDataFrameReader.scal\u001b[1;92ma:210\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35morg.apache.spark.sql.DataFrameReader.load\u001b[0m\u001b[1m(\u001b[0mDataFrameReader.scal\u001b[1;92ma:185\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35msun.reflect.NativeMethodAccessorImpl.invoke0\u001b[0m\u001b[1m(\u001b[0mNative Method\u001b[1m)\u001b[0m\n", - " at \n", - "\u001b[1;35msun.reflect.NativeMethodAccessorImpl.invoke\u001b[0m\u001b[1m(\u001b[0mNativeMethodAccessorImpl.jav\u001b[1;92ma:62\u001b[0m\u001b[1m)\u001b[0m\n", - " at \n", - "\u001b[1;35msun.reflect.DelegatingMethodAccessorImpl.invoke\u001b[0m\u001b[1m(\u001b[0mDelegatingMethodAccessorImpl.jav\n", - "\u001b[1;92ma:43\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35mjava.lang.reflect.Method.invoke\u001b[0m\u001b[1m(\u001b[0mMethod.jav\u001b[1;92ma:498\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35mpy4j.reflection.MethodInvoker.invoke\u001b[0m\u001b[1m(\u001b[0mMethodInvoker.jav\u001b[1;92ma:244\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35mpy4j.reflection.ReflectionEngine.invoke\u001b[0m\u001b[1m(\u001b[0mReflectionEngine.jav\u001b[1;92ma:357\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35mpy4j.Gateway.invoke\u001b[0m\u001b[1m(\u001b[0mGateway.jav\u001b[1;92ma:282\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35mpy4j.commands.AbstractCommand.invokeMethod\u001b[0m\u001b[1m(\u001b[0mAbstractCommand.jav\u001b[1;92ma:132\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35mpy4j.commands.CallCommand.execute\u001b[0m\u001b[1m(\u001b[0mCallCommand.jav\u001b[1;92ma:79\u001b[0m\u001b[1m)\u001b[0m\n", - " at \n", - "\u001b[1;35mpy4j.ClientServerConnection.waitForCommands\u001b[0m\u001b[1m(\u001b[0mClientServerConnection.jav\u001b[1;92ma:182\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35mpy4j.ClientServerConnection.run\u001b[0m\u001b[1m(\u001b[0mClientServerConnection.jav\u001b[1;92ma:106\u001b[0m\u001b[1m)\u001b[0m\n", - " at \u001b[1;35mjava.lang.Thread.run\u001b[0m\u001b[1m(\u001b[0mThread.jav\u001b[1;92ma:750\u001b[0m\u001b[1m)\u001b[0m\n", - "Caused by: java.lang.ClassNotFoundException: Class \n", - "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem not found\n", - " at \n", - 
"\u001b[1;35morg.apache.hadoop.conf.Configuration.getClassByName\u001b[0m\u001b[1m(\u001b[0mConfiguration.jav\u001b[1;92ma:2592\u001b[0m\u001b[1m)\u001b[0m\n", - " at \n", - "\u001b[1;35morg.apache.hadoop.conf.Configuration.getClass\u001b[0m\u001b[1m(\u001b[0mConfiguration.jav\u001b[1;92ma:2686\u001b[0m\u001b[1m)\u001b[0m\n", - " \u001b[33m...\u001b[0m \u001b[1;36m29\u001b[0m more\n", - "\n" + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" ] }, { - "ename": "CalledProcessError", - "evalue": "Command 'b'source \"$HOME/.sdkman/bin/sdkman-init.sh\"\\ncd $REPO_ROOT\\npython3.8 tpcdi.py --output-directory $GEN_OUTPUT_DIR --stage $DATA_BUCKET\\n'' returned non-zero exit status 1.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[38], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_cell_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbash\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msource \u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m$HOME/.sdkman/bin/sdkman-init.sh\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43mcd $REPO_ROOT\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43mpython3.8 tpcdi.py --output-directory $GEN_OUTPUT_DIR --stage $DATA_BUCKET\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:2478\u001b[0m, in \u001b[0;36mInteractiveShell.run_cell_magic\u001b[0;34m(self, magic_name, line, cell)\u001b[0m\n\u001b[1;32m 2476\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[1;32m 2477\u001b[0m args \u001b[38;5;241m=\u001b[39m (magic_arg_s, cell)\n\u001b[0;32m-> 2478\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2480\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2481\u001b[0m \u001b[38;5;66;03m# when using magics with decodator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2482\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2483\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", - "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:153\u001b[0m, in \u001b[0;36mScriptMagics._make_script_magic..named_script_magic\u001b[0;34m(line, cell)\u001b[0m\n\u001b[1;32m 151\u001b[0m 
\u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 152\u001b[0m line \u001b[38;5;241m=\u001b[39m script\n\u001b[0;32m--> 153\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshebang\u001b[49m\u001b[43m(\u001b[49m\u001b[43mline\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcell\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:305\u001b[0m, in \u001b[0;36mScriptMagics.shebang\u001b[0;34m(self, line, cell)\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m args\u001b[38;5;241m.\u001b[39mraise_error \u001b[38;5;129;01mand\u001b[39;00m p\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 301\u001b[0m \u001b[38;5;66;03m# If we get here and p.returncode is still None, we must have\u001b[39;00m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;66;03m# killed it but not yet seen its return code. We don't wait for it,\u001b[39;00m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;66;03m# in case it's stuck in uninterruptible sleep. -9 = SIGKILL\u001b[39;00m\n\u001b[1;32m 304\u001b[0m rc \u001b[38;5;241m=\u001b[39m p\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m9\u001b[39m\n\u001b[0;32m--> 305\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CalledProcessError(rc, cell)\n", - "\u001b[0;31mCalledProcessError\u001b[0m: Command 'b'source \"$HOME/.sdkman/bin/sdkman-init.sh\"\\ncd $REPO_ROOT\\npython3.8 tpcdi.py --output-directory $GEN_OUTPUT_DIR --stage $DATA_BUCKET\\n'' returned non-zero exit status 1." + "name": "stdout", + "output_type": "stream", + "text": [ + "24/01/04 22:12:15 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.\n", + "24/01/04 22:12:16 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n", + "24/01/04 22:12:24 WARN Client: Same path resource file:///root/.ivy2/jars/com.databricks_spark-xml_2.12-0.17.0.jar added multiple times to distributed cache.\n", + "24/01/04 22:12:24 WARN Client: Same path resource file:///root/.ivy2/jars/commons-io_commons-io-2.11.0.jar added multiple times to distributed cache.\n", + "24/01/04 22:12:24 WARN Client: Same path resource file:///root/.ivy2/jars/org.glassfish.jaxb_txw2-3.0.2.jar added multiple times to distributed cache.\n", + "24/01/04 22:12:24 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.ws.xmlschema_xmlschema-core-2.3.0.jar added multiple times to distributed cache.\n", + "24/01/04 22:12:24 WARN Client: Same path resource file:///root/.ivy2/jars/org.scala-lang.modules_scala-collection-compat_2.12-2.9.0.jar added multiple times to distributed cache.\n", + "24/01/04 22:12:50 WARN HiveClientImpl: Detected HiveConf hive.execution.engine is 'tez' and will be reset to 'mr' to disable useless hive logic\n", + "24/01/04 22:14:24 WARN package: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", + "DATE table created.\n", + "DAILY_MARKET table created.\n", + "INDUSTRY table created.\n", + "PROSPECT table created.\n", + "CUSTOMER_MGMT table created.\n", + "TAX_RATE table created.\n", + "HR table created.\n", + "WATCH_HISTORY table created.\n", + "TRADE table created.\n", + "TRADE_HISTORY table created.\n", + "STATUS_TYPE table created.\n", + "TRADE_TYPE table created.\n", + "HOLDING_HISTORY table created.\n", + "CASH_TRANSACTION table created.\n", + "CMP table created.\n", + "SEC table created.\n", + "FIN table created.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" ] } ], @@ -651,35 +342,242 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 10, "id": "98d0642c-b5c9-4aa8-996f-40f82e3c90e0", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0m22:18:19 Running with dbt=1.7.3\n", + "\u001b[0m22:18:19 Installing dbt-labs/dbt_utils\n", + "\u001b[0m22:18:20 Installed from version 1.1.1\n", + "\u001b[0m22:18:20 Up to date!\n", + "\u001b[0m22:18:24 Running with dbt=1.7.3\n", + "\u001b[0m22:18:25 Registered adapter: spark=1.7.1\n", + "\u001b[0m22:18:25 Found 44 models, 4 tests, 17 sources, 0 exposures, 0 metrics, 553 macros, 0 groups, 0 semantic models\n", + "\u001b[0m22:18:25 \n", + ":: loading settings :: url = jar:file:/usr/local/lib/python3.8/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "bash: line 2: dbt: command not found\n" + "Ivy Default Cache set to: /root/.ivy2/cache\n", + "The jars for the packages stored in: /root/.ivy2/jars\n", + "com.databricks#spark-xml_2.12 added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-22f21f88-292d-4b32-aed2-323dee8b367e;1.0\n", + "\tconfs: [default]\n", + "\tfound com.databricks#spark-xml_2.12;0.17.0 in central\n", + "\tfound commons-io#commons-io;2.11.0 in central\n", + "\tfound org.glassfish.jaxb#txw2;3.0.2 in central\n", + "\tfound org.apache.ws.xmlschema#xmlschema-core;2.3.0 in central\n", + "\tfound org.scala-lang.modules#scala-collection-compat_2.12;2.9.0 in central\n", + ":: resolution report :: resolve 571ms :: artifacts dl 20ms\n", + "\t:: modules in use:\n", + "\tcom.databricks#spark-xml_2.12;0.17.0 from central in [default]\n", + "\tcommons-io#commons-io;2.11.0 from central in [default]\n", + "\torg.apache.ws.xmlschema#xmlschema-core;2.3.0 from central in [default]\n", + "\torg.glassfish.jaxb#txw2;3.0.2 from central in [default]\n", + "\torg.scala-lang.modules#scala-collection-compat_2.12;2.9.0 from central in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 5 | 0 | 0 | 0 || 5 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-22f21f88-292d-4b32-aed2-323dee8b367e\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 5 already retrieved (0kB/12ms)\n", + "WARNING: An illegal reflective access operation has occurred\n", + "WARNING: Illegal reflective access by org.apache.hadoop.shaded.org.xbill.DNS.ResolverConfig 
(file:/usr/local/lib/python3.8/dist-packages/pyspark/jars/hadoop-client-runtime-3.3.2.jar) to method sun.net.dns.ResolverConfiguration.open()\n", + "WARNING: Please consider reporting this to the maintainers of org.apache.hadoop.shaded.org.xbill.DNS.ResolverConfig\n", + "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", + "WARNING: All illegal access operations will be denied in a future release\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/01/04 22:18:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" ] }, { - "ename": "CalledProcessError", - "evalue": "Command 'b'cd $REPO_ROOT\\ndbt run\\n'' returned non-zero exit status 127.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[39], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_cell_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbash\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mcd $REPO_ROOT\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43mdbt run\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:2478\u001b[0m, in \u001b[0;36mInteractiveShell.run_cell_magic\u001b[0;34m(self, magic_name, line, cell)\u001b[0m\n\u001b[1;32m 2476\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[1;32m 2477\u001b[0m args \u001b[38;5;241m=\u001b[39m (magic_arg_s, cell)\n\u001b[0;32m-> 2478\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2480\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2481\u001b[0m \u001b[38;5;66;03m# when using magics with decodator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2482\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2483\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n", - "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:153\u001b[0m, in \u001b[0;36mScriptMagics._make_script_magic..named_script_magic\u001b[0;34m(line, cell)\u001b[0m\n\u001b[1;32m 151\u001b[0m 
\u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 152\u001b[0m line \u001b[38;5;241m=\u001b[39m script\n\u001b[0;32m--> 153\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshebang\u001b[49m\u001b[43m(\u001b[49m\u001b[43mline\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcell\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:305\u001b[0m, in \u001b[0;36mScriptMagics.shebang\u001b[0;34m(self, line, cell)\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m args\u001b[38;5;241m.\u001b[39mraise_error \u001b[38;5;129;01mand\u001b[39;00m p\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 301\u001b[0m \u001b[38;5;66;03m# If we get here and p.returncode is still None, we must have\u001b[39;00m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;66;03m# killed it but not yet seen its return code. We don't wait for it,\u001b[39;00m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;66;03m# in case it's stuck in uninterruptible sleep. -9 = SIGKILL\u001b[39;00m\n\u001b[1;32m 304\u001b[0m rc \u001b[38;5;241m=\u001b[39m p\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m9\u001b[39m\n\u001b[0;32m--> 305\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CalledProcessError(rc, cell)\n", - "\u001b[0;31mCalledProcessError\u001b[0m: Command 'b'cd $REPO_ROOT\\ndbt run\\n'' returned non-zero exit status 127." + "name": "stdout", + "output_type": "stream", + "text": [ + "24/01/04 22:18:34 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.\n", + "24/01/04 22:18:34 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n", + "24/01/04 22:18:42 WARN Client: Same path resource file:///root/.ivy2/jars/com.databricks_spark-xml_2.12-0.17.0.jar added multiple times to distributed cache.\n", + "24/01/04 22:18:42 WARN Client: Same path resource file:///root/.ivy2/jars/commons-io_commons-io-2.11.0.jar added multiple times to distributed cache.\n", + "24/01/04 22:18:42 WARN Client: Same path resource file:///root/.ivy2/jars/org.glassfish.jaxb_txw2-3.0.2.jar added multiple times to distributed cache.\n", + "24/01/04 22:18:42 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.ws.xmlschema_xmlschema-core-2.3.0.jar added multiple times to distributed cache.\n", + "24/01/04 22:18:42 WARN Client: Same path resource file:///root/.ivy2/jars/org.scala-lang.modules_scala-collection-compat_2.12-2.9.0.jar added multiple times to distributed cache.\n", + "24/01/04 22:19:09 WARN HiveClientImpl: Detected HiveConf hive.execution.engine is 'tez' and will be reset to 'mr' to disable useless hive logic\n", + "\u001b[0m22:19:14 Concurrency: 1 threads (target='dev')\n", + "\u001b[0m22:19:14 \n", + "\u001b[0m22:19:14 1 of 43 START sql table model demo_bronze.brokerage_cash_transaction ........... [RUN]\n", + "24/01/04 22:19:15 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. 
You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "24/01/04 22:19:16 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.\n", + "\u001b[0m22:19:31 1 of 43 OK created sql table model demo_bronze.brokerage_cash_transaction ...... [\u001b[32mOK\u001b[0m in 17.35s]\n", + "\u001b[0m22:19:31 2 of 43 START sql table model demo_bronze.brokerage_daily_market ............... [RUN]\n", + "24/01/04 22:19:32 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:19:45 2 of 43 OK created sql table model demo_bronze.brokerage_daily_market .......... [\u001b[32mOK\u001b[0m in 13.72s]\n", + "\u001b[0m22:19:45 3 of 43 START sql table model demo_bronze.brokerage_holding_history ............ [RUN]\n", + "24/01/04 22:19:46 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:19:48 3 of 43 OK created sql table model demo_bronze.brokerage_holding_history ....... [\u001b[32mOK\u001b[0m in 2.91s]\n", + "\u001b[0m22:19:48 4 of 43 START sql table model demo_bronze.brokerage_trade ...................... [RUN]\n", + "24/01/04 22:19:48 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:19:57 4 of 43 OK created sql table model demo_bronze.brokerage_trade ................. [\u001b[32mOK\u001b[0m in 8.81s]\n", + "\u001b[0m22:19:57 5 of 43 START sql table model demo_bronze.brokerage_trade_history .............. [RUN]\n", + "24/01/04 22:19:57 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:20:03 5 of 43 OK created sql table model demo_bronze.brokerage_trade_history ......... [\u001b[32mOK\u001b[0m in 6.26s]\n", + "\u001b[0m22:20:03 6 of 43 START sql table model demo_bronze.brokerage_watch_history .............. [RUN]\n", + "24/01/04 22:20:04 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:20:11 6 of 43 OK created sql table model demo_bronze.brokerage_watch_history ......... [\u001b[32mOK\u001b[0m in 8.13s]\n", + "\u001b[0m22:20:11 7 of 43 START sql table model demo_bronze.crm_customer_mgmt .................... [RUN]\n", + "24/01/04 22:20:12 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "24/01/04 22:20:12 WARN package: Truncated the string representation of a plan since it was too large. 
This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n", + "\u001b[0m22:20:15 7 of 43 OK created sql table model demo_bronze.crm_customer_mgmt ............... [\u001b[32mOK\u001b[0m in 3.65s]\n", + "\u001b[0m22:20:15 8 of 43 START sql table model demo_bronze.finwire_company ...................... [RUN]\n", + "24/01/04 22:20:15 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:20:17 8 of 43 OK created sql table model demo_bronze.finwire_company ................. [\u001b[32mOK\u001b[0m in 1.85s]\n", + "\u001b[0m22:20:17 9 of 43 START sql table model demo_bronze.finwire_financial .................... [RUN]\n", + "24/01/04 22:20:17 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:20:28 9 of 43 OK created sql table model demo_bronze.finwire_financial ............... [\u001b[32mOK\u001b[0m in 11.16s]\n", + "\u001b[0m22:20:28 10 of 43 START sql table model demo_bronze.finwire_security .................... [RUN]\n", + "24/01/04 22:20:28 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:20:30 10 of 43 OK created sql table model demo_bronze.finwire_security ............... [\u001b[32mOK\u001b[0m in 1.97s]\n", + "\u001b[0m22:20:30 11 of 43 START sql table model demo_bronze.hr_employee ......................... [RUN]\n", + "24/01/04 22:20:30 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:20:31 11 of 43 OK created sql table model demo_bronze.hr_employee .................... [\u001b[32mOK\u001b[0m in 1.47s]\n", + "\u001b[0m22:20:31 12 of 43 START sql table model demo_bronze.reference_date ...................... [RUN]\n", + "24/01/04 22:20:32 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:20:33 12 of 43 OK created sql table model demo_bronze.reference_date ................. [\u001b[32mOK\u001b[0m in 1.44s]\n", + "\u001b[0m22:20:33 13 of 43 START sql table model demo_bronze.reference_industry .................. [RUN]\n", + "24/01/04 22:20:33 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:20:34 13 of 43 OK created sql table model demo_bronze.reference_industry ............. [\u001b[32mOK\u001b[0m in 1.08s]\n", + "\u001b[0m22:20:34 14 of 43 START sql table model demo_bronze.reference_status_type ............... [RUN]\n", + "24/01/04 22:20:34 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. 
You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:20:35 14 of 43 OK created sql table model demo_bronze.reference_status_type .......... [\u001b[32mOK\u001b[0m in 1.09s]\n", + "\u001b[0m22:20:35 15 of 43 START sql table model demo_bronze.reference_tax_rate .................. [RUN]\n", + "24/01/04 22:20:35 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:20:36 15 of 43 OK created sql table model demo_bronze.reference_tax_rate ............. [\u001b[32mOK\u001b[0m in 1.08s]\n", + "\u001b[0m22:20:36 16 of 43 START sql table model demo_bronze.reference_trade_type ................ [RUN]\n", + "24/01/04 22:20:37 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:20:37 16 of 43 OK created sql table model demo_bronze.reference_trade_type ........... [\u001b[32mOK\u001b[0m in 1.00s]\n", + "\u001b[0m22:20:37 17 of 43 START sql table model demo_bronze.syndicated_prospect ................. [RUN]\n", + "24/01/04 22:20:38 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:20:39 17 of 43 OK created sql table model demo_bronze.syndicated_prospect ............ [\u001b[32mOK\u001b[0m in 1.55s]\n", + "\u001b[0m22:20:39 18 of 43 START sql table model demo_silver.daily_market ........................ [RUN]\n", + "24/01/04 22:20:39 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:22:56 18 of 43 OK created sql table model demo_silver.daily_market ................... [\u001b[32mOK\u001b[0m in 137.26s]\n", + "\u001b[0m22:22:56 19 of 43 START sql table model demo_silver.employees ........................... [RUN]\n", + "24/01/04 22:22:56 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:22:58 19 of 43 OK created sql table model demo_silver.employees ...................... [\u001b[32mOK\u001b[0m in 1.81s]\n", + "\u001b[0m22:22:58 20 of 43 START sql table model demo_silver.date ................................ [RUN]\n", + "24/01/04 22:22:58 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:23:00 20 of 43 OK created sql table model demo_silver.date ........................... [\u001b[32mOK\u001b[0m in 1.86s]\n", + "\u001b[0m22:23:00 21 of 43 START sql table model demo_silver.companies ........................... [RUN]\n", + "24/01/04 22:23:00 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. 
You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:23:05 21 of 43 OK created sql table model demo_silver.companies ...................... [\u001b[32mOK\u001b[0m in 5.69s]\n", + "\u001b[0m22:23:06 22 of 43 START sql table model demo_silver.accounts ............................ [RUN]\n", + "24/01/04 22:23:06 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:23:12 22 of 43 OK created sql table model demo_silver.accounts ....................... [\u001b[32mOK\u001b[0m in 6.25s]\n", + "\u001b[0m22:23:12 23 of 43 START sql table model demo_silver.customers ........................... [RUN]\n", + "24/01/04 22:23:12 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:23:19 23 of 43 OK created sql table model demo_silver.customers ...................... [\u001b[32mOK\u001b[0m in 6.45s]\n", + "\u001b[0m22:23:19 24 of 43 START sql table model demo_silver.trades_history ...................... [RUN]\n", + "24/01/04 22:23:19 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:24:11 24 of 43 OK created sql table model demo_silver.trades_history ................. [\u001b[32mOK\u001b[0m in 52.29s]\n", + "\u001b[0m22:24:11 25 of 43 START sql table model demo_gold.dim_broker ............................ [RUN]\n", + "24/01/04 22:24:12 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:24:14 25 of 43 OK created sql table model demo_gold.dim_broker ....................... [\u001b[32mOK\u001b[0m in 2.34s]\n", + "\u001b[0m22:24:14 26 of 43 START sql table model demo_gold.dim_date .............................. [RUN]\n", + "24/01/04 22:24:14 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:24:15 26 of 43 OK created sql table model demo_gold.dim_date ......................... [\u001b[32mOK\u001b[0m in 1.66s]\n", + "\u001b[0m22:24:15 27 of 43 START sql table model demo_gold.dim_company ........................... [RUN]\n", + "24/01/04 22:24:15 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:24:17 27 of 43 OK created sql table model demo_gold.dim_company ...................... [\u001b[32mOK\u001b[0m in 1.90s]\n", + "\u001b[0m22:24:17 28 of 43 START sql table model demo_silver.financials .......................... [RUN]\n", + "24/01/04 22:24:17 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. 
You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:24:29 28 of 43 OK created sql table model demo_silver.financials ..................... [\u001b[32mOK\u001b[0m in 11.47s]\n", + "\u001b[0m22:24:29 29 of 43 START sql table model demo_silver.securities .......................... [RUN]\n", + "24/01/04 22:24:29 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:24:32 29 of 43 OK created sql table model demo_silver.securities ..................... [\u001b[32mOK\u001b[0m in 3.60s]\n", + "\u001b[0m22:24:32 30 of 43 START sql table model demo_silver.cash_transactions ................... [RUN]\n", + "24/01/04 22:24:33 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:24:39 30 of 43 OK created sql table model demo_silver.cash_transactions .............. [\u001b[32mOK\u001b[0m in 7.06s]\n", + "\u001b[0m22:24:39 31 of 43 START sql table model demo_gold.dim_customer .......................... [RUN]\n", + "24/01/04 22:24:40 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:24:45 31 of 43 OK created sql table model demo_gold.dim_customer ..................... [\u001b[32mOK\u001b[0m in 5.64s]\n", + "\u001b[0m22:24:45 32 of 43 START sql table model demo_gold.dim_trade ............................. [RUN]\n", + "24/01/04 22:24:45 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:25:05 32 of 43 OK created sql table model demo_gold.dim_trade ........................ [\u001b[32mOK\u001b[0m in 20.51s]\n", + "\u001b[0m22:25:05 33 of 43 START sql table model demo_silver.trades .............................. [RUN]\n", + "24/01/04 22:25:06 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:25:40 33 of 43 OK created sql table model demo_silver.trades ......................... [\u001b[32mOK\u001b[0m in 34.06s]\n", + "\u001b[0m22:25:40 34 of 43 START sql table model demo_gold.dim_security .......................... [RUN]\n", + "24/01/04 22:25:40 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:25:41 34 of 43 OK created sql table model demo_gold.dim_security ..................... [\u001b[32mOK\u001b[0m in 1.93s]\n", + "\u001b[0m22:25:41 35 of 43 START sql table model demo_silver.watches_history ..................... [RUN]\n", + "24/01/04 22:25:42 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. 
You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:25:54 35 of 43 OK created sql table model demo_silver.watches_history ................ [\u001b[32mOK\u001b[0m in 12.73s]\n", + "\u001b[0m22:25:54 36 of 43 START sql table model demo_gold.dim_account ........................... [RUN]\n", + "24/01/04 22:25:55 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:25:57 36 of 43 OK created sql table model demo_gold.dim_account ...................... [\u001b[32mOK\u001b[0m in 3.07s]\n", + "\u001b[0m22:25:57 37 of 43 START sql table model demo_silver.holdings_history .................... [RUN]\n", + "24/01/04 22:25:58 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:26:10 37 of 43 OK created sql table model demo_silver.holdings_history ............... [\u001b[32mOK\u001b[0m in 12.70s]\n", + "\u001b[0m22:26:10 38 of 43 START sql table model demo_silver.watches ............................. [RUN]\n", + "24/01/04 22:26:10 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:26:29 38 of 43 OK created sql table model demo_silver.watches ........................ [\u001b[32mOK\u001b[0m in 19.37s]\n", + "\u001b[0m22:26:29 39 of 43 START sql table model demo_gold.fact_cash_transactions ................ [RUN]\n", + "24/01/04 22:26:30 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:26:36 39 of 43 OK created sql table model demo_gold.fact_cash_transactions ........... [\u001b[32mOK\u001b[0m in 6.11s]\n", + "\u001b[0m22:26:36 40 of 43 START sql table model demo_gold.fact_trade ............................ [RUN]\n", + "24/01/04 22:26:36 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:26:53 40 of 43 OK created sql table model demo_gold.fact_trade ....................... [\u001b[32mOK\u001b[0m in 17.67s]\n", + "\u001b[0m22:26:53 41 of 43 START sql table model demo_gold.fact_holdings ......................... [RUN]\n", + "24/01/04 22:26:54 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:27:23 41 of 43 OK created sql table model demo_gold.fact_holdings .................... [\u001b[32mOK\u001b[0m in 29.41s]\n", + "\u001b[0m22:27:23 42 of 43 START sql table model demo_gold.fact_watches .......................... [RUN]\n", + "24/01/04 22:27:23 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. 
You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:27:30 42 of 43 OK created sql table model demo_gold.fact_watches ..................... [\u001b[32mOK\u001b[0m in 7.63s]\n", + "\u001b[0m22:27:30 43 of 43 START sql table model demo_gold.fact_cash_balances .................... [RUN]\n", + "24/01/04 22:27:31 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.\n", + "\u001b[0m22:27:41 43 of 43 OK created sql table model demo_gold.fact_cash_balances ............... [\u001b[32mOK\u001b[0m in 10.93s]\n", + "\u001b[0m22:27:41 \n", + "\u001b[0m22:27:41 Finished running 43 table models in 0 hours 9 minutes and 15.90 seconds (555.90s).\n", + "\u001b[0m22:27:41 \n", + "\u001b[0m22:27:41 \u001b[32mCompleted successfully\u001b[0m\n", + "\u001b[0m22:27:41 \n", + "\u001b[0m22:27:41 Done. PASS=43 WARN=0 ERROR=0 SKIP=0 TOTAL=43\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" ] } ], "source": [ "%%bash\n", "cd $REPO_ROOT\n", + "dbt deps\n", "dbt run" ] }, @@ -693,28 +591,174 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "c6b563f4-0b31-40a3-b353-9e6c9de8c482", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0m22:27:47 Running with dbt=1.7.3\n", + "\u001b[0m22:27:48 Installing dbt-labs/dbt_utils\n", + "\u001b[0m22:27:48 Installed from version 1.1.1\n", + "\u001b[0m22:27:48 Up to date!\n", + "\u001b[0m22:27:52 Running with dbt=1.7.3\n", + "\u001b[0m22:27:53 Registered adapter: spark=1.7.1\n", + "\u001b[0m22:27:53 Found 44 models, 4 tests, 17 sources, 0 exposures, 0 metrics, 553 macros, 0 groups, 0 semantic models\n", + "\u001b[0m22:27:53 \n", + ":: loading settings :: url = jar:file:/usr/local/lib/python3.8/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ivy Default Cache set to: /root/.ivy2/cache\n", + "The jars for the packages stored in: /root/.ivy2/jars\n", + "com.databricks#spark-xml_2.12 added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-3fcec813-3c46-422b-815b-97d9de5f26b1;1.0\n", + "\tconfs: [default]\n", + "\tfound com.databricks#spark-xml_2.12;0.17.0 in central\n", + "\tfound commons-io#commons-io;2.11.0 in central\n", + "\tfound org.glassfish.jaxb#txw2;3.0.2 in central\n", + "\tfound org.apache.ws.xmlschema#xmlschema-core;2.3.0 in central\n", + "\tfound org.scala-lang.modules#scala-collection-compat_2.12;2.9.0 in central\n", + ":: resolution report :: resolve 555ms :: artifacts dl 21ms\n", + "\t:: modules in use:\n", + "\tcom.databricks#spark-xml_2.12;0.17.0 from central in [default]\n", + "\tcommons-io#commons-io;2.11.0 from central in [default]\n", + "\torg.apache.ws.xmlschema#xmlschema-core;2.3.0 from central in [default]\n", + "\torg.glassfish.jaxb#txw2;3.0.2 from central in [default]\n", + "\torg.scala-lang.modules#scala-collection-compat_2.12;2.9.0 from central in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + 
"\t---------------------------------------------------------------------\n", + "\t| default | 5 | 0 | 0 | 0 || 5 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-3fcec813-3c46-422b-815b-97d9de5f26b1\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 5 already retrieved (0kB/31ms)\n", + "WARNING: An illegal reflective access operation has occurred\n", + "WARNING: Illegal reflective access by org.apache.hadoop.shaded.org.xbill.DNS.ResolverConfig (file:/usr/local/lib/python3.8/dist-packages/pyspark/jars/hadoop-client-runtime-3.3.2.jar) to method sun.net.dns.ResolverConfiguration.open()\n", + "WARNING: Please consider reporting this to the maintainers of org.apache.hadoop.shaded.org.xbill.DNS.ResolverConfig\n", + "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", + "WARNING: All illegal access operations will be denied in a future release\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/01/04 22:27:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/01/04 22:28:04 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.\n", + "24/01/04 22:28:04 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n", + "24/01/04 22:28:14 WARN Client: Same path resource file:///root/.ivy2/jars/com.databricks_spark-xml_2.12-0.17.0.jar added multiple times to distributed cache.\n", + "24/01/04 22:28:14 WARN Client: Same path resource file:///root/.ivy2/jars/commons-io_commons-io-2.11.0.jar added multiple times to distributed cache.\n", + "24/01/04 22:28:14 WARN Client: Same path resource file:///root/.ivy2/jars/org.glassfish.jaxb_txw2-3.0.2.jar added multiple times to distributed cache.\n", + "24/01/04 22:28:14 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.ws.xmlschema_xmlschema-core-2.3.0.jar added multiple times to distributed cache.\n", + "24/01/04 22:28:14 WARN Client: Same path resource file:///root/.ivy2/jars/org.scala-lang.modules_scala-collection-compat_2.12-2.9.0.jar added multiple times to distributed cache.\n", + "24/01/04 22:28:37 WARN HiveClientImpl: Detected HiveConf hive.execution.engine is 'tez' and will be reset to 'mr' to disable useless hive logic\n", + "\u001b[0m22:28:41 Concurrency: 1 threads (target='dev')\n", + "\u001b[0m22:28:41 \n", + "\u001b[0m22:28:41 1 of 4 START test fact_trade__unique_trade ..................................... [RUN]\n", + "24/01/04 22:28:43 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.\n", + "\u001b[0m22:28:56 1 of 4 PASS fact_trade__unique_trade ........................................... [\u001b[32mPASS\u001b[0m in 15.38s]\n", + "\u001b[0m22:28:56 2 of 4 START test fact_trade__unique_trade-checkpoint .......................... [RUN]\n", + "\u001b[0m22:29:00 2 of 4 PASS fact_trade__unique_trade-checkpoint ................................ 
[\u001b[32mPASS\u001b[0m in 3.69s]\n", + "\u001b[0m22:29:00 3 of 4 START test fact_watches__dates_proper_relation .......................... [RUN]\n", + "\u001b[0m22:29:03 3 of 4 PASS fact_watches__dates_proper_relation ................................ [\u001b[32mPASS\u001b[0m in 3.48s]\n", + "\u001b[0m22:29:03 4 of 4 START test fact_watches__dates_proper_relation-checkpoint ............... [RUN]\n", + "\u001b[0m22:29:05 4 of 4 PASS fact_watches__dates_proper_relation-checkpoint ..................... [\u001b[32mPASS\u001b[0m in 1.95s]\n", + "\u001b[0m22:29:05 \n", + "\u001b[0m22:29:05 Finished running 4 tests in 0 hours 1 minutes and 11.83 seconds (71.83s).\n", + "\u001b[0m22:29:05 \n", + "\u001b[0m22:29:05 \u001b[32mCompleted successfully\u001b[0m\n", + "\u001b[0m22:29:05 \n", + "\u001b[0m22:29:05 Done. PASS=4 WARN=0 ERROR=0 SKIP=0 TOTAL=4\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], "source": [ "%%bash\n", "cd $REPO_ROOT\n", + "dbt deps\n", "dbt test" ] }, + { + "cell_type": "markdown", + "id": "a884478f-1b69-49e3-8b47-88582bb7e316", + "metadata": {}, + "source": [ + "# Test Spark" + ] + }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 12, "id": "a6ce00fe-ce46-4202-a798-a7f227ff3c1c", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = jar:file:/usr/local/lib/python3.8/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "Setting default log level to \"WARN\".\n", - "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "Ivy Default Cache set to: /root/.ivy2/cache\n", + "The jars for the packages stored in: /root/.ivy2/jars\n", + "com.databricks#spark-xml_2.12 added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-ccb06429-1b11-4007-8aea-a563e4eb9ebb;1.0\n", + "\tconfs: [default]\n", + "\tfound com.databricks#spark-xml_2.12;0.17.0 in central\n", + "\tfound commons-io#commons-io;2.11.0 in central\n", + "\tfound org.glassfish.jaxb#txw2;3.0.2 in central\n", + "\tfound org.apache.ws.xmlschema#xmlschema-core;2.3.0 in central\n", + "\tfound org.scala-lang.modules#scala-collection-compat_2.12;2.9.0 in central\n", + ":: resolution report :: resolve 607ms :: artifacts dl 34ms\n", + "\t:: modules in use:\n", + "\tcom.databricks#spark-xml_2.12;0.17.0 from central in [default]\n", + "\tcommons-io#commons-io;2.11.0 from central in [default]\n", + "\torg.apache.ws.xmlschema#xmlschema-core;2.3.0 from central in [default]\n", + "\torg.glassfish.jaxb#txw2;3.0.2 from central in [default]\n", + "\torg.scala-lang.modules#scala-collection-compat_2.12;2.9.0 from central in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 5 | 0 | 0 | 0 || 5 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-ccb06429-1b11-4007-8aea-a563e4eb9ebb\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 5 already retrieved (0kB/27ms)\n", "WARNING: An illegal reflective access operation has occurred\n", "WARNING: Illegal reflective access by 
org.apache.hadoop.shaded.org.xbill.DNS.ResolverConfig (file:/usr/local/lib/python3.8/dist-packages/pyspark/jars/hadoop-client-runtime-3.3.2.jar) to method sun.net.dns.ResolverConfiguration.open()\n", "WARNING: Please consider reporting this to the maintainers of org.apache.hadoop.shaded.org.xbill.DNS.ResolverConfig\n", @@ -726,9 +770,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "24/01/04 13:08:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", - "24/01/04 13:08:42 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.\n", - "24/01/04 13:08:43 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n" + "24/01/05 00:15:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/01/05 00:15:42 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.\n", + "24/01/05 00:15:42 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n", + "24/01/05 00:15:51 WARN Client: Same path resource file:///root/.ivy2/jars/com.databricks_spark-xml_2.12-0.17.0.jar added multiple times to distributed cache.\n", + "24/01/05 00:15:51 WARN Client: Same path resource file:///root/.ivy2/jars/commons-io_commons-io-2.11.0.jar added multiple times to distributed cache.\n", + "24/01/05 00:15:51 WARN Client: Same path resource file:///root/.ivy2/jars/org.glassfish.jaxb_txw2-3.0.2.jar added multiple times to distributed cache.\n", + "24/01/05 00:15:51 WARN Client: Same path resource file:///root/.ivy2/jars/org.apache.ws.xmlschema_xmlschema-core-2.3.0.jar added multiple times to distributed cache.\n", + "24/01/05 00:15:51 WARN Client: Same path resource file:///root/.ivy2/jars/org.scala-lang.modules_scala-collection-compat_2.12-2.9.0.jar added multiple times to distributed cache.\n" ] } ], @@ -744,7 +807,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "8c892605-9496-4526-b574-a19168678934", "metadata": {}, "outputs": [ @@ -752,16 +815,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "24/01/04 13:09:41 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist\n", - "24/01/04 13:09:41 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist\n", - "24/01/04 13:09:52 WARN ObjectStore: Version information not found in metastore. 
hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0\n", - "24/01/04 13:09:52 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.17.0.2\n", - "24/01/04 13:09:52 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException\n", - "+---------+\n", - "|namespace|\n", - "+---------+\n", - "| default|\n", - "+---------+\n", + "24/01/05 00:16:20 WARN HiveClientImpl: Detected HiveConf hive.execution.engine is 'tez' and will be reset to 'mr' to disable useless hive logic\n", + "+-----------+\n", + "| namespace|\n", + "+-----------+\n", + "| bronze|\n", + "| default|\n", + "|demo_bronze|\n", + "| demo_gold|\n", + "|demo_silver|\n", + "| digen|\n", + "| gold|\n", + "| silver|\n", + "+-----------+\n", "\n" ] } @@ -772,76 +838,156 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "id": "b2e94c83-4983-49f5-8597-77e3b9c4661c", "metadata": {}, "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.sql(\"use demo_gold\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44296edd-82e6-4600-b730-d2cc4992a81d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 1:=============================> (1 + 1) / 2]\r" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "24/01/04 13:09:56 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException\n", - "24/01/04 13:09:56 WARN ObjectStore: Failed to get database demp_gold, returning NoSuchObjectException\n" + "+---------+--------------------+-----------+\n", + "|namespace| tableName|isTemporary|\n", + "+---------+--------------------+-----------+\n", + "|demo_gold| dim_account| false|\n", + "|demo_gold| dim_broker| false|\n", + "|demo_gold| dim_company| false|\n", + "|demo_gold| dim_customer| false|\n", + "|demo_gold| dim_date| false|\n", + "|demo_gold| dim_security| false|\n", + "|demo_gold| dim_trade| false|\n", + "|demo_gold| fact_cash_balances| false|\n", + "|demo_gold|fact_cash_transac...| false|\n", + "|demo_gold| fact_holdings| false|\n", + "|demo_gold| fact_trade| false|\n", + "|demo_gold| fact_watches| false|\n", + "+---------+--------------------+-----------+\n", + "\n" ] }, { - "ename": "AnalysisException", - "evalue": "Database 'demp_gold' not found", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAnalysisException\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[42], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mspark\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43muse demp_gold\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/pyspark/sql/session.py:1034\u001b[0m, in \u001b[0;36mSparkSession.sql\u001b[0;34m(self, sqlQuery, **kwargs)\u001b[0m\n\u001b[1;32m 1032\u001b[0m sqlQuery \u001b[38;5;241m=\u001b[39m formatter\u001b[38;5;241m.\u001b[39mformat(sqlQuery, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 1033\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1034\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m DataFrame(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jsparkSession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m(\u001b[49m\u001b[43msqlQuery\u001b[49m\u001b[43m)\u001b[49m, \u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 1035\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 1036\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(kwargs) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", - "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/py4j/java_gateway.py:1321\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1315\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1316\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1317\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1318\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m 1320\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1321\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1322\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1324\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[1;32m 1325\u001b[0m temp_arg\u001b[38;5;241m.\u001b[39m_detach()\n", - "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/pyspark/sql/utils.py:196\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 192\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n\u001b[1;32m 193\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(converted, UnknownException):\n\u001b[1;32m 194\u001b[0m \u001b[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[1;32m 195\u001b[0m \u001b[38;5;66;03m# JVM exception message.\u001b[39;00m\n\u001b[0;32m--> 196\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m converted \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 198\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", - "\u001b[0;31mAnalysisException\u001b[0m: Database 'demp_gold' not found" + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" ] } ], "source": [ - "spark.sql(\"use demp_gold\")" + "spark.sql(\"show tables\").show()" + ] + }, + { + "cell_type": "markdown", + "id": "093e2212-77da-431e-b2d4-a0fe50647240", + "metadata": {}, + "source": [ + "# Calculate layers' tables" ] }, { "cell_type": "code", - "execution_count": 43, - "id": 
"44296edd-82e6-4600-b730-d2cc4992a81d", + "execution_count": 42, + "id": "27d30e8a-9a17-46d0-bc6a-1a5ea2970e75", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "+---------+---------+-----------+\n", - "|namespace|tableName|isTemporary|\n", - "+---------+---------+-----------+\n", - "+---------+---------+-----------+\n", - "\n" + " Layer | # Tables\n", + "----------------+-----------\n", + " bronze | 0\n", + " default | 0\n", + " demo_bronze | 17\n", + " demo_gold | 12\n", + " demo_silver | 14\n", + " digen | 17\n", + " gold | 0\n", + " silver | 0\n" ] } ], "source": [ - "spark.sql(\"show tables\").show()" + "print(f\"{'Layer':>15} | {'# Tables':>10}\")\n", + "print(\"-\"*16 + \"+\" + \"-\"*11)\n", + "for db_row in spark.sql(\"show databases\").collect():\n", + " db_name = db_row.namespace\n", + " spark.sql(f\"use {db_name}\")\n", + " n_tables = spark.sql(\"show tables\").count()\n", + " print(f\"{db_name:>15} | {n_tables:>10}\")\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "d431d55c-20d8-45ed-aa23-72824950a314", + "metadata": {}, + "source": [ + "# Testing" ] }, { "cell_type": "code", - "execution_count": 44, - "id": "7e34fa44-084c-4445-ad41-17b4762a0446", + "execution_count": null, + "id": "4faf7793-ec51-45e9-8ea4-fe65dcd27dc1", "metadata": {}, "outputs": [], "source": [ - "spark.stop()" + "df_pandas = spark.sql(\"SELECT * FROM demo_gold.fact_watches LIMIT 10\").toPandas()" ] }, { "cell_type": "code", "execution_count": null, - "id": "c190b0c0-b4e2-4aed-85aa-1b78b77d1c05", + "id": "708c7e1d-b9fe-4dab-8a12-29d3e8bf1c74", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "df_pandas.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76205ed4-de8b-4aaa-9862-bd1f5e51f80a", + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql('''SELECT sk_trade_id, COUNT(*) cnt\n", + " FROM demo_gold.fact_trade\n", + " GROUP BY sk_trade_id\n", + " HAVING cnt > 1\n", + " ''').show()" + ] } ], "metadata": { diff --git a/tasks-phase2a.md b/tasks-phase2a.md index df47efc..3061e1a 100644 --- a/tasks-phase2a.md +++ b/tasks-phase2a.md @@ -1,98 +1,164 @@ -``` -IMPORTANT ❗ ❗ ❗ Please remember to destroy all the resources after each work session. You can recreate infrastructure by creating new PR and merging it to master. - -![img.png](doc/figures/destroy.png) - -0. The goal of this phase is to create infrastructure, perform benchmarking/scalability tests of sample three-tier lakehouse solution and analyze the results using: - -* [TPC-DI benchmark](https://www.tpc.org/tpcdi/) -* [dbt - data transformation tool](https://www.getdbt.com/) -* [GCP Composer - managed Apache Airflow](https://cloud.google.com/composer?hl=pl) -* [GCP Dataproc - managed Apache Spark](https://spark.apache.org/) -* [GCP Vertex AI Workbench - managed JupyterLab](https://cloud.google.com/vertex-ai-notebooks?hl=pl) - -Worth to read: - -* -* -* -* -* -``` -# 1. Authors: - -### 1.1. Team +# TBD Project | 2023Z | Phase 2 + +> IMPORTANT ❗ ❗ ❗ Please remember to destroy all the resources after each work session. You can recreate infrastructure by creating new PR and merging it to master. 
+> +> ![img.png](doc/figures/destroy.png) +> +> The goal of this phase is to create infrastructure, perform benchmarking/scalability tests of sample three-tier lakehouse solution and analyze the results using: +> +> * [TPC-DI benchmark](https://www.tpc.org/tpcdi/) +> * [dbt - data transformation tool](https://www.getdbt.com/) +> * [GCP Composer - managed Apache Airflow](https://cloud.google.com/composer?hl=pl) +> * [GCP Dataproc - managed Apache Spark](https://spark.apache.org/) +> * [GCP Vertex AI Workbench - managed JupyterLab](https://cloud.google.com/vertex-ai-notebooks?hl=pl) +> +> Worth to read: +> +> * +> * +> * +> * +> * + +# 1. Authors + +## 1.1. Team * Łukasz Staniszewski * Albert Ściseł * Mateusz Szczepanowski -### 1.2. Info +## 1.2. Info * Group number: 5 * Forked repo link: # 2. Replace your root `main.tf` - -From the phase 1 with [main.tf](https://github.com/bdg-tbd/tbd-workshop-1/blob/v1.0.36/main.tf) and change each module `source` reference from the repo relative path to a github repo tag `v1.0.36`. + +> From the phase 1 with [main.tf](https://github.com/bdg-tbd/tbd-workshop-1/blob/v1.0.36/main.tf) and change each module `source` reference from the repo relative path to a github repo tag `v1.0.36`. Changed file: [main.tf](https://github.com/thai-chicken/tbd-2023z-phase1/blob/master/main.tf) # 3. Provision your infrastructure -a) setup Vertex AI Workbench `pyspark` kernel as described in point [8](https://github.com/bdg-tbd/tbd-workshop-1/tree/v1.0.32#project-setup) - -b) upload [tpc-di-setup.ipynb](https://github.com/bdg-tbd/tbd-workshop-1/blob/v1.0.36/notebooks/tpc-di-setup.ipynb) to the running instance of your Vertex AI Workbench +>a) setup Vertex AI Workbench `pyspark` kernel as described in point [8](https://github.com/bdg-tbd/tbd-workshop-1/tree/v1.0.32#project-setup) +> +> +>b) upload [tpc-di-setup.ipynb](https://github.com/bdg-tbd/tbd-workshop-1/blob/v1.0.36/notebooks/tpc-di-setup.ipynb) to the running instance of your Vertex AI Workbench ![img.png](doc/figures/phase2a_task3.png) -# 4. In `tpc-di-setup.ipynb` modify cell under section ***Clone tbd-tpc-di repo***: - -a) first, fork to your github organization. - -* Forked repo link: - -b) create new branch (e.g. 'notebook') in your fork of tbd-tpc-di and modify profiles.yaml by commenting following lines: - -``` -#"spark.driver.port": "30000" -#"spark.blockManager.port": "30001" -#"spark.driver.host": "10.11.0.5" #FIXME: Result of the command (kubectl get nodes -o json | jq -r '.items[0].status.addresses[0].address') -#"spark.driver.bindAddress": "0.0.0.0" -``` - -This lines are required to run dbt on airflow but have to be commented while running dbt in notebook. - -c) update git clone command to point to ***your fork***. +# 4. In `tpc-di-setup.ipynb` modify cell under section ***Clone tbd-tpc-di repo*** + +> a) first, fork to your github organization. +> +> +> b) create new branch (e.g. 'notebook') in your fork of tbd-tpc-di and modify profiles.yaml by commenting following lines: +> +> ```tf +> #"spark.driver.port": "30000" +> #"spark.blockManager.port": "30001" +> #"spark.driver.host": "10.11.0.5" #FIXME: Result of the command (kubectl get nodes -o json | jq -r '.items[0].status.addresses[0].address') +> #"spark.driver.bindAddress": "0.0.0.0" +> ``` +> +> This lines are required to run dbt on airflow but have to be commented while running dbt in notebook. +> +> +> c) update git clone command to point to ***your fork***. 
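+
+For illustration only, the updated clone step in the notebook could look roughly like the sketch below (`<your-organization>` is a placeholder for the GitHub organization that owns the fork):
+
+```bash
+# Sketch: clone the fork of tbd-tpc-di and switch to the 'notebook' branch
+mkdir -p git && cd git
+git clone https://github.com/<your-organization>/tbd-tpc-di.git
+cd tbd-tpc-di
+git checkout notebook
+git pull
+```
+
+This mirrors the original snippet from the task, with only the repository URL swapped to the fork and an explicit checkout of the 'notebook' branch added.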
+
+Forked repo link: 
+
+Link to the notebook: [tpc-di-setup.ipynb](https://github.com/thai-chicken/tbd-2023z-phase1/blob/master/notebooks/tpc-di-setup-dev.ipynb)

# 5. Access Vertex AI Workbench and run cell by cell notebook `tpc-di-setup.ipynb`

-a) in the first cell of the notebook replace: `%env DATA_BUCKET=tbd-2023z-9910-data` with your data bucket.
-
-b) in the cell:
+> a) in the first cell of the notebook replace `%env DATA_BUCKET=tbd-2023z-9910-data` with your data bucket.
+>
+>
+> b) in the cell:
+>
+> ```bash
+> mkdir -p git && cd git
+> git clone https://github.com/mwiewior/tbd-tpc-di.git
+> cd tbd-tpc-di
+> git pull
+> ```
+>
+> replace the repo with your fork. Next, check out the 'notebook' branch.
+>
+>
+> c) after running the first cells, your fork of the `tbd-tpc-di` repository will be cloned into the Vertex AI environment (see the git folder).
+>
+>
+> d) take a look at `git/tbd-tpc-di/profiles.yaml`. This file includes Spark parameters that can be changed if you need to increase the number of executors or the amount of memory:
+>
+> ```tf
+> server_side_parameters:
+>     "spark.driver.memory": "2g"
+>     "spark.executor.memory": "4g"
+>     "spark.executor.instances": "2"
+>     "spark.hadoop.hive.metastore.warehouse.dir": "hdfs:///user/hive/warehouse/"
+> ```
+
+As a result of running the cell with `python3.8 tpcdi.py --output-directory $GEN_OUTPUT_DIR --stage $DATA_BUCKET` the following tables were created:

```bash
-mkdir -p git && cd git
-git clone https://github.com/mwiewior/tbd-tpc-di.git
-cd tbd-tpc-di
-git pull
+DATE table created.
+DAILY_MARKET table created.
+INDUSTRY table created.
+PROSPECT table created.
+CUSTOMER_MGMT table created.
+TAX_RATE table created.
+HR table created.
+WATCH_HISTORY table created.
+TRADE table created.
+TRADE_HISTORY table created.
+STATUS_TYPE table created.
+TRADE_TYPE table created.
+HOLDING_HISTORY table created.
+CASH_TRANSACTION table created.
+CMP table created.
+SEC table created.
+FIN table created.
```

-replace repo with your fork. Next checkout to 'notebook' branch.
+As a result of running `dbt run`, all 43 table models were built successfully:

-c) after running first cells your fork of `tbd-tpc-di` repository will be cloned into Vertex AI enviroment (see git folder).
+```bash
+20:46:34 Finished running 43 table models in 0 hours 18 minutes and 1.68 seconds (1081.68s).
+20:46:34
+20:46:34 Completed successfully
+20:46:34
+20:46:34 Done. PASS=43 WARN=0 ERROR=0 SKIP=0 TOTAL=43
+```

-d) take a look on `git/tbd-tpc-di/profiles.yaml`. This file includes Spark parameters that can be changed if you need to increase the number of executors and
+As a result of running `dbt test` we got:
+```bash
+20:48:06 Finished running 1 test in 0 hours 1 minutes and 5.87 seconds (65.87s).
+20:48:06
+20:48:06 Completed successfully
+20:48:06
+20:48:06 Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1
```
-server_side_parameters:
-    "spark.driver.memory": "2g"
-    "spark.executor.memory": "4g"
-    "spark.executor.instances": "2"
-    "spark.hadoop.hive.metastore.warehouse.dir": "hdfs:///user/hive/warehouse/"
+
+Using Spark SQL we can list all the databases that were created:
+
+```bash
++-----------+
+|  namespace|
++-----------+
+|     bronze|
+|    default|
+|demo_bronze|
+|  demo_gold|
+|demo_silver|
+|      digen|
+|       gold|
+|     silver|
++-----------+
```

# 6. Explore files created by generator and describe them, including format, content, total size

@@ -101,11 +167,46 @@ server_side_parameters:

# 7. Analyze tpcdi.py. What happened in the loading stage?

-***Your answer***
+The script processes and loads the TPC-DI benchmark data into the data lakehouse environment.
It consists of two main functions: `get_session` and `process_files`.
+
+The `get_session()` function sets up a SparkSession with Hive support to handle SQL operations. As a result, four databases are created in Hive: `digen`, `bronze`, `silver` and `gold`.
+
+The `process_files()` function is the core of the script: it processes source files in several formats and loads them into the data lakehouse. It relies on helper functions such as `get_stage_path()` (constructs the staging path), `save_df()` (saves a dataframe as a table) and `upload_files()` (handles file upload to the GCS bucket). Depending on the file format/metadata, each file is processed differently:
+
+* CSV and TXT files are handled the same way - schemas are created using `StructType` and `StructField` objects and then passed to the `load_csv()` function, which loads the file into a dataframe.
+* XML is handled by the SparkSession object to process the hierarchical data.
+* For FINWIRE files, `StructType` and `StructField` are used to read whole lines; the fixed-width fields are then extracted by processing the different record types (`CMP`, `SEC` and `FIN`) separately, each ending up in a separate table.
+
+While loading, the program establishes and manages dependencies using Ivy, then loads the modules and creates the tables - 43 table models in total. The whole process completes successfully.

# 8. Using SparkSQL answer: how many tables were created in each layer?

-***SparkSQL command and output***
+SparkSQL command:
+
+```python
+print(f"{'Layer':>15} | {'# Tables':>10}")
+print("-"*16 + "+" + "-"*11)
+for db_row in spark.sql("show databases").collect():
+    db_name = db_row.namespace
+    spark.sql(f"use {db_name}")
+    n_tables = spark.sql("show tables").count()
+    print(f"{db_name:>15} | {n_tables:>10}")
+```
+
+Result:
+
+```tf
+          Layer |   # Tables
+----------------+-----------
+         bronze |          0
+        default |          0
+    demo_bronze |         17
+      demo_gold |         12
+    demo_silver |         14
+          digen |         17
+           gold |          0
+         silver |          0
+```

# 9. Add some 3 more [dbt tests](https://docs.getdbt.com/docs/build/tests) and explain what you are testing. ***Add new tests to your repository.***

@@ -123,3 +224,42 @@ so dbt_git_repo points to your fork of tbd-tpc-di.
# 11. Redeploy infrastructure and check if the DAG finished with no errors

***The screenshot of Apache Airflow UI***
+
+> The goal of the next tasks is to perform benchmarking/scalability tests of the sample three-tier lakehouse solution.
+
+# 12. In main.tf, change machine_type
+
+> ```tf
+> module "dataproc" {
+>   depends_on   = [module.vpc]
+>   source       = "github.com/bdg-tbd/tbd-workshop-1.git?ref=v1.0.36/modules/dataproc"
+>   project_name = var.project_name
+>   region       = var.region
+>   subnet       = module.vpc.subnets[local.notebook_subnet_id].id
+>   machine_type = "e2-standard-2"
+> }
+> ```
+>
+> and substitute "e2-standard-2" with "e2-standard-4".
+
+# 13. If needed, request an increase of CPU quotas (e.g. to 30 CPUs)
+
+> [Link to console](https://console.cloud.google.com/apis/api/compute.googleapis.com/quotas?project=tbd-2023z-9918)
+
+# 14. Different number of executors
+
+> Using the tbd-tpc-di notebook, perform dbt run with different numbers of executors, i.e., 1, 2, and 5, by changing:
+>
+> ```tf
+> "spark.executor.instances": "2"
+> ```
+>
+> in profiles.yml.
+
+# 15. Collect console output from dbt run
+
+> In the notebook, collect the console output from dbt run, then parse it and retrieve the total execution time and the execution times of each model. Save the results for each number of executors.
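+
+For task 15, a minimal parsing sketch is shown below (illustrative, not the official solution). It assumes the `dbt run` console output has been captured to a string or a file; the total-time pattern matches the "Finished running ... (1081.68s)." line shown above, while the per-model pattern ("... model <schema>.<name> ... [OK in 12.34s]") reflects typical dbt log lines and may need adjusting to your dbt version.
+
+```python
+# Sketch: extract the total run time and per-model execution times from a dbt run log.
+import re
+from typing import Dict, Tuple
+
+
+def parse_dbt_log(log_text: str) -> Tuple[float, Dict[str, float]]:
+    # total time, e.g. "Finished running 43 table models in ... (1081.68s)."
+    total = re.search(r"Finished running .*\((\d+(?:\.\d+)?)s\)", log_text)
+    total_s = float(total.group(1)) if total else float("nan")
+
+    # per-model times, e.g. "5 of 43 OK created sql table model demo_silver.daily_market ... [OK in 12.34s]"
+    per_model = {
+        name: float(seconds)
+        for name, seconds in re.findall(
+            r"model\s+(\S+)\s+\.*\s*\[OK in (\d+(?:\.\d+)?)s\]", log_text
+        )
+    }
+    return total_s, per_model
+
+
+# Example usage (assumed file name): parse the log saved for the 2-executor run.
+# total_s, per_model = parse_dbt_log(open("dbt_run_2_executors.log").read())
+```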
+# 16. Analyze
+
+> Analyze the performance and scalability of the execution times of each model. Visualize and discuss the final results.
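+
+For task 16, the collected timings can then be visualized, e.g. with matplotlib. A minimal sketch, assuming the per-run totals from task 15 are available; the 1- and 5-executor numbers below are made-up placeholders (only the 2-executor total, 1081.68 s, appears in the log above) and must be replaced with your own measurements.
+
+```python
+# Sketch: plot total dbt run time against the number of Spark executors.
+import matplotlib.pyplot as plt
+
+# executors -> total dbt run time in seconds; the 1- and 5-executor values are placeholders
+results = {1: 1500.0, 2: 1081.68, 5: 900.0}
+
+executors = sorted(results)
+times = [results[n] for n in executors]
+
+plt.plot(executors, times, marker="o")
+plt.xlabel("Number of Spark executors")
+plt.ylabel("Total dbt run time [s]")
+plt.title("dbt run scalability (placeholder data)")
+plt.grid(True)
+plt.show()
+```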