diff --git a/Dockerfile b/Dockerfile
index 1cd7cd0..8ad352e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -34,8 +34,8 @@ RUN apt-get clean \
 ENV PYTHONIOENCODING=utf-8
 ENV LANG=C.UTF-8
 
-# Update python
-RUN python -m pip install --upgrade pip setuptools wheel --no-cache-dir
+# Update python packages
+RUN python -m pip install --upgrade pip setuptools wheel yq pytz pandas colorama --no-cache-dir
 
 ##
 # dbt-bigquery with all packages
diff --git a/cron_dbt_docs.sh b/cron_dbt_docs.sh
index 6ab0e61..ae9d646 100644
--- a/cron_dbt_docs.sh
+++ b/cron_dbt_docs.sh
@@ -2,6 +2,9 @@
 #
 set -o errexit
 
+set -o pipefail
+set -o nounset
+
 catch() {
   echo 'catching!'
   if [ "$1" != "0" ]; then
@@ -24,14 +27,99 @@ while [ $# -gt 0 ]; do
   shift
 done
 
-echo 'Start realoading DBT Workload'
+echo 'Start reloading DBT Workload'
 echo 'Checking dependencies'
 echo 'Update DBT-Project Repo files'
 echo "Working on dbt-project directory ${DBT_REPO_NAME}"
-if cd /data/dbt/"${DBT_REPO_NAME}"; then git config pull.rebase true; git reset --hard; git pull; else git clone "${GITLINK_SECRET}" /data/dbt/; fi
+DBT_DIR="/data/dbt/${DBT_REPO_NAME}"
+if cd "${DBT_DIR}"; then
+  git config pull.rebase true
+  git reset --hard
+  git pull
+else
+  # NOTE(review): the clone target is /data/dbt/, not ${DBT_DIR}; the cd below
+  # only succeeds if the clone produces ${DBT_DIR} - confirm the repo layout.
+  git clone "${GITLINK_SECRET}" /data/dbt/
+  cd "${DBT_DIR}"
+fi
+
+#######################################
+# Drop the re-data package from packages.yml when DATA_QUALITY is disabled.
+# Arguments:
+#   $1 - Airflow variables YAML file (default: dbt_airflow_variables.yml)
+#   $2 - dbt packages file (default: packages.yml)
+#   Relative paths are resolved against the current directory.
+# Outputs:
+#   Progress and warning messages on stdout.
+# Returns:
+#   0 when nothing had to change or the package was removed.
+#######################################
+check_and_modify_packages() {
+  local airflow_vars_file="${1:-dbt_airflow_variables.yml}"
+  local packages_file="${2:-packages.yml}"
+
+  # Anchor relative paths to the current directory (the dbt project dir).
+  if [[ "${airflow_vars_file}" != /* ]]; then
+    airflow_vars_file="./${airflow_vars_file}"
+  fi
+  if [[ "${packages_file}" != /* ]]; then
+    packages_file="./${packages_file}"
+  fi
+
+  if [ ! -f "${airflow_vars_file}" ]; then
+    echo "⚠️ [WARNING] Airflow variables file not found: ${airflow_vars_file}. Skipping DATA_QUALITY check."
+    return 0
+  fi
+
+  if [ ! -f "${packages_file}" ]; then
+    echo "⚠️ [WARNING] packages.yml not found: ${packages_file}. Skipping package modification."
+    return 0
+  fi
+
+  # Parse DATA_QUALITY similarly to mr_e2e_workflow.yaml: value of the last
+  # matching line, quotes stripped. awk is used instead of grep so that a file
+  # without the key yields an empty value rather than a non-zero pipeline
+  # status, which would abort the script under errexit + pipefail.
+  local data_quality
+  data_quality="$(awk '/^[[:space:]]*DATA_QUALITY:/ { value = $2 } END { print value }' "${airflow_vars_file}" | tr -d "'\"")"
+
+  # Only modify if the re-data package exists in packages.yml
+  if grep -q 're-data/re_data' "${packages_file}"; then
+    if [ "${data_quality}" = "false" ] || [ "${data_quality}" = "False" ]; then
+      echo "⚠️ [WARNING] DATA_QUALITY is ${data_quality}. Removing re-data package from ${packages_file}"
+
+      if command -v yq >/dev/null 2>&1; then
+        # yq v4 syntax (same as mr_e2e_workflow.yaml)
+        # NOTE(review): the pip "yq" package is the jq wrapper, which needs jq
+        # and prints JSON unless -y is given - confirm which yq is on PATH.
+        yq 'del(.packages[] | select(.package == "re-data/re_data"))' "${packages_file}" > "${packages_file}.tmp"
+      else
+        # Fallback removal (no yq): drop the '- package: re-data/re_data' block.
+        # Skipping stops at the next list item of any kind (package, git, local,
+        # ...) or at the next unindented line, so unrelated entries are kept.
+        awk '
+          skip && (/^[[:space:]]*-[[:space:]]/ || /^[^[:space:]]/) { skip = 0 }
+          /^[[:space:]]*-[[:space:]]*package:[[:space:]]*re-data\/re_data[[:space:]]*$/ {
+            skip = 1
+            next
+          }
+          !skip { print }
+        ' "${packages_file}" > "${packages_file}.tmp"
+      fi
+
+      mv "${packages_file}.tmp" "${packages_file}"
+
+      # Force recompilation by removing manifest (if present)
+      rm -f "./target/manifest.json"
+      echo "✅ [SUCCESS] re-data package removed and manifest cleared"
+    fi
+  fi
+}
+
+# Always check DATA_QUALITY and adjust packages.yml before dbt deps
+check_and_modify_packages "${AIRFLOW_SECRET_FILE_NAME:-dbt_airflow_variables.yml}" "packages.yml"
+
 echo 'Update dbt packages'
 /usr/local/bin/dbt deps --profiles-dir /data/dbt/"${DBT_REPO_NAME}"/ --project-dir /data/dbt/"${DBT_REPO_NAME}"/
 echo 'Generate dbt docs'
-/usr/local/bin/dbt docs generate --profiles-dir /data/dbt/"${DBT_REPO_NAME}"/ --project-dir /data/dbt/"${DBT_REPO_NAME}"/
-
-trap 'catch $? $LINENO' EXIT
\ No newline at end of file
+/usr/local/bin/dbt docs generate --profiles-dir /data/dbt/"${DBT_REPO_NAME}"/ --project-dir /data/dbt/"${DBT_REPO_NAME}"/
\ No newline at end of file