From 751c2306e3c71736bad57b774991c40bc570c4f5 Mon Sep 17 00:00:00 2001 From: logicoffee Date: Tue, 20 Jan 2026 20:13:07 +0900 Subject: [PATCH 1/4] Implement roughly --- README.md | 23 +++- pyproject.toml | 1 + .../schema_builder.py | 118 +++++++++++++++++- 3 files changed, 139 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2c63755..8a6b36c 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,12 @@ BigQuery extras (only needed for `build_schema`): pip install analytics-query-analyzer[bigquery] ``` +Redshift extras (only needed for `build_schema`): + +```bash +pip install analytics-query-analyzer[redshift] +``` + ## Support BigQuery is currently supported and tested. @@ -129,7 +135,22 @@ print(schema) ``` - Authentication uses Application Default Credentials (ADC). -- Only BigQuery is supported for `build_schema`. - When `table` is omitted, it scans all tables in the dataset. - When both `dataset` and `table` are omitted, it scans all datasets in the project. - The returned `schema` can be passed directly to `analyze` and `analyze_timespan`. + +Fetching from Redshift is also supported: + +```python +from analytics_query_analyzer import build_schema +from sqlglot import dialects + +schema = build_schema(dialects.Redshift, "my_database", "public", "orders") +print(schema) +``` + +Redshift authentication supports two modes: + +- Set `REDSHIFT_HOST`, `REDSHIFT_USER`, and `REDSHIFT_PASSWORD` (optional `REDSHIFT_PORT`). +- Set `REDSHIFT_CLUSTER_IDENTIFIER`, `REDSHIFT_REGION`, and `REDSHIFT_DB_USER` to use IAM. +- Use `AWS_PROFILE` if you want to select a non-default AWS profile. diff --git a/pyproject.toml b/pyproject.toml index ad87a1b..26a0536 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dynamic = ["version"] [project.optional-dependencies] bigquery = ["google-cloud-bigquery"] +redshift = ["redshift-connector"] [tool.setuptools] package-dir = {"" = "src"} diff --git a/src/analytics_query_analyzer/schema_builder.py b/src/analytics_query_analyzer/schema_builder.py index 28c8fba..971c941 100644 --- a/src/analytics_query_analyzer/schema_builder.py +++ b/src/analytics_query_analyzer/schema_builder.py @@ -1,6 +1,8 @@ +import os + from sqlglot import dialects -SUPPORTED_DIALECTS = {dialects.BigQuery} +SUPPORTED_DIALECTS = {dialects.BigQuery, dialects.Redshift} def build_schema( @@ -14,7 +16,11 @@ def build_schema( ) if resolved not in SUPPORTED_DIALECTS: raise ValueError(f"Unsupported dialect for build_schema: {resolved.__name__}") - return _build_schema_bigquery(database, schema, table) + if resolved is dialects.BigQuery: + return _build_schema_bigquery(database, schema, table) + if resolved is dialects.Redshift: + return _build_schema_redshift(database, schema, table) + raise ValueError(f"Unsupported dialect for build_schema: {resolved.__name__}") def _build_schema_bigquery(project: str, dataset: str | None, table: str | None): @@ -69,3 +75,111 @@ def _dataset_to_schema(dataset_id: str) -> dict: table_ref = dataset_ref.table(table) table_obj = client.get_table(table_ref) return {project: {dataset: _table_to_schema(table_obj)}} + + +def _redshift_connection_kwargs(database: str) -> dict[str, str | int | bool]: + host = os.getenv("REDSHIFT_HOST") + port = os.getenv("REDSHIFT_PORT") + user = os.getenv("REDSHIFT_USER") + password = os.getenv("REDSHIFT_PASSWORD") + if host: + if not user or not password: + raise RuntimeError( + "Redshift host auth requires REDSHIFT_USER and REDSHIFT_PASSWORD." + ) + return { + "host": host, + "port": int(port) if port else 5439, + "database": database, + "user": user, + "password": password, + } + + cluster_identifier = os.getenv("REDSHIFT_CLUSTER_IDENTIFIER") + region = os.getenv("REDSHIFT_REGION") + db_user = os.getenv("REDSHIFT_DB_USER") or user + if cluster_identifier and region and db_user: + return { + "cluster_identifier": cluster_identifier, + "database": database, + "db_user": db_user, + "region": region, + "iam": True, + } + + raise RuntimeError( + "Redshift connection configuration not found. " + "Set REDSHIFT_HOST/REDSHIFT_USER/REDSHIFT_PASSWORD or " + "REDSHIFT_CLUSTER_IDENTIFIER/REDSHIFT_REGION/REDSHIFT_DB_USER." + ) + + +def _build_schema_redshift(database: str, schema: str | None, table: str | None): + try: + import redshift_connector + except ImportError as e: + raise RuntimeError( + "Redshift support is not installed. " + "Install with: pip install analytics-query-analyzer[redshift]" + ) from e + + connection_kwargs = _redshift_connection_kwargs(database) + conn = redshift_connector.connect(**connection_kwargs) + try: + cursor = conn.cursor() + try: + if schema is None: + cursor.execute( + """ + select schema_name + from information_schema.schemata + where schema_name not in ('information_schema', 'pg_catalog') + and schema_name not like 'pg_%' + order by schema_name + """ + ) + schemas = [row[0] for row in cursor.fetchall()] + else: + schemas = [schema] + + result: dict[str, dict] = {} + for schema_name in schemas: + if table is None: + cursor.execute( + """ + select table_name + from information_schema.tables + where table_schema = %s + and table_type = 'BASE TABLE' + order by table_name + """, + (schema_name,), + ) + tables = [row[0] for row in cursor.fetchall()] + else: + tables = [table] + if not tables: + result[schema_name] = {} + continue + + placeholders = ", ".join(["%s"] * len(tables)) + cursor.execute( + f""" + select table_name, column_name, data_type, udt_name + from information_schema.columns + where table_schema = %s + and table_name in ({placeholders}) + order by table_name, ordinal_position + """, + (schema_name, *tables), + ) + table_schema: dict[str, dict[str, str]] = {name: {} for name in tables} + for table_name, column_name, data_type, udt_name in cursor.fetchall(): + column_type = data_type or udt_name or "unknown" + table_schema.setdefault(table_name, {})[column_name] = column_type + result[schema_name] = table_schema + finally: + cursor.close() + finally: + conn.close() + return {database: result} From 7414b1b193bc4d4def84423abe188b56f5bec0eb Mon Sep 17 00:00:00 2001 From: logicoffee Date: Wed, 21 Jan 2026 20:57:42 +0900 Subject: [PATCH 2/4] Make it possible to set host when using iam --- .../schema_builder.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/src/analytics_query_analyzer/schema_builder.py b/src/analytics_query_analyzer/schema_builder.py index 971c941..610fb73 100644 --- a/src/analytics_query_analyzer/schema_builder.py +++ b/src/analytics_query_analyzer/schema_builder.py @@ -82,6 +82,24 @@ def _redshift_connection_kwargs(database: str) -> dict[str, str | int | bool]: port = os.getenv("REDSHIFT_PORT") user = os.getenv("REDSHIFT_USER") password = os.getenv("REDSHIFT_PASSWORD") + + cluster_identifier = os.getenv("REDSHIFT_CLUSTER_IDENTIFIER") + region = os.getenv("REDSHIFT_REGION") + db_user = os.getenv("REDSHIFT_DB_USER") or user + if cluster_identifier and region and db_user: + kwargs = { + "cluster_identifier": cluster_identifier, + "database": database, + "db_user": db_user, + "region": region, + "iam": True, + } + if host: + kwargs["host"] = host + if port: + kwargs["port"] = int(port) + return kwargs + if host: if not user or not password: raise RuntimeError( @@ -95,18 +113,6 @@ def _redshift_connection_kwargs(database: str) -> dict[str, str | int | bool]: "password": password, } - cluster_identifier = os.getenv("REDSHIFT_CLUSTER_IDENTIFIER") - region = os.getenv("REDSHIFT_REGION") - db_user = os.getenv("REDSHIFT_DB_USER") or user - if cluster_identifier and region and db_user: - return { - "cluster_identifier": cluster_identifier, - "database": database, - "db_user": db_user, - "region": region, - "iam": True, - } - raise RuntimeError( "Redshift connection configuration not found. " "Set REDSHIFT_HOST/REDSHIFT_USER/REDSHIFT_PASSWORD or " From d321709aecbd3afc5d7a55e18e6a392629f6428c Mon Sep 17 00:00:00 2001 From: logicoffee Date: Wed, 21 Jan 2026 21:23:34 +0900 Subject: [PATCH 3/4] Update --- README.md | 2 +- src/analytics_query_analyzer/schema_builder.py | 14 ++++++-------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 8a6b36c..78c5fe8 100644 --- a/README.md +++ b/README.md @@ -152,5 +152,5 @@ print(schema) Redshift authentication supports two modes: - Set `REDSHIFT_HOST`, `REDSHIFT_USER`, and `REDSHIFT_PASSWORD` (optional `REDSHIFT_PORT`). -- Set `REDSHIFT_CLUSTER_IDENTIFIER`, `REDSHIFT_REGION`, and `REDSHIFT_DB_USER` to use IAM. +- Set `REDSHIFT_CLUSTER_IDENTIFIER`, `REDSHIFT_REGION`, and `REDSHIFT_DB_USER` to use IAM (you can also set `REDSHIFT_HOST`/`REDSHIFT_PORT`). - Use `AWS_PROFILE` if you want to select a non-default AWS profile. diff --git a/src/analytics_query_analyzer/schema_builder.py b/src/analytics_query_analyzer/schema_builder.py index 610fb73..3bfc6ed 100644 --- a/src/analytics_query_analyzer/schema_builder.py +++ b/src/analytics_query_analyzer/schema_builder.py @@ -80,12 +80,14 @@ def _dataset_to_schema(dataset_id: str) -> dict: def _redshift_connection_kwargs(database: str) -> dict[str, str | int | bool]: host = os.getenv("REDSHIFT_HOST") port = os.getenv("REDSHIFT_PORT") - user = os.getenv("REDSHIFT_USER") - password = os.getenv("REDSHIFT_PASSWORD") cluster_identifier = os.getenv("REDSHIFT_CLUSTER_IDENTIFIER") region = os.getenv("REDSHIFT_REGION") - db_user = os.getenv("REDSHIFT_DB_USER") or user + db_user = os.getenv("REDSHIFT_DB_USER") + + user = os.getenv("REDSHIFT_USER") + password = os.getenv("REDSHIFT_PASSWORD") + if cluster_identifier and region and db_user: kwargs = { "cluster_identifier": cluster_identifier, @@ -100,11 +102,7 @@ def _redshift_connection_kwargs(database: str) -> dict[str, str | int | bool]: kwargs["port"] = int(port) return kwargs - if host: - if not user or not password: - raise RuntimeError( - "Redshift host auth requires REDSHIFT_USER and REDSHIFT_PASSWORD." - ) + if host and user and password: return { "host": host, "port": int(port) if port else 5439, From 2e46f28575dc9f34e75d4b0ae97dfc955dc1dfe0 Mon Sep 17 00:00:00 2001 From: logicoffee Date: Wed, 21 Jan 2026 21:26:46 +0900 Subject: [PATCH 4/4] Update --- src/analytics_query_analyzer/schema_builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/analytics_query_analyzer/schema_builder.py b/src/analytics_query_analyzer/schema_builder.py index 3bfc6ed..3bf732e 100644 --- a/src/analytics_query_analyzer/schema_builder.py +++ b/src/analytics_query_analyzer/schema_builder.py @@ -8,8 +8,8 @@ def build_schema( dialect: str | type[dialects.Dialect], database: str, - schema: str | None, - table: str | None, + schema: str | None = None, + table: str | None = None, ): resolved = ( dialects.get_or_raise(dialect.lower()) if isinstance(dialect, str) else dialect