Skip to content

Commit

Permalink
feat: add performance metrics commands
Browse files Browse the repository at this point in the history
  • Loading branch information
Ian2012 committed Mar 15, 2024
1 parent daf3b92 commit db3cc91
Show file tree
Hide file tree
Showing 6 changed files with 336 additions and 1 deletion.
15 changes: 15 additions & 0 deletions tutoraspects/commands_v0.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,21 @@ def alembic(context, command) -> None:
"""
runner.run_job("aspects", command)

# Ex: "tutor local do performance-metrics "
@click.command(context_settings={"ignore_unknown_options": True})
@click.pass_obj
def performance_metrics(context) -> None:
    """
    Job to measure query performance of the Aspects dashboards in Superset.

    Runs /app/pythonpath/performance_metrics.py inside the superset container.
    """
    config = tutor_config.load(context.root)
    runner = context.job_runner(config)

    command = """echo 'Performance...' &&
    python /app/pythonpath/performance_metrics.py &&
    echo 'Done!';
    """
    runner.run_job("superset", command)

# Ex: "tutor local do import_assets "
@click.command(context_settings={"ignore_unknown_options": True})
Expand Down
16 changes: 16 additions & 0 deletions tutoraspects/commands_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,21 @@ def import_assets() -> list[tuple[str, str]]:
),
]

# Ex: "tutor local do performance-metrics "
@click.command(context_settings={"ignore_unknown_options": True})
def performance_metrics() -> list[tuple[str, str]]:
    """
    Job to measure query performance of the Aspects dashboards in Superset.

    Returns the (service, command) pairs to run: executes
    /app/pythonpath/performance_metrics.py inside the superset container.
    """
    return [
        (
            "superset",
            # NOTE: each fragment ends with a space so the implicit string
            # concatenation does not glue "&&" onto the next command.
            "echo 'Performance...' && "
            "python /app/pythonpath/performance_metrics.py && "
            "echo 'Done!';",
        ),
    ]


# Ex: "tutor local do dump_data_to_clickhouse "
@click.command(context_settings={"ignore_unknown_options": True})
Expand Down Expand Up @@ -311,6 +326,7 @@ def check_superset_assets():
dump_data_to_clickhouse,
transform_tracking_logs,
import_assets,
performance_metrics,
)

COMMANDS = (aspects,)
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
from superset.app import create_app
app = create_app()
app.app_context().push()

"""
The plan
1. Load the instructor dashboard
2. Load the instructor dashboard datasets
3. Perform the following operations for every dataset:
a. Load the dataset
b. Perform queries on the dataset with/without filters
c. Perform aggregations on the dataset
d. Measure the time taken for each operation
e. Repeat 5 times and take metrics such as average, median, max, min, etc.
"""
from superset.extensions import db
from flask import g
from superset.models.dashboard import Dashboard
from superset.models.slice import Slice
from superset.charts.schemas import ChartDataQueryContextSchema
from superset.charts.data.commands.get_data_command import ChartDataCommand
from superset import security_manager
import json
from datetime import datetime
import sqlparse
from unittest.mock import patch
import uuid
import time

# Template placeholder rendered by Tutor when this plugin file is generated.
ASPECTS_VERSION = "{{ASPECTS_VERSION}}"
# Short random suffix so each run can be told apart in the query log.
UUID = str(uuid.uuid4())[0:6]
# Unique run identifier; patched in as the ClickHouse client name below and
# later used to filter system.query_log rows (via http_user_agent).
RUN_ID = f'aspects-{ASPECTS_VERSION}-{UUID}'

def load_dashboards():
    """Return the embeddable dashboards found in the Superset database."""
    # Rendered by Tutor into the list of embeddable dashboard slugs.
    embedable_dashboards = {{ SUPERSET_EMBEDDABLE_DASHBOARDS }}
    query = db.session.query(Dashboard).filter(
        Dashboard.slug.in_(embedable_dashboards)
    )
    return query.all()


def performance_metrics():
    """Measure the performance of every chart on the embeddable dashboards.

    Patches the ClickHouse client name to RUN_ID so that the queries issued
    here can later be located in ClickHouse's query log, then measures each
    chart with meassure_chart.

    Returns:
        list: one result dict per chart (see meassure_chart).
    """
    with patch('clickhouse_connect.common.build_client_name') as mock_build_client_name:
        mock_build_client_name.return_value = RUN_ID
        dashboards = load_dashboards()
        report = []
        for dashboard in dashboards:
            print("Dashboard:", dashboard.slug)
            # Named `chart_slice` to avoid shadowing the builtin `slice`.
            for chart_slice in dashboard.slices:
                report.append(meassure_chart(chart_slice))
        return report


# Template for the per-chart Superset timing line of the final report.
report_format = """
{i}. {slice}
Superset time: {superset_time}"""

# Template for the ClickHouse-side stats of a single query; the caller
# converts query_duration_ms to seconds before formatting.
query_format = """ Query duration: {query_duration_ms} s.
Result rows: {result_rows}
Memory Usage (MB): {memory_usage_mb}
Row count Clickhouse {rowcount:}
Filters: {filters}"""

def meassure_chart(slice, extra_filters=None):
    """Run one chart's query context through Superset and time it.

    Args:
        slice: the Superset Slice (chart) to measure.
        extra_filters: optional filter list to inject into the query context.

    Returns:
        dict: the ChartDataCommand result, extended with "time_elapsed"
        (wall-clock seconds as a string) and "slice".
    """
    print("Slice:", slice)
    payload = json.loads(slice.query_context)
    payload["result_format"] = "json"
    payload["result_type"] = "full"
    payload["force"] = True
    if extra_filters:
        payload["filters"] = extra_filters
    # Queries are executed as the admin user.
    g.user = security_manager.find_user(username="admin")
    loaded_context = ChartDataQueryContextSchema().load(payload)
    command = ChartDataCommand(loaded_context)
    started = datetime.now()
    result = command.run()
    finished = datetime.now()
    result["time_elapsed"] = '{} s.'.format((finished - started).total_seconds())
    result["slice"] = slice
    return result

def get_query_log_from_clickhouse(report):
    """Print the Superset report enriched with ClickHouse query-log stats.

    Runs the "Query performance" chart filtered by RUN_ID to fetch the
    ClickHouse-side stats for every query issued during this run, indexes
    them by SQL text, and prints them next to each Superset measurement.

    Args:
        report: list of per-chart result dicts from performance_metrics().
    """
    # UUID of the "Query performance" chart asset shipped with Aspects.
    chart_uuid = "bb13bb31-c797-4ed3-a7f9-7825cc6dc482"

    log_slice = db.session.query(Slice).filter(Slice.uuid == chart_uuid).one()
    query_context = json.loads(log_slice.query_context)
    # Only fetch log rows produced by this run: the ClickHouse client name
    # was patched to RUN_ID, which shows up as http_user_agent in the log.
    query_context["queries"][0]["filters"].append(
        {
            'col': 'http_user_agent', 'op': '==', 'val': RUN_ID
        }
    )
    log_slice.query_context = json.dumps(query_context)
    result = meassure_chart(log_slice)

    # Index the ClickHouse stats rows by their normalized SQL text.
    clickhouse_queries = {}
    for query in result["queries"]:
        for row in query["data"]:
            parsed_sql = str(sqlparse.parse(row.pop("query"))[0])
            clickhouse_queries[parsed_sql] = row

    print("\nSuperset Reports")
    for i, chart_result in enumerate(report):
        print(report_format.format(
            i=(i+1),
            slice=chart_result["slice"],
            superset_time=chart_result["time_elapsed"]
        ))
        for query in chart_result["queries"]:
            # Re-append the FORMAT clause so the SQL matches the text
            # ClickHouse logged — presumably how the client sent it; confirm
            # against the query_log contents if lookups start missing.
            parsed_sql = str(sqlparse.parse(query["query"])[0]).replace(";", "") + "\n FORMAT Native"
            clickhouse_report = clickhouse_queries.get(parsed_sql, {})
            print(query_format.format(
                # Default to 0: the log row can be absent if ClickHouse has
                # not flushed the query yet, and .get() alone would return
                # None and crash on the division.
                query_duration_ms=clickhouse_report.get("query_duration_ms", 0) / 1000,
                memory_usage_mb=clickhouse_report.get("memory_usage_mb"),
                result_rows=clickhouse_report.get("result_rows"),
                rowcount=query["rowcount"],
                filters=query["applied_filters"],
            ))




if __name__ == "__main__":
    print("Running performance metrics. RUN ID:", RUN_ID)
    results = performance_metrics()
    # ClickHouse flushes its query log asynchronously; give it a moment
    # before trying to read this run's entries back.
    print("Waiting for clickhouse log...")
    time.sleep(10)
    get_query_log_from_clickhouse(results)
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
_file_name: Query_performance.yaml
cache_timeout: null
certification_details: null
certified_by: null
dataset_uuid: 53099fd2-6739-45e5-9a93-fcb9fa70119f
description: null
params:
adhoc_filters:
- clause: WHERE
comparator: No filter
expressionType: SIMPLE
operator: TEMPORAL_RANGE
subject: event_time
all_columns: []
color_pn: true
extra_form_data: {}
groupby:
- http_user_agent
- query_duration_ms
- result_rows
- event_time
- memory_usage_mb
- tables
- query
order_by_cols: []
order_desc: true
percent_metrics: []
query_mode: aggregate
row_limit: 1000
server_page_length: 10
show_cell_bars: true
table_timestamp_format: smart_date
temporal_columns_lookup:
event_time: true
time_grain_sqla: P1D
viz_type: table
query_context: '{"datasource":{"id":266,"type":"table"},"force":false,"queries":[{"filters":[{"col":"event_time","op":"TEMPORAL_RANGE","val":"No
filter"}],"extras":{"time_grain_sqla":"P1D","having":"","where":""},"applied_time_extras":{},"columns":["http_user_agent","query_duration_ms","result_rows",{"timeGrain":"P1D","columnType":"BASE_AXIS","sqlExpression":"event_time","label":"event_time","expressionType":"SQL"},"memory_usage_mb","tables","query"],"metrics":[],"orderby":[],"annotation_layers":[],"row_limit":1000,"series_limit":0,"order_desc":true,"url_params":{},"custom_params":{},"custom_form_data":{},"post_processing":[]}],"form_data":{"datasource":"266__table","viz_type":"table","slice_id":422,"query_mode":"aggregate","groupby":["http_user_agent","query_duration_ms","result_rows","event_time","memory_usage_mb","tables","query"],"time_grain_sqla":"P1D","temporal_columns_lookup":{"event_time":true},"all_columns":[],"percent_metrics":[],"adhoc_filters":[{"clause":"WHERE","comparator":"No
filter","expressionType":"SIMPLE","operator":"TEMPORAL_RANGE","subject":"event_time"}],"order_by_cols":[],"row_limit":1000,"server_page_length":10,"order_desc":true,"table_timestamp_format":"smart_date","show_cell_bars":true,"color_pn":true,"extra_form_data":{},"dashboards":[],"force":false,"result_format":"json","result_type":"full"},"result_format":"json","result_type":"full"}'
slice_name: Query performance
uuid: bb13bb31-c797-4ed3-a7f9-7825cc6dc482
version: 1.0.0
viz_type: table
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
_file_name: query_log.yaml
cache_timeout: null
columns:
- advanced_data_type: null
column_name: event_time
description: null
expression: null
extra: {}
filterable: true
groupby: true
is_active: true
is_dttm: true
python_date_format: null
type: DateTime
verbose_name: Event Time
- advanced_data_type: null
column_name: query
description: null
expression: null
extra: {}
filterable: true
groupby: true
is_active: true
is_dttm: false
python_date_format: null
type: String
verbose_name: Query
- advanced_data_type: null
column_name: tables
description: null
expression: null
extra: {}
filterable: true
groupby: true
is_active: true
is_dttm: false
python_date_format: null
type: Array(LowCardinality(String))
verbose_name: null
- advanced_data_type: null
column_name: http_user_agent
description: null
expression: null
extra: {}
filterable: true
groupby: true
is_active: true
is_dttm: false
python_date_format: null
type: LowCardinality(String)
verbose_name: null
- advanced_data_type: null
column_name: memory_usage_mb
description: null
expression: null
extra: {}
filterable: true
groupby: true
is_active: true
is_dttm: false
python_date_format: null
type: Float64
verbose_name: null
- advanced_data_type: null
column_name: query_duration_ms
description: null
expression: null
extra: {}
filterable: true
groupby: true
is_active: true
is_dttm: false
python_date_format: null
type: UInt64
verbose_name: null
- advanced_data_type: null
column_name: result_rows
description: null
expression: null
extra: {}
filterable: true
groupby: true
is_active: true
is_dttm: false
python_date_format: null
type: UInt64
verbose_name: null
database_uuid: 21174b6c-4d40-4958-8161-d6c3cf5e77b6
default_endpoint: null
description: null
extra: null
fetch_values_predicate: null
filter_select_enabled: true
main_dttm_col: null
metrics:
- currency: null
d3format: null
description: null
expression: COUNT(*)
extra:
warning_markdown: ''
metric_name: count
metric_type: count
verbose_name: COUNT(*)
warning_text: null
normalize_columns: false
offset: 0
params: null
schema: main
sql: |
SELECT
query_duration_ms,
result_rows,
memory_usage / (1024 * 1024 ) AS memory_usage_mb,
event_time,
tables,
query,
http_user_agent
FROM system.query_log
WHERE (has(databases, '{{ASPECTS_XAPI_DATABASE}}') OR has(databases, '{{ASPECTS_EVENT_SINK_DATABASE}}') OR has(databases, '{{DBT_PROFILE_TARGET_DATABASE}}')) AND (http_user_agent LIKE 'aspects-%') AND (type = 'QueryFinish')
ORDER BY event_time DESC
table_name: query_log
template_params: null
uuid: 53099fd2-6739-45e5-9a93-fcb9fa70119f
version: 1.0.0
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ SELECT
FROM
system.query_log
WHERE
TYPE = 'QueryFinish'
type = 'QueryFinish'
ORDER BY
query_duration_ms DESC;

0 comments on commit db3cc91

Please sign in to comment.