From e9e15554d7f2f586a1ea2a81fb896a95b9feba85 Mon Sep 17 00:00:00 2001 From: John Brittain Date: Fri, 11 Oct 2024 15:45:12 +0100 Subject: [PATCH] Mark rows excluded from ingestion --- pyproject.toml | 1 + src/InsightBoard/__main__.py | 2 + src/InsightBoard/database/database.py | 6 +- src/InsightBoard/pages/upload.py | 233 ++++++++++++++++++++------ src/InsightBoard/utils.py | 20 ++- tests/unit/pages/test_upload.py | 41 ++++- uv.lock | 13 +- 7 files changed, 255 insertions(+), 61 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 612928e..38c7de2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "dash-dangerously-set-inner-html>=0.0.2", "tomli-w>=1.0.0", "waitress>=3.0.0", + "cachetools>=5.5.0", ] scripts = { InsightBoard = "InsightBoard:main" } dynamic = ["version"] diff --git a/src/InsightBoard/__main__.py b/src/InsightBoard/__main__.py index 9061b41..320624e 100644 --- a/src/InsightBoard/__main__.py +++ b/src/InsightBoard/__main__.py @@ -1,3 +1,4 @@ +import logging import argparse from InsightBoard import main from InsightBoard.app import app @@ -10,6 +11,7 @@ if args.debug: # Dash launches a Flask development server + logging.basicConfig(level=logging.DEBUG) app.run(debug=True) else: # Otherwise, launch the production server diff --git a/src/InsightBoard/database/database.py b/src/InsightBoard/database/database.py index 1c4ea84..73d75c1 100644 --- a/src/InsightBoard/database/database.py +++ b/src/InsightBoard/database/database.py @@ -9,7 +9,7 @@ from pathlib import Path from pyarrow import Table from datetime import datetime -from functools import cache +from cachetools import cached, TTLCache class DatabaseBackend(Enum): @@ -71,6 +71,7 @@ def commit_tables(self, table_names: [str], datasets: [pd.DataFrame]): for table_name, df in zip(table_names, datasets, strict=True): self.commit_table(table_name, df) + @cached(TTLCache(maxsize=1, ttl=10)) def get_primary_key(self, table_name: str): schema = self.get_table_schema(table_name) # Find field that has the 'PrimaryKey' set to 'True' @@ -84,13 +85,14 @@ def get_primary_key(self, table_name: str): raise ValueError(f"Table '{table_name}' has more than one primary key.") return primary_key + @cached(TTLCache(maxsize=1, ttl=10)) def get_primary_keys(self, table_name: str): primary_key = self.get_primary_key(table_name) if not primary_key: return [] return self.read_table_column(table_name, primary_key).tolist() - @cache + @cached(TTLCache(maxsize=1, ttl=10)) def get_table_schema(self, table_name: str): schema_filename = ( Path(self.data_folder).parent / "schemas" / f"{table_name}.schema.json" diff --git a/src/InsightBoard/pages/upload.py b/src/InsightBoard/pages/upload.py index 68e2d42..73dac02 100644 --- a/src/InsightBoard/pages/upload.py +++ b/src/InsightBoard/pages/upload.py @@ -15,6 +15,11 @@ # so stop adding rules after this limit is reached MAX_CONDITIONAL_FORMATTING = 512 +# Due to formatting constraints we store the _delete column (which doubles as a button) as a string +_DELETE_COLUMN = "_delete" +_DELETE_TRUE = "↺" +_DELETE_FALSE = "✖" + # Register the page dash.register_page(__name__, path="/upload") projectObj = None @@ -99,8 +104,15 @@ def layout(): columns=[], data=[], editable=True, - row_deletable=True, + hidden_columns=[], + column_selectable=None, style_data_conditional=[], + css=[ + { # Hide the annoying `Toggle Columns` button + "selector": ".show-hide", + "rule": "display: none", + }, + ], tooltip_data=[], page_size=25, style_table={ @@ -108,10 +120,17 @@ def layout(): 
"min-height": "300px", "overflowY": "auto", }, + # Freeze 'delete' and 'Row' columns fixed_columns={ "headers": True, - "data": 1, - }, # Freeze first column + "data": 2, + }, + style_header_conditional=[ + { + "if": {"column_id": _DELETE_COLUMN}, + "color": "transparent", + }, + ], ), ], style={ @@ -146,25 +165,48 @@ def layout(): "justifyContent": "flex-end", }, ), - # Button for reparsing edited data - dbc.Button("Revalidate", id="update-button", n_clicks=0), - # Buttons for downloading CSV and committing changes - dbc.Button( - "Download as CSV", - id="download-button", - n_clicks=0, - style={"margin": "10px"}, - ), - dcc.Download(id="download-csv"), - # Commit Button moved to the right-hand side - dbc.Button( - "Commit to Database", - id="commit-button", - n_clicks=0, - disabled=True, - style={"float": "right", "margin": "10px"}, - ), + # Buttons for row operations (on the next line) + html.Div(id="data-stats"), html.Div(id="commit-output"), + html.Div([ + dbc.Button( + "Reject rows with empty IDs", + id="remove-empty-ids-button", + n_clicks=0, + style={"marginRight": "5px"}, + ), + dbc.Button( + "Reject rows with any errors", + id="remove-error-rows-button", + n_clicks=0, + style={"margin": "5px"}, + ), + dbc.Button( + "Restore deleted rows", + id="restore-deleted-rows-button", + n_clicks=0, + style={"margin": "5px"}, + ), + html.Br(), + # Button for reparsing edited data + dbc.Button("Update & validate", id="update-button", n_clicks=0, style={"marginRight": "5px"}), + # Buttons for downloading CSV and committing changes + dbc.Button( + "Download as CSV", + id="download-button", + n_clicks=0, + style={"margin": "5px"}, + ), + dcc.Download(id="download-csv"), + # Commit Button moved to the right-hand side + dbc.Button( + "Commit to Database", + id="commit-button", + n_clicks=0, + disabled=True, + style={"float": "right", "margin": "10px", "verticalAlign": "middle"}, + ), + ]), dcc.ConfirmDialog(id="confirm-commit-dialog", message=""), dbc.Checklist( id="upload-settings", @@ -173,7 +215,7 @@ def layout(): {"label": "Show full validation log", "value": 2}, {"label": "Update existing records", "value": 3}, ], - value=[3], # list of 'value's are are 'on', e.g. [2] + value=[3], # list of 'value's that are 'on' by default inline=True, switch=True, style={"margin": "10px"}, @@ -189,11 +231,12 @@ def layout(): @callback( Output("url-refresh", "href"), # Refresh the page Input("close-button", "n_clicks"), # Triggered by 'Close File' button + Input("project", "data"), # Triggered by project selection in navbar ) -def refresh_url(n_clicks): +def refresh_url(n_clicks, project): if n_clicks: return "/upload" - return None + raise dash.exceptions.PreventUpdate # Update parser dropdown based on selected project @@ -270,12 +313,19 @@ def update_page_size(page_size): # When a table name is selected from the dropdown, update the DataTable display @callback( Output("editable-table", "columns"), # Update DataTable + Output("editable-table", "hidden_columns"), Output("editable-table", "data"), + Output("editable-table", "active_cell"), + Output("data-stats", "children"), Input("imported-tables-dropdown", "options"), # Triggered by 'table' selection ... 
Input("imported-tables-dropdown", "value"), Input("unique-table-id", "data"), Input("only-show-validation-errors", "value"), Input("update-existing-records", "value"), + Input("remove-empty-ids-button", "n_clicks"), + Input("remove-error-rows-button", "n_clicks"), + Input("restore-deleted-rows-button", "n_clicks"), + Input("editable-table", "active_cell"), State("project", "data"), State("edited-data-store", "data"), # Populate with table from edited-data store State("parsed-data-store", "data"), @@ -287,6 +337,10 @@ def update_table( unique_table_id, only_show_validation_errors, update_existing_records, + remove_empty_ids_n_clicks, + remove_error_rows_n_clicks, + restore_deleted_rows_n_clicks, + active_cell, project, edited_datasets, parsed_datasets, @@ -297,9 +351,22 @@ def update_table( if not datasets: datasets = parsed_datasets if not datasets: - return [], [] + raise dash.exceptions.PreventUpdate + + ctx = dash.callback_context + trig_active_cell = ctx_trigger(ctx, "editable-table.active_cell") + trig_remove_empty_ids = ctx_trigger(ctx, "remove-empty-ids-button.n_clicks") + trig_remove_error_rows = ctx_trigger(ctx, "remove-error-rows-button.n_clicks") + trig_restore_deleted_rows = ctx_trigger(ctx, "restore-deleted-rows-button.n_clicks") + + # The only active cell we want to respond to is the delete button + if trig_active_cell and active_cell and not active_cell.get("column_id") == _DELETE_COLUMN: + raise dash.exceptions.PreventUpdate data = datasets[options.index(selected_table)] + data_stats = f"Total rows: {len(data)}" + projectObj = utils.get_project(project) + primary_key = projectObj.database.get_primary_key(selected_table) # Convert any lists to strings for display data = clean_dataset(data, project, selected_table, lists_to_strings=True) @@ -307,22 +374,59 @@ def update_table( columns = [{"name": col, "id": col, "editable": True} for col in keys] columns = utils.ensure_schema_ordering(columns, project, selected_table) - # Prepend non-editable 'Row' column - columns.insert(0, {"name": "Row", "id": "Row", "editable": False}) - for i, row in enumerate(data): - row["Row"] = i + 1 - # Filter showing only rows with errors if only_show_validation_errors: data = [row for row, error in zip(data, errors) if any(error)] if not update_existing_records: - # Only display rows where primary key does not already exist in the database - projectObj = utils.get_project(project) - primary_key = projectObj.database.get_primary_key(selected_table) + # Remove rows where the primary key does not already exist in the database existing_keys = projectObj.database.get_primary_keys(selected_table) - data = [row for row in data if row.get(primary_key) not in existing_keys] + data = [row for row in data if row.get(primary_key, None) not in existing_keys] + + # Respond to delete button clicks + if active_cell and active_cell.get("column_id") == _DELETE_COLUMN: + i = active_cell.get("row") + row = data[i] + active_cell = False # Permits the button to be clicked again straight away + row[_DELETE_COLUMN] = ( + _DELETE_FALSE + if row.get(_DELETE_COLUMN, _DELETE_FALSE) == _DELETE_TRUE + else _DELETE_TRUE + ) + + # Mark rows with empty IDs for deletion + if trig_remove_empty_ids: + for row in data: + if not row.get(primary_key, None): + row[_DELETE_COLUMN] = _DELETE_TRUE + + # Mark rows with errors for deletion + if trig_remove_error_rows: + for row in data: + i = row["Row"] - 1 + if any(errors[i]): + row[_DELETE_COLUMN] = _DELETE_TRUE + + # Restore deleted rows + if trig_restore_deleted_rows: + for row in data: 
+            row[_DELETE_COLUMN] = _DELETE_FALSE
+
+    # Check how many visible rows are marked for deletion
+    deleted_rows = len([row for row in data if row.get(_DELETE_COLUMN, _DELETE_FALSE) == _DELETE_TRUE])
+
+    # Move columns '_delete' and 'Row' to the front
+    columns = [
+        {"name": _DELETE_COLUMN, "id": _DELETE_COLUMN, "editable": False},
+        {"name": "Row", "id": "Row", "editable": False},
+        *[col for col in columns if col["id"] not in [_DELETE_COLUMN, "Row"]],
+    ]
+
+    hidden_columns = []
+    data_stats += f", Showing: {len(data)}"
+    if deleted_rows:
+        data_stats += f", Deleted: {deleted_rows}"
 
-    return columns, data
+    return columns, hidden_columns, data, active_cell, data_stats
 
 
 # Utility function to remove quotes from strings
@@ -350,7 +454,7 @@ def clean_value(x, key_name=None, dtypes={}):
         "any": Any type is allowed.
 
     """
-    # Target type can be not specified if, e.g. enum or 'Rows' column
+    # Target type may be left unspecified, e.g. for an enum or the 'Row' column
     if isinstance(dtypes, str) and dtypes == "any":
         target_types = [
             "string",
@@ -462,15 +566,14 @@ def update_edited_data(
 ):
         new_edited_data_store = parsed_data
     if not new_edited_data_store:
-        return []
+        raise dash.exceptions.PreventUpdate
 
-    # Merge full data with edited data based on Row number, then pop Row
+    # Merge full data with edited data based on Row number
     full_edited_data = new_edited_data_store[tables.index(selected_table)]
     for row in edited_table_data:
         row_idx = row.get("Row", None)
         if row_idx:
             full_edited_data[row_idx - 1] = row
-        row.pop("Row", None)
 
     # Clean data
     clean_table_data = clean_dataset(
@@ -563,6 +666,7 @@ def validate_errors(
 
     # Validate the data against the schema
     df = df.where(pd.notnull(df), None)
+    df.drop(columns=["Row", _DELETE_COLUMN], inplace=True, errors="ignore")
     errors = errors_to_dict(utils.validate_against_jsonschema(df, schema_file_relaxed))
 
     # If a strict schema exists, validate against that too
@@ -791,6 +895,12 @@ def parse_data(project, contents, filename, selected_parser):
         )
     table_name = next(iter(parsed_dbs))
 
+    # Populate 'Row' and '_delete' columns
+    for table in parsed_dbs_dict:
+        for i, row in enumerate(table):
+            row["Row"] = i + 1
+            row[_DELETE_COLUMN] = _DELETE_FALSE
+
     return (
         f"File '{filename}' uploaded successfully.",
         parsed_dbs_dict,
@@ -845,23 +955,35 @@ def highlight_and_tooltip_changes(
     ]
     tooltip_data = [{} for _ in range(start_idx)]
     keys = next(iter(data)).keys()
-    data_cols = [k for k in keys if k != "Row"]
+    data_cols = [k for k in keys if k not in ["Row", _DELETE_COLUMN]]
+    deleted_rows = []
     error_rows = []
     # Iterate over each row in the modified data
     try:
         # Ensure rows with errors are highlighted before placing cell-level highlights
         for i, row in enumerate(data[start_idx:end_idx]):
-            row_tooltip = {}  # Store tooltips for the row
-            if only_show_validation_errors:
-                idx = row["Row"] - 1
-            else:
-                idx = i + start_idx
+            idx = row["Row"] - 1
             errors = validation_errors[idx]
-            # First, check for validation errors and highlight row
-            if any(errors):
+            # Check for deleted rows
+            if row[_DELETE_COLUMN] == _DELETE_TRUE:
+                deleted_rows.append(idx + 1)
+            # Check for validation errors and highlight row
+            if any(errors) and row[_DELETE_COLUMN] == _DELETE_FALSE:
                 error_rows.append(idx + 1)
+        if deleted_rows:
+            style_data_conditional.append(
+                {
+                    "if": {
+                        "filter_query": " || ".join(
+                            [f"{{Row}} = {k}" for k in deleted_rows]
+                        ),
+                    },
+                    "backgroundColor": "#CCCCCC",
+                    "color": "#A0A0A0",
+                }
+            )
         if error_rows:
             style_data_conditional.append(
                 {
@@ -877,10 +999,7 @@ def highlight_and_tooltip_changes(
         for i, row in enumerate(data[start_idx:end_idx]):
             row_tooltip = {}  # Store tooltips for the row
-            if only_show_validation_errors:
-                idx = row["Row"] - 1
-            else:
-                idx = i + start_idx
+            idx = row["Row"] - 1
             errors = validation_errors[idx]
             # Show validation errors per cell and show tooltip
             for error in errors:
@@ -984,7 +1103,8 @@ def update_table_style_and_validate(
 def download_csv(n_clicks, data, table_name):
     if n_clicks > 0 and data:
         df = pd.DataFrame(data)
-        df.drop(columns=["Row"], inplace=True)
+        df = df[df[_DELETE_COLUMN] == _DELETE_FALSE]
+        df.drop(columns=["Row", _DELETE_COLUMN], inplace=True)
         now = datetime.now()
         datetime_str = now.strftime("%Y-%m-%d_%H-%M-%S")
         filename = f"import_{table_name}_{datetime_str}.csv"
@@ -1025,6 +1145,15 @@ def commit_to_database(
         projectObj.database.set_write_policy(
             WritePolicy.UPSERT if update_existing_records else WritePolicy.APPEND
         )
+        # Remove _delete rows and ['Row', '_delete'] columns before committing
+        for i, table in enumerate(datasets):
+            datasets[i] = [
+                row for row in table if row[_DELETE_COLUMN] == _DELETE_FALSE
+            ]
+            datasets[i] = [
+                {k: v for k, v in row.items() if k not in ["Row", _DELETE_COLUMN]}
+                for row in datasets[i]
+            ]
         projectObj.database.commit_tables_dict(table_names, datasets)
         return dbc.Alert("Data committed to database.", color="success"), True
     except Exception as e:
diff --git a/src/InsightBoard/utils.py b/src/InsightBoard/utils.py
index ba9fa84..38da073 100644
--- a/src/InsightBoard/utils.py
+++ b/src/InsightBoard/utils.py
@@ -55,12 +55,28 @@ def validate_row_jsonschema(row_number, row, schema):
     return list(validator.iter_errors(row))
 
 
-def ensure_schema_ordering(columns, project, table):
+def ensure_schema_ordering(columns, project, table, prepend=None, append=None):
+    if not prepend:
+        prepend = []
+    if not append:
+        append = []
     try:
         projectObj = get_project(project)
         schema = projectObj.database.get_table_schema(table)
         schema_order = list(schema["properties"].keys())
+        # Add columns in prepend to the beginning
+        for col in reversed(prepend):
+            if col["id"] not in schema_order:
+                schema_order.insert(0, col["id"])
+        # Add any remaining columns that are not in the schema to the end
+        for col in columns:
+            if col["id"] not in schema_order and col["id"] not in [c["id"] for c in append]:
+                schema_order.append(col["id"])
+        # Add columns in append to the end
+        for col in append:
+            if col["id"] not in schema_order:
+                schema_order.append(col["id"])
         columns = sorted(columns, key=lambda x: schema_order.index(x["id"]))
     except Exception as e:
-        logging.info(f"Error in ensure_schema_ordering: {str(e)}")
+        logging.debug(f"Error in ensure_schema_ordering: {str(e)}")
     return columns
diff --git a/tests/unit/pages/test_upload.py b/tests/unit/pages/test_upload.py
index 5560bd0..dde0aba 100644
--- a/tests/unit/pages/test_upload.py
+++ b/tests/unit/pages/test_upload.py
@@ -53,6 +53,11 @@ def test_update_table():
     row4 = {"col1": "10", "col2": "11", "col3": "12"}
     table2 = [row3, row4]
     expected_columns = [
+        {
+            "editable": False,
+            "id": "_delete",
+            "name": "_delete",
+        },
         {
             "editable": False,
             "id": "Row",
@@ -82,6 +87,10 @@ def test_update_table():
     parsed_datasets = edited_datasets
     only_show_validation_errors = False
     update_existing_records = True
+    remove_empty_ids_n_clicks = None
+    remove_error_rows_n_clicks = None
+    restore_deleted_rows_n_clicks = None
+    active_cell = None
     errors = []
 
     # Mock clean_dataset
@@ -90,14 +99,26 @@ def _clean_dataset(data, *args, **kwargs):
 
     # Return table1
     selected_table = "table1"
-    with patch("InsightBoard.pages.upload.clean_dataset") as mock_clean_dataset:
patch("InsightBoard.pages.upload.clean_dataset") as mock_clean_dataset: + with ( + patch("InsightBoard.pages.upload.clean_dataset") as mock_clean_dataset, + patch("InsightBoard.pages.upload.dash.callback_context") as mock_ctx, + patch("InsightBoard.pages.upload.ctx_trigger") as mock_ctx_trigger, + patch("InsightBoard.utils.get_project") as mock_get_project, + ): mock_clean_dataset.side_effect = _clean_dataset - columns, data = update_table( + mock_ctx.triggered = [] + mock_ctx_trigger.return_value = False + mock_get_project.database.get_primary_key.return_value = "col1" + columns, hidden_columns, data, new_active_cell, data_stats = update_table( options, selected_table, unique_table_id, only_show_validation_errors, update_existing_records, + remove_empty_ids_n_clicks, + remove_error_rows_n_clicks, + restore_deleted_rows_n_clicks, + active_cell, project, edited_datasets, parsed_datasets, @@ -113,14 +134,26 @@ def _clean_dataset(data, *args, **kwargs): # Return table2 selected_table = "table2" - with patch("InsightBoard.pages.upload.clean_dataset") as mock_clean_dataset: + with ( + patch("InsightBoard.pages.upload.clean_dataset") as mock_clean_dataset, + patch("InsightBoard.pages.upload.dash.callback_context") as mock_ctx, + patch("InsightBoard.pages.upload.ctx_trigger") as mock_ctx_trigger, + patch("InsightBoard.utils.get_project") as mock_get_project, + ): mock_clean_dataset.side_effect = _clean_dataset - columns, data = update_table( + mock_ctx.triggered = [] + mock_ctx_trigger.return_value = False + mock_get_project.database.get_primary_key.return_value = "col1" + columns, hidden_columns, data, new_active_cell, data_stats = update_table( options, selected_table, unique_table_id, only_show_validation_errors, update_existing_records, + remove_empty_ids_n_clicks, + remove_error_rows_n_clicks, + restore_deleted_rows_n_clicks, + active_cell, project, edited_datasets, parsed_datasets, diff --git a/uv.lock b/uv.lock index 0f04e1f..4a53dd8 100644 --- a/uv.lock +++ b/uv.lock @@ -23,6 +23,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bb/2a/10164ed1f31196a2f7f3799368a821765c62851ead0e630ab52b8e14b4d0/blinker-1.8.2-py3-none-any.whl", hash = "sha256:1779309f71bf239144b9399d06ae925637cf6634cf6bd131104184531bf67c01", size = 9456 }, ] +[[package]] +name = "cachetools" +version = "5.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/38/a0f315319737ecf45b4319a8cd1f3a908e29d9277b46942263292115eee7/cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a", size = 27661 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/07/14f8ad37f2d12a5ce41206c21820d8cb6561b728e51fad4530dff0552a67/cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292", size = 9524 }, +] + [[package]] name = "certifi" version = "2024.8.30" @@ -305,9 +314,10 @@ wheels = [ [[package]] name = "insightboard" -version = "0.0.1.dev67+g27f9806.d20241010" +version = "0.0.1.dev72+gf891980.d20241011" source = { editable = "." 
} dependencies = [ + { name = "cachetools" }, { name = "dash" }, { name = "dash-bootstrap-components" }, { name = "dash-dangerously-set-inner-html" }, @@ -330,6 +340,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "cachetools", specifier = ">=5.5.0" }, { name = "dash", specifier = ">=2.18.1" }, { name = "dash-bootstrap-components", specifier = ">=1.6.0" }, { name = "dash-dangerously-set-inner-html", specifier = ">=0.0.2" },