From 9114c1afadc05db4b9131041aa07640637f56fee Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Sep 2024 11:15:00 +0200 Subject: [PATCH 1/7] Add `run-as-collection` flag to `migrate-tables` command --- labs.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/labs.yml b/labs.yml index 2ee694da06..33ecd837ef 100644 --- a/labs.yml +++ b/labs.yml @@ -245,8 +245,11 @@ commands: - name: migrate-tables description: | - Trigger the migrate-tables workflow and, optionally, migrate-external-hiveserde-tables-in-place-experimental - workflow and migrate-external-tables-ctas workflow. + Trigger the `migrate-tables` workflow and, optionally, `migrate-external-hiveserde-tables-in-place-experimental` + workflow and `migrate-external-tables-ctas workflow`. + flags: + - name: run-as-collection + description: Run the command for the collection of workspaces with ucx installed. Default is False. - name: migrate-acls description: | From 0e50be2b6d407f6330b3ae80f27ac495e1edadc8 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Sep 2024 11:17:56 +0200 Subject: [PATCH 2/7] Let `migrate-tables` command run on a collection --- src/databricks/labs/ucx/cli.py | 60 ++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index e5b97a4ceb..18073f4047 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -510,34 +510,46 @@ def assign_metastore( @ucx.command -def migrate_tables(w: WorkspaceClient, prompts: Prompts, *, ctx: WorkspaceContext | None = None): +def migrate_tables( + w: WorkspaceClient, + prompts: Prompts, + *, + ctx: WorkspaceContext | None = None, + run_as_collection: bool = False, + a: AccountClient | None = None, +) -> None: """ Trigger the migrate-tables workflow and, optionally, the migrate-external-hiveserde-tables-in-place-experimental workflow and migrate-external-tables-ctas. """ - if ctx is None: - ctx = WorkspaceContext(w) - deployed_workflows = ctx.deployed_workflows - deployed_workflows.run_workflow("migrate-tables") - - tables = ctx.tables_crawler.snapshot() - hiveserde_tables = [table for table in tables if table.what == What.EXTERNAL_HIVESERDE] - if len(hiveserde_tables) > 0: - percentage_hiveserde_tables = len(hiveserde_tables) / len(tables) * 100 - if prompts.confirm( - f"Found {len(hiveserde_tables)} ({percentage_hiveserde_tables:.2f}%) hiveserde tables, do you want to run " - f"the migrate-external-hiveserde-tables-in-place-experimental workflow?" - ): - deployed_workflows.run_workflow("migrate-external-hiveserde-tables-in-place-experimental") - - external_ctas_tables = [table for table in tables if table.what == What.EXTERNAL_NO_SYNC] - if len(external_ctas_tables) > 0: - percentage_external_ctas_tables = len(external_ctas_tables) / len(tables) * 100 - if prompts.confirm( - f"Found {len(external_ctas_tables)} ({percentage_external_ctas_tables:.2f}%) external tables which cannot be migrated using sync" - f", do you want to run the migrate-external-tables-ctas workflow?" - ): - deployed_workflows.run_workflow("migrate-external-tables-ctas") + if ctx: + workspace_contexts = [ctx] + else: + workspace_contexts = _get_workspace_contexts(w, a, run_as_collection) + for workspace_context in workspace_contexts: + deployed_workflows = workspace_context.deployed_workflows + deployed_workflows.run_workflow("migrate-tables") + + tables = workspace_context.tables_crawler.snapshot() + hiveserde_tables = [table for table in tables if table.what == What.EXTERNAL_HIVESERDE] + if len(hiveserde_tables) > 0: + percentage_hiveserde_tables = len(hiveserde_tables) / len(tables) * 100 + if prompts.confirm( + f"Found {len(hiveserde_tables)} ({percentage_hiveserde_tables:.2f}%) hiveserde tables in " + f"{ctx.workspace_client.config.host}, do you want to run " + f"the `migrate-external-hiveserde-tables-in-place-experimental` workflow?" + ): + deployed_workflows.run_workflow("migrate-external-hiveserde-tables-in-place-experimental") + + external_ctas_tables = [table for table in tables if table.what == What.EXTERNAL_NO_SYNC] + if len(external_ctas_tables) > 0: + percentage_external_ctas_tables = len(external_ctas_tables) / len(tables) * 100 + if prompts.confirm( + f"Found {len(external_ctas_tables)} ({percentage_external_ctas_tables:.2f}%) external tables which " + f"cannot be migrated using sync in {ctx.workspace_client.config.host}, do you want to run the " + "`migrate-external-tables-ctas workflow`?" + ): + deployed_workflows.run_workflow("migrate-external-tables-ctas") @ucx.command From 06cb1fa1b0a7b8334bb1d607ebbab41769b28851 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Sep 2024 11:20:40 +0200 Subject: [PATCH 3/7] Type hint and format migrate tables tests --- tests/unit/test_cli.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 3a687eb692..65ad66a020 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -706,25 +706,37 @@ def test_assign_metastore(acc_client, caplog): assign_metastore(acc_client, "123") -def test_migrate_tables(ws): +def test_migrate_tables_calls_migrate_table_job_run_now(ws) -> None: ws.jobs.wait_get_run_job_terminated_or_skipped.return_value = Run( - state=RunState(result_state=RunResultState.SUCCESS), start_time=0, end_time=1000, run_duration=1000 + state=RunState(result_state=RunResultState.SUCCESS), + start_time=0, + end_time=1000, + run_duration=1000, ) prompts = MockPrompts({}) + migrate_tables(ws, prompts) + ws.jobs.run_now.assert_called_with(456) ws.jobs.wait_get_run_job_terminated_or_skipped.assert_called_once() -def test_migrate_external_hiveserde_tables_in_place(ws): +def test_migrate_tables_calls_external_hiveserde_tables_job_run_now(ws) -> None: tables_crawler = create_autospec(TablesCrawler) table = Table( - catalog="hive_metastore", database="test", name="hiveserde", object_type="UNKNOWN", table_format="HIVE" + catalog="hive_metastore", + database="test", + name="hiveserde", + object_type="UNKNOWN", + table_format="HIVE", ) tables_crawler.snapshot.return_value = [table] ctx = WorkspaceContext(ws).replace(tables_crawler=tables_crawler) ws.jobs.wait_get_run_job_terminated_or_skipped.return_value = Run( - state=RunState(result_state=RunResultState.SUCCESS), start_time=0, end_time=1000, run_duration=1000 + state=RunState(result_state=RunResultState.SUCCESS), + start_time=0, + end_time=1000, + run_duration=1000, ) prompt = ( @@ -739,15 +751,22 @@ def test_migrate_external_hiveserde_tables_in_place(ws): ws.jobs.wait_get_run_job_terminated_or_skipped.call_count = 2 -def test_migrate_external_tables_ctas(ws): +def test_migrate_tables_calls_external_tables_ctas_job_run_now(ws) -> None: tables_crawler = create_autospec(TablesCrawler) table = Table( - catalog="hive_metastore", database="test", name="externalctas", object_type="UNKNOWN", table_format="EXTERNAL" + catalog="hive_metastore", + database="test", + name="externalctas", + object_type="UNKNOWN", + table_format="EXTERNAL", ) tables_crawler.snapshot.return_value = [table] ctx = WorkspaceContext(ws).replace(tables_crawler=tables_crawler) ws.jobs.wait_get_run_job_terminated_or_skipped.return_value = Run( - state=RunState(result_state=RunResultState.SUCCESS), start_time=0, end_time=1000, run_duration=1000 + state=RunState(result_state=RunResultState.SUCCESS), + start_time=0, + end_time=1000, + run_duration=1000, ) prompt = ( From 5a484b01468d16a6a6db662c0337ce183b470db8 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Sep 2024 11:24:00 +0200 Subject: [PATCH 4/7] Test migrate tables on a collection --- tests/unit/test_cli.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 65ad66a020..9ec7b77e8d 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -706,19 +706,28 @@ def test_assign_metastore(acc_client, caplog): assign_metastore(acc_client, "123") -def test_migrate_tables_calls_migrate_table_job_run_now(ws) -> None: - ws.jobs.wait_get_run_job_terminated_or_skipped.return_value = Run( +@pytest.mark.parametrize("run_as_collection", [False, True]) +def test_migrate_tables_calls_migrate_table_job_run_now( + run_as_collection, + workspace_clients, + acc_client, +) -> None: + if not run_as_collection: + workspace_clients = [workspace_clients[0]] + run = Run( state=RunState(result_state=RunResultState.SUCCESS), start_time=0, end_time=1000, run_duration=1000, ) - prompts = MockPrompts({}) + for workspace_client in workspace_clients: + workspace_client.jobs.wait_get_run_job_terminated_or_skipped.return_value = run - migrate_tables(ws, prompts) + migrate_tables(workspace_clients[0], MockPrompts({}), run_as_collection=run_as_collection, a=acc_client) - ws.jobs.run_now.assert_called_with(456) - ws.jobs.wait_get_run_job_terminated_or_skipped.assert_called_once() + for workspace_client in workspace_clients: + workspace_client.jobs.run_now.assert_called_with(456) + workspace_client.jobs.wait_get_run_job_terminated_or_skipped.assert_called_once() def test_migrate_tables_calls_external_hiveserde_tables_job_run_now(ws) -> None: From 4aded2694f221f51cac0eae666d005227428f14c Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Sep 2024 11:24:51 +0200 Subject: [PATCH 5/7] Fix reference to workspace context --- src/databricks/labs/ucx/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index 18073f4047..92ce46a49a 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -536,7 +536,7 @@ def migrate_tables( percentage_hiveserde_tables = len(hiveserde_tables) / len(tables) * 100 if prompts.confirm( f"Found {len(hiveserde_tables)} ({percentage_hiveserde_tables:.2f}%) hiveserde tables in " - f"{ctx.workspace_client.config.host}, do you want to run " + f"{workspace_context.workspace_client.config.host}, do you want to run " f"the `migrate-external-hiveserde-tables-in-place-experimental` workflow?" ): deployed_workflows.run_workflow("migrate-external-hiveserde-tables-in-place-experimental") @@ -546,8 +546,8 @@ def migrate_tables( percentage_external_ctas_tables = len(external_ctas_tables) / len(tables) * 100 if prompts.confirm( f"Found {len(external_ctas_tables)} ({percentage_external_ctas_tables:.2f}%) external tables which " - f"cannot be migrated using sync in {ctx.workspace_client.config.host}, do you want to run the " - "`migrate-external-tables-ctas workflow`?" + f"cannot be migrated using sync in {workspace_context.workspace_client.config.host}, do you want to " + "run the `migrate-external-tables-ctas workflow`?" ): deployed_workflows.run_workflow("migrate-external-tables-ctas") From cf22438ac5bc6229566ab21495363864ce9a3d83 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Sep 2024 11:27:23 +0200 Subject: [PATCH 6/7] Add TODO on migrate tables run as collection test --- tests/unit/test_cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 9ec7b77e8d..6396d82122 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -731,6 +731,7 @@ def test_migrate_tables_calls_migrate_table_job_run_now( def test_migrate_tables_calls_external_hiveserde_tables_job_run_now(ws) -> None: + # TODO: Test for running on a collection when context injection for multiple workspaces is supported. tables_crawler = create_autospec(TablesCrawler) table = Table( catalog="hive_metastore", @@ -761,6 +762,7 @@ def test_migrate_tables_calls_external_hiveserde_tables_job_run_now(ws) -> None: def test_migrate_tables_calls_external_tables_ctas_job_run_now(ws) -> None: + # TODO: Test for running on a collection when context injection for multiple workspaces is supported. tables_crawler = create_autospec(TablesCrawler) table = Table( catalog="hive_metastore", From 0a94d02819859e06d41288d77b838595f14f1aab Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 17 Sep 2024 11:33:35 +0200 Subject: [PATCH 7/7] Fix (mock) prompts --- src/databricks/labs/ucx/cli.py | 2 +- tests/unit/test_cli.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/ucx/cli.py b/src/databricks/labs/ucx/cli.py index 92ce46a49a..20ecf133d4 100644 --- a/src/databricks/labs/ucx/cli.py +++ b/src/databricks/labs/ucx/cli.py @@ -547,7 +547,7 @@ def migrate_tables( if prompts.confirm( f"Found {len(external_ctas_tables)} ({percentage_external_ctas_tables:.2f}%) external tables which " f"cannot be migrated using sync in {workspace_context.workspace_client.config.host}, do you want to " - "run the `migrate-external-tables-ctas workflow`?" + "run the `migrate-external-tables-ctas` workflow?" ): deployed_workflows.run_workflow("migrate-external-tables-ctas") diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 6396d82122..9adf7e22cd 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -750,8 +750,8 @@ def test_migrate_tables_calls_external_hiveserde_tables_job_run_now(ws) -> None: ) prompt = ( - "Found 1 (.*) hiveserde tables, do you want to run the " - "migrate-external-hiveserde-tables-in-place-experimental workflow?" + "Found 1 (.*) hiveserde tables in https://localhost, do you want to run the " + "`migrate-external-hiveserde-tables-in-place-experimental` workflow?" ) prompts = MockPrompts({prompt: "Yes"}) @@ -781,8 +781,8 @@ def test_migrate_tables_calls_external_tables_ctas_job_run_now(ws) -> None: ) prompt = ( - "Found 1 (.*) external tables which cannot be migrated using sync, do you want to run the " - "migrate-external-tables-ctas workflow?" + "Found 1 (.*) external tables which cannot be migrated using sync in https://localhost, do you want to run the " + "`migrate-external-tables-ctas` workflow?" ) prompts = MockPrompts({prompt: "Yes"})