Skip to content

Commit b11a5ef

Browse files
authored
Add download cli command to download file from a collection of workspaces (#2523)
## Changes Add a `download` cli command to download file from a collection of workspaces. ### Linked issues Progresses #1783 ### Functionality - [x] added relevant user documentation - [x] added new CLI command: `download` ### Tests - [x] manually tested - [x] added unit tests - [ ] ~added integration tests~ : Covering after #2507
1 parent ad2d007 commit b11a5ef

File tree

4 files changed

+137
-3
lines changed

4 files changed

+137
-3
lines changed

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ See [contributing instructions](CONTRIBUTING.md) to help improve this project.
101101
* [`cluster-remap` command](#cluster-remap-command)
102102
* [`revert-cluster-remap` command](#revert-cluster-remap-command)
103103
* [`upload` command](#upload-command)
104+
* [`download` command](#download-command)
104105
* [Common Challenges and the Solutions](#common-challenges-and-the-solutions)
105106
* [Network Connectivity Issues](#network-connectivity-issues)
106107
* [Insufficient Privileges](#insufficient-privileges)
@@ -1421,6 +1422,16 @@ $ databricks labs ucx upload --file <file_path> --run-as-collection True
14211422
Upload a file to a single workspace (`--run-as-collection False`) or a collection of workspaces
14221423
(`--run-as-collection True`). This command is especially useful when uploading the same file to multiple workspaces.
14231424

1425+
## `download` command
1426+
1427+
```text
1428+
$ databricks labs ucx download --file <file_path> --run-as-collection True
1429+
21:31:29 INFO [d.labs.ucx] Finished downloading <file_path>
1430+
```
1431+
1432+
Download a CSV file from a single workspace (`--run-as-collection False`) or a collection of workspaces
1433+
(`--run-as-collection True`). This command is especially useful when downloading the same file from multiple workspaces: the downloaded files are merged into a single CSV file, so they must all share the same header.
1434+
14241435
# Common Challenges and the Solutions
14251436
Users might encounter some challenges while installing and executing UCX. Please find the listing of some common challenges and the solutions below.
14261437

labs.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,3 +272,11 @@ commands:
272272
description: The file to upload
273273
- name: run-as-collection
274274
description: Run the command for the collection of workspace with ucx installed. Default is False.
275+
276+
- name: download
277+
description: download file from all workspaces in the account where ucx is installed
278+
flags:
279+
- name: file
280+
description: The file to download
281+
- name: run-as-collection
282+
description: Run the command for the collection of workspace with ucx installed. Default is False.

src/databricks/labs/ucx/cli.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from io import BytesIO
12
import json
23
import webbrowser
34
from pathlib import Path
@@ -8,6 +9,7 @@
89
from databricks.labs.blueprint.tui import Prompts
910
from databricks.sdk import AccountClient, WorkspaceClient
1011
from databricks.sdk.errors import NotFound
12+
from databricks.sdk.service.workspace import ExportFormat
1113
from databricks.labs.ucx.__about__ import __version__
1214

1315
from databricks.labs.ucx.config import WorkspaceConfig
@@ -550,6 +552,48 @@ def upload(
550552
logger.info(f"Finished uploading {file}")
551553

552554

555+
@ucx.command
def download(
    file: Path | str,
    w: WorkspaceClient,
    run_as_collection: bool = False,
    a: AccountClient | None = None,  # Only used while testing
):
    """Download and merge a CSV file from the ucx installation in a (collection of) workspace(s).

    The file named ``file.name`` is looked up in the installation folder of every workspace context
    and the results are concatenated into the local ``file``: the header is written once, followed
    by the rows of each workspace. Workspaces where the file is missing or empty are skipped with a
    warning. When nothing could be downloaded, the (empty) local file is removed again.

    Args:
        file: Local destination path; its name is also the file name looked up remotely. Must have
            the ``.csv`` suffix.
        w: Workspace client used to resolve the workspace context(s).
        run_as_collection: Forwarded to ``_get_workspace_contexts`` to select a single workspace or
            a collection of workspaces.
        a: Account client, only used while testing.

    Raises:
        ValueError: If ``file`` is not a CSV file, or if the downloaded files have different headers.
    """
    file = Path(file)
    if file.suffix != ".csv":
        raise ValueError("Command only supported for CSV files")
    contexts = _get_workspace_contexts(w, run_as_collection=run_as_collection, a=a)
    csv_header = None
    with file.open("wb") as output:
        for ctx in contexts:
            remote_file_name = f"{ctx.installation.install_folder()}/{file.name}"
            try:
                # Installation does not have a download method
                data = ctx.workspace_client.workspace.download(remote_file_name, format=ExportFormat.AUTO).read()
            except NotFound:
                logger.warning(f"File not found for {ctx.workspace_client.config.host}: {remote_file_name}")
                continue
            # Drop trailing line breaks so consecutive files are joined by exactly one newline
            content = data.rstrip(b"\r\n")
            if not content:
                # An empty file has no header and no rows, so it cannot contribute to the merge
                logger.warning(f"Empty file for {ctx.workspace_client.config.host}: {remote_file_name}")
                continue
            raw_header, _, rows = content.partition(b"\n")
            # Compare headers without their line ending: a header-only file and a file with rows
            # would otherwise be reported as having different headers
            csv_header_next = raw_header.rstrip(b"\r")
            if csv_header is None:
                csv_header = csv_header_next
                output.write(raw_header)
            elif csv_header != csv_header_next:
                raise ValueError("CSV files have different headers")
            if rows:
                # Only emit the separator when rows follow, to avoid blank lines in the merged file
                output.write(b"\n")
                output.write(rows)
    if csv_header is None:
        logger.warning("No file(s) to download found")
        if file.is_file() and file.stat().st_size == 0:
            file.unlink()
    else:
        logger.info(f"Finished downloading {file}")
595+
596+
553597
@ucx.command
554598
def lint_local_code(
555599
w: WorkspaceClient, prompts: Prompts, path: str | None = None, ctx: LocalCheckoutContext | None = None

tests/unit/test_cli.py

Lines changed: 74 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import io
22
import json
3+
import logging
34
import time
45
from pathlib import Path
56
from unittest.mock import create_autospec, patch, Mock
@@ -16,7 +17,7 @@
1617
from databricks.sdk.service.iam import ComplexValue, User
1718
from databricks.sdk.service.jobs import Run, RunResultState, RunState
1819
from databricks.sdk.service.provisioning import Workspace
19-
from databricks.sdk.service.workspace import ImportFormat, ObjectInfo, ObjectType
20+
from databricks.sdk.service.workspace import ExportFormat, ImportFormat, ObjectInfo, ObjectType
2021

2122
from databricks.labs.ucx.assessment.aws import AWSResources, AWSRoleAction
2223
from databricks.labs.ucx.aws.access import AWSResourcePermissions
@@ -31,6 +32,7 @@
3132
create_missing_principals,
3233
create_table_mapping,
3334
create_uber_principal,
35+
download,
3436
ensure_assessment_run,
3537
installations,
3638
join_collection,
@@ -106,7 +108,7 @@ def create_workspace_client_mock(workspace_id: int) -> WorkspaceClient:
106108
""",
107109
}
108110

109-
def download(path: str) -> io.StringIO | io.BytesIO:
111+
def mock_download(path: str, **_) -> io.StringIO | io.BytesIO:
110112
if path not in state:
111113
raise NotFound(path)
112114
if ".csv" in path or ".log" in path:
@@ -117,7 +119,7 @@ def download(path: str) -> io.StringIO | io.BytesIO:
117119
workspace_client.get_workspace_id.return_value = workspace_id
118120
workspace_client.config.host = 'https://localhost'
119121
workspace_client.current_user.me.return_value = User(user_name="foo", groups=[ComplexValue(display="admins")])
120-
workspace_client.workspace.download = download
122+
workspace_client.workspace.download.side_effect = mock_download
121123
workspace_client.statement_execution.execute_statement.return_value = sql.StatementResponse(
122124
status=sql.StatementStatus(state=sql.StatementState.SUCCEEDED),
123125
manifest=sql.ResultManifest(schema=sql.ResultSchema()),
@@ -798,3 +800,72 @@ def test_join_collection():
798800
w.workspace.download.return_value = io.StringIO(json.dumps([{"workspace_id": 123, "workspace_name": "some"}]))
799801
join_collection(a, "123")
800802
w.workspace.download.assert_not_called()
803+
804+
805+
def test_download_raises_value_error_if_not_downloading_a_csv(ws1):
    """The download command rejects files that do not have the `.csv` suffix."""
    # `match` checks the exception message itself, rather than the string form of the
    # `ExceptionInfo` wrapper that `pytest.raises` yields
    with pytest.raises(ValueError, match="Command only supported for CSV files"):
        download(Path("test.txt"), ws1)
809+
810+
811+
@pytest.mark.parametrize("run_as_collection", [False, True])
def test_download_calls_workspace_download(tmp_path, workspace_clients, acc_client, run_as_collection):
    """Every workspace in scope is asked for the file in its ucx installation folder."""
    primary_client = workspace_clients[0]
    # Without the collection flag only the first workspace is expected to be queried
    expected_clients = workspace_clients if run_as_collection else [primary_client]

    download(tmp_path / "test.csv", primary_client, run_as_collection=run_as_collection, a=acc_client)

    for client in expected_clients:
        client.workspace.download.assert_called_with("/Users/foo/.ucx/test.csv", format=ExportFormat.AUTO)
828+
829+
830+
def test_download_warns_if_file_not_found(caplog, tmp_path, ws1, acc_client):
    """A missing remote file is reported per workspace, followed by an overall warning."""
    ws1.workspace.download.side_effect = NotFound("test.csv")
    # Write below `tmp_path`: the command opens the local file for writing, so a relative path
    # would create (and truncate) a file in the current working directory
    with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.cli"):
        download(
            tmp_path / "test.csv",
            ws1,
            run_as_collection=False,
            a=acc_client,
        )
    assert "File not found for https://localhost: /Users/foo/.ucx/test.csv" in caplog.messages
    assert "No file(s) to download found" in caplog.messages
841+
842+
843+
def test_download_deletes_empty_file(tmp_path, ws1, acc_client):
    """No local file is left behind when the remote file cannot be found."""
    local_target = tmp_path / "mapping.csv"
    ws1.workspace.download.side_effect = NotFound("test.csv")

    download(local_target, ws1, run_as_collection=False, a=acc_client)

    assert not local_target.is_file()
853+
854+
855+
def test_download_has_expected_content(tmp_path, workspace_clients, acc_client):
    """The merged file holds the header once, followed by one row per workspace."""
    local_target = tmp_path / "mapping.csv"
    expected_parts = (
        "workspace_name,catalog_name,src_schema,dst_schema,src_table,dst_table",
        "\ntest,test,test,test,test,test",
        "\ntest,test,test,test,test,test",
    )

    download(local_target, workspace_clients[0], run_as_collection=True, a=acc_client)

    assert local_target.read_text() == "".join(expected_parts)

0 commit comments

Comments
 (0)