Skip to content

Commit

Permalink
Support files with a Unicode BOM
Browse files Browse the repository at this point in the history
  • Loading branch information
ericvergnaud committed Jul 31, 2024
1 parent 98e75bc commit 2cb54c0
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 3 deletions.
19 changes: 16 additions & 3 deletions src/databricks/labs/blueprint/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import abc
import builtins
import codecs
import fnmatch
import io
import locale
Expand Down Expand Up @@ -789,19 +790,31 @@ def open(
newline: str | None = None,
):
"""Open a file in Databricks Workspace. Only text and binary modes are supported."""
if encoding is None or encoding == "locale":
encoding = locale.getpreferredencoding(False)
if "b" in mode and "r" in mode:
return self._ws.workspace.download(self.as_posix(), format=ExportFormat.AUTO)
if "b" in mode and "w" in mode:
return _BinaryUploadIO(self._ws, self.as_posix())
if "r" in mode:
with self._ws.workspace.download(self.as_posix(), format=ExportFormat.AUTO) as f:
return StringIO(f.read().decode(encoding))
data = f.read()
if encoding is None:
if data.startswith(codecs.BOM_UTF32_LE) or data.startswith(codecs.BOM_UTF32_BE):
encoding = 'utf-32'
elif data.startswith(codecs.BOM_UTF16_LE) or data.startswith(codecs.BOM_UTF16_BE):
encoding = 'utf-16'
elif data.startswith(codecs.BOM_UTF8):
encoding = 'utf-8-sig'
if encoding is None or encoding == "locale":
encoding = locale.getpreferredencoding(False)
return StringIO(data.decode(encoding))
if "w" in mode:
return _TextUploadIO(self._ws, self.as_posix())
raise ValueError(f"invalid mode: {mode}")

def read_text(self, encoding=None, errors=None):
with self.open(mode='r', encoding=encoding, errors=errors) as f:
return f.read()

@property
def suffix(self) -> str:
"""Return the file extension. If the file is a notebook, return the suffix based on the language."""
Expand Down
22 changes: 22 additions & 0 deletions tests/integration/test_paths.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import codecs
from pathlib import Path

import pytest
Expand Down Expand Up @@ -205,3 +206,24 @@ def test_file_and_notebook_in_same_folder_with_different_suffixes(ws, make_noteb
assert files["a.txt"].suffix == ".txt"
assert files["b"].suffix == ".py" # suffix is determined from ObjectInfo
assert files["b"].read_text() == "# Databricks notebook source\ndisplay(spark.range(10))"


@pytest.mark.parametrize(
    "bom, encoding",
    [
        (codecs.BOM_UTF8, "utf-8"),
        (codecs.BOM_UTF16_LE, "utf-16-le"),
        (codecs.BOM_UTF16_BE, "utf-16-be"),
        (codecs.BOM_UTF32_LE, "utf-32-le"),
        (codecs.BOM_UTF32_BE, "utf-32-be"),
    ],
)
def test_correctly_encodes_and_decodes_file_with_bom(bom, encoding, ws, make_directory):
    """Check that a workspace file starting with a Unicode BOM reads back as clean text.

    For each (BOM, codec) pair the test uploads ``BOM + encoded("a = 12")`` as
    raw bytes and expects ``read_text()``, called without an explicit
    encoding, to return exactly ``"a = 12"`` with no BOM character left over.
    """
    # Can't test notebooks because the server changes the uploaded data
    folder = WorkspacePath(ws, make_directory())
    file_path = folder / f"some_file_{encoding}.py"
    # These codecs do not write a BOM when encoding, so it is prepended
    # explicitly to build the on-disk representation under test.
    data = bom + "a = 12".encode(encoding)
    file_path.write_bytes(data)
    # No encoding argument: the reader has to infer the codec from the BOM.
    text = file_path.read_text()
    assert text == "a = 12"

0 comments on commit 2cb54c0

Please sign in to comment.