diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
index 48b4e30..025aa3a 100644
--- a/src/databricks/labs/blueprint/paths.py
+++ b/src/databricks/labs/blueprint/paths.py
@@ -2,6 +2,7 @@
 
 import abc
 import builtins
+import codecs
 import fnmatch
 import io
 import locale
@@ -789,19 +790,31 @@ def open(
         newline: str | None = None,
     ):
         """Open a file in Databricks Workspace. Only text and binary modes are supported."""
-        if encoding is None or encoding == "locale":
-            encoding = locale.getpreferredencoding(False)
         if "b" in mode and "r" in mode:
             return self._ws.workspace.download(self.as_posix(), format=ExportFormat.AUTO)
         if "b" in mode and "w" in mode:
             return _BinaryUploadIO(self._ws, self.as_posix())
         if "r" in mode:
             with self._ws.workspace.download(self.as_posix(), format=ExportFormat.AUTO) as f:
-                return StringIO(f.read().decode(encoding))
+                data = f.read()
+                if encoding is None:
+                    if data.startswith(codecs.BOM_UTF32_LE) or data.startswith(codecs.BOM_UTF32_BE):
+                        encoding = 'utf-32'
+                    elif data.startswith(codecs.BOM_UTF16_LE) or data.startswith(codecs.BOM_UTF16_BE):
+                        encoding = 'utf-16'
+                    elif data.startswith(codecs.BOM_UTF8):
+                        encoding = 'utf-8-sig'
+                if encoding is None or encoding == "locale":
+                    encoding = locale.getpreferredencoding(False)
+                return StringIO(data.decode(encoding))
         if "w" in mode:
             return _TextUploadIO(self._ws, self.as_posix())
         raise ValueError(f"invalid mode: {mode}")
 
+    def read_text(self, encoding=None, errors=None):
+        with self.open(mode='r', encoding=encoding, errors=errors) as f:
+            return f.read()
+
     @property
     def suffix(self) -> str:
         """Return the file extension. If the file is a notebook, return the suffix based on the language."""
diff --git a/tests/integration/test_paths.py b/tests/integration/test_paths.py
index 0b6b2c3..b858149 100644
--- a/tests/integration/test_paths.py
+++ b/tests/integration/test_paths.py
@@ -1,3 +1,4 @@
+import codecs
 from pathlib import Path
 
 import pytest
@@ -205,3 +206,24 @@ def test_file_and_notebook_in_same_folder_with_different_suffixes(ws, make_noteb
     assert files["a.txt"].suffix == ".txt"
     assert files["b"].suffix == ".py"  # suffix is determined from ObjectInfo
     assert files["b"].read_text() == "# Databricks notebook source\ndisplay(spark.range(10))"
+
+
+@pytest.mark.parametrize(
+    "bom, encoding",
+    [
+        (codecs.BOM_UTF8, "utf-8"),
+        (codecs.BOM_UTF16_LE, "utf-16-le"),
+        (codecs.BOM_UTF16_BE, "utf-16-be"),
+        (codecs.BOM_UTF32_LE, "utf-32-le"),
+        (codecs.BOM_UTF32_BE, "utf-32-be"),
+    ],
+)
+def test_correctly_encodes_and_decodes_file_with_bom(bom, encoding, ws, make_directory):
+    # Can't test notebooks because the server changes the uploaded data
+    folder = WorkspacePath(ws, make_directory())
+    file_path = folder / f"some_file_{encoding}.py"
+    data = bom + "a = 12".encode(encoding)
+    file_path.write_bytes(data)
+    text = file_path.read_text()
+    assert text == "a = 12"
+