Support files with a Unicode BOM

databrickslabs · Jul 31, 2024 · 2cb54c0 · 2cb54c0
1 parent 98e75bc
commit 2cb54c0
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 3 deletions.
diff --git a/src/databricks/labs/blueprint/paths.py b/src/databricks/labs/blueprint/paths.py
@@ -2,6 +2,7 @@
 
 import abc
 import builtins
+import codecs
 import fnmatch
 import io
 import locale
@@ -789,19 +790,31 @@ def open(
         newline: str | None = None,
     ):
         """Open a file in Databricks Workspace. Only text and binary modes are supported."""
-        if encoding is None or encoding == "locale":
-            encoding = locale.getpreferredencoding(False)
         if "b" in mode and "r" in mode:
             return self._ws.workspace.download(self.as_posix(), format=ExportFormat.AUTO)
         if "b" in mode and "w" in mode:
             return _BinaryUploadIO(self._ws, self.as_posix())
         if "r" in mode:
             with self._ws.workspace.download(self.as_posix(), format=ExportFormat.AUTO) as f:
-                return StringIO(f.read().decode(encoding))
+                data = f.read()
+                if encoding is None:
+                    if data.startswith(codecs.BOM_UTF32_LE) or data.startswith(codecs.BOM_UTF32_BE):
+                        encoding = 'utf-32'
+                    elif data.startswith(codecs.BOM_UTF16_LE) or data.startswith(codecs.BOM_UTF16_BE):
+                        encoding = 'utf-16'
+                    elif data.startswith(codecs.BOM_UTF8):
+                        encoding = 'utf-8-sig'
+                if encoding is None or encoding == "locale":
+                    encoding = locale.getpreferredencoding(False)
+                return StringIO(data.decode(encoding))
         if "w" in mode:
             return _TextUploadIO(self._ws, self.as_posix())
         raise ValueError(f"invalid mode: {mode}")
 
+    def read_text(self, encoding=None, errors=None):  # whole file as str; encoding=None lets open() sniff a BOM
+        with self.open(mode='r', encoding=encoding, errors=errors) as f:
+            return f.read()  # NOTE(review): open()'s text-read path decodes without `errors` - confirm that is intended
+
     @property
     def suffix(self) -> str:
         """Return the file extension. If the file is a notebook, return the suffix based on the language."""

diff --git a/tests/integration/test_paths.py b/tests/integration/test_paths.py
@@ -1,3 +1,4 @@
+import codecs
 from pathlib import Path
 
 import pytest
@@ -205,3 +206,24 @@ def test_file_and_notebook_in_same_folder_with_different_suffixes(ws, make_noteb
     assert files["a.txt"].suffix == ".txt"
     assert files["b"].suffix == ".py"  # suffix is determined from ObjectInfo
     assert files["b"].read_text() == "# Databricks notebook source\ndisplay(spark.range(10))"
+
+
+@pytest.mark.parametrize(
+    "bom, encoding",
+    [
+        (codecs.BOM_UTF8, "utf-8"),
+        (codecs.BOM_UTF16_LE, "utf-16-le"),
+        (codecs.BOM_UTF16_BE, "utf-16-be"),
+        (codecs.BOM_UTF32_LE, "utf-32-le"),
+        (codecs.BOM_UTF32_BE, "utf-32-be"),
+    ],
+)
+def test_correctly_encodes_and_decodes_file_with_bom(bom, encoding, ws, make_directory):
+    """Round-trip: BOM-prefixed bytes written to a workspace file read back as plain text."""
+    # Notebooks are excluded: the server changes the uploaded data.
+    target = WorkspacePath(ws, make_directory()) / f"some_file_{encoding}.py"
+    payload = bom + "a = 12".encode(encoding)
+    target.write_bytes(payload)
+    # No encoding is passed to read_text(), so decoding relies on BOM detection.
+    assert target.read_text() == "a = 12"
+