Stricter validation for user provided ULID values

mdomke · May 25, 2024 · e25f438 · e25f438
1 parent 76d5740
commit e25f438
Show file tree

Hide file tree

Showing 4 changed files with 36 additions and 11 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -5,6 +5,14 @@ Changelog
 
 Versions follow `Semantic Versioning <http://www.semver.org>`_
 
+`2.6.0`_ - 2024-05-26
+---------------------
+Changed
+~~~~~~~
+* Provide more sophisticated validation when creating ``ULID`` instances from user input. When
+  using ``ULID.from_str`` we will check if the characters match the base32 alphabet. In general,
+  it is ensured that the timestamp part of the ULID is not out of range.
+
 `2.5.0`_ - 2024-04-26
 ---------------------
 
@@ -159,6 +167,7 @@ Changed
 * The package now has no external dependencies.
 * The test-coverage has been raised to 100%.
 
+.. _2.6.0: https://github.com/mdomke/python-ulid/compare/2.5.0...2.6.0
 .. _2.5.0: https://github.com/mdomke/python-ulid/compare/2.4.0...2.5.0
 .. _2.4.0: https://github.com/mdomke/python-ulid/compare/2.3.0...2.4.0
 .. _2.3.0: https://github.com/mdomke/python-ulid/compare/2.2.0...2.3.0

diff --git a/tests/test_ulid.py b/tests/test_ulid.py
@@ -149,15 +149,17 @@ def test_ulid_from_timestamp() -> None:
 @pytest.mark.parametrize(
     ("constructor", "value"),
     [
-        (ULID, b"sdf"),
-        (ULID.from_timestamp, b"not-a-timestamp"),
-        (ULID.from_datetime, time.time()),
-        (ULID.from_bytes, b"not-enough"),
-        (ULID.from_bytes, 123),
-        (ULID.from_str, "not-enough"),
-        (ULID.from_str, 123),
-        (ULID.from_int, "not-an-int"),
-        (ULID.from_uuid, "not-a-uuid"),
+        (ULID, b"sdf"),  # invalid length
+        (ULID.from_timestamp, b"not-a-timestamp"),  # invalid type
+        (ULID.from_datetime, time.time()),  # invalid type
+        (ULID.from_bytes, b"not-enough"),  # invalid length
+        (ULID.from_bytes, 123),  # invalid type
+        (ULID.from_str, "not-enough"),  # invalid length
+        (ULID.from_str, 123),  # invalid type
+        (ULID.from_str, "notavalidulidnotavalidulid"),  # invalid alphabet
+        (ULID.from_str, "Z" * 26),  # invalid timestamp
+        (ULID.from_int, "not-an-int"),  # invalid type
+        (ULID.from_uuid, "not-a-uuid"),  # invalid type
     ],
 )
 def test_ulid_invalid_input(constructor: Callable[[Params], ULID], value: Params) -> None:

diff --git a/ulid/__init__.py b/ulid/__init__.py
@@ -71,14 +71,26 @@ class ULID:
         >>> ulid = ULID()
         >>> str(ulid)
         '01E75PVKXA3GFABX1M1J9NZZNF'
+
+    Args:
+        value (bytes, None): A sequence of 16 bytes representing an encoded ULID.
+        validate (bool): If set to `True`, check that the timestamp part is within the valid range.
+
+    Raises:
+        ValueError: If the provided value is not a valid encoded ULID.
     """
 
-    def __init__(self, value: bytes | None = None) -> None:
+    def __init__(self, value: bytes | None = None, validate: bool = True) -> None:
         if value is not None and len(value) != constants.BYTES_LEN:
             raise ValueError("ULID has to be exactly 16 bytes long.")
         self.bytes: bytes = (
             value or ULID.from_timestamp(time.time_ns() // constants.NANOSECS_IN_MILLISECS).bytes
         )
+        if value is not None and validate:
+            try:
+                self.datetime  # noqa: B018
+            except ValueError as err:
+                raise ValueError("ULID timestamp is out of range.") from err
 
     @classmethod
     @validate_type(datetime)
@@ -125,7 +137,7 @@ def from_uuid(cls: type[U], value: uuid.UUID) -> U:
             >>> ULID.from_uuid(uuid4())
             ULID(27Q506DP7E9YNRXA0XVD8Z5YSG)
         """
-        return cls(value.bytes)
+        return cls(value.bytes, validate=False)
 
     @classmethod
     @validate_type(bytes)

diff --git a/ulid/base32.py b/ulid/base32.py
@@ -198,6 +198,8 @@ def encode_randomness(binary: bytes) -> str:
 def decode(encoded: str) -> bytes:
     if len(encoded) != constants.REPR_LEN:
         raise ValueError("Encoded ULID has to be exactly 26 characters long.")
+    if any((c not in ENCODE) for c in encoded):
+        raise ValueError(f"Encoded ULID can only consist of letters in {ENCODE}.")
     return decode_timestamp(encoded[: constants.TIMESTAMP_REPR_LEN]) + decode_randomness(
         encoded[constants.TIMESTAMP_REPR_LEN :]
     )