Scylla2020 · Mar 8, 2023
diff --git a/‎.gitignore
+2 b/‎.gitignore
+2
diff --git a/‎.pre-commit-config.yaml
+28 b/‎.pre-commit-config.yaml
+28
diff --git a/‎MANIFEST.in
+15 b/‎MANIFEST.in
+15
diff --git a/‎README.md
+3-2 b/‎README.md
+3-2
diff --git a/‎requirements-dev.txt
+9 b/‎requirements-dev.txt
+9
diff --git a/‎requirements.txt
+19 b/‎requirements.txt
+19
diff --git a/‎scripts/init.py
+16 b/‎scripts/init.py
+16
diff --git a/‎setup.cfg
+33 b/‎setup.cfg
+33
diff --git a/‎setup.py
+23 b/‎setup.py
+23
diff --git a/‎src/marvin/__init__.py
+15 b/‎src/marvin/__init__.py
+15
diff --git a/‎src/marvin/config.py
+116 b/‎src/marvin/config.py
+116
diff --git a/‎src/marvin/infra/__init__.py
+1 b/‎src/marvin/infra/__init__.py
+1
diff --git a/‎src/marvin/infra/db.py
+134 b/‎src/marvin/infra/db.py
+134
diff --git a/‎src/marvin/utilities/__init__.py
+1 b/‎src/marvin/utilities/__init__.py
+1
diff --git a/‎src/marvin/utilities/async_utils.py
+29 b/‎src/marvin/utilities/async_utils.py
+29
diff --git a/‎src/marvin/utilities/collections.py
+36 b/‎src/marvin/utilities/collections.py
+36
diff --git a/‎src/marvin/utilities/logging.py
+81 b/‎src/marvin/utilities/logging.py
+81
diff --git a/‎src/marvin/utilities/strings.py
+197 b/‎src/marvin/utilities/strings.py
+197
diff --git a/‎src/marvin/utilities/tests.py
+13 b/‎src/marvin/utilities/tests.py
+13
diff --git a/‎src/marvin/utilities/types.py
+174 b/‎src/marvin/utilities/types.py
+174
@@ -150,3 +150,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+.DS_Store
@@ -0,0 +1,28 @@
+repos:
+  - repo: https://github.com/pycqa/autoflake
+    rev: v2.0.0
+    hooks:
+      - id: autoflake
+        language_version: python3
+        args:
+          [
+            "--in-place",
+            "--recursive",
+            "--remove-all-unused-imports",
+            "--remove-unused-variables",
+            "--exclude",
+            '**/__init__.py, **/conftest.py, tests/fixtures/**.py',
+          ]
+
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        language_version: python3
+
+  - repo: https://github.com/psf/black
+    rev: 22.12.0
+    hooks:
+      - id: black
+        args: ["--preview"]
+        language_version: python3
@@ -0,0 +1,15 @@
+# Things to always exclude
+global-exclude .git*
+global-exclude .ipynb_checkpoints
+global-exclude *.py[co]
+global-exclude __pycache__/**
+
+# Top-level Config
+include LICENSE
+include MANIFEST.in
+include README.md
+include setup.cfg
+include requirements.txt
+# setup.py reads requirements-dev.txt at build time, so it must ship in the sdist
+include requirements-dev.txt
+
+# Prompt templates
+graft src/marvin/programs
+graft src/marvin/prompts
@@ -1,2 +1,3 @@
-# marvin
- 
+# Marvin
+
+> "'Let’s build robots with Genuine People Personalities,' they said. So they tried it out with me. I’m a personality prototype, you can tell, can’t you?"
@@ -0,0 +1,9 @@
+black[jupyter]>=22.12
+pre-commit>=2.21.0
+pytest-asyncio>=0.20.3
+pytest-sugar>=0.9.6
+pytest-env>=0.8.1
+pytest>=7.2.0
+pdbpp>=0.10.3 
+pyperclip>=1.8.2
+ipython>=8.0
@@ -0,0 +1,19 @@
+aiosqlite==0.18.0
+asyncpg==0.27.0
+cloudpickle==2.2.1
+fastapi==0.89.1
+httpx==0.23.3
+jinja2==3.1.2
+nest_asyncio==1.5.6
+openai==0.27.0
+pendulum==2.1.2
+pydantic[dotenv,email]==1.10.4
+rich==13.3.1
+sqlalchemy[asyncio]==1.4.41
+sqlitedict==2.1.0
+sqlmodel==0.0.8
+tiktoken==0.3.0
+ulid-py==1.1.0
+uvicorn==0.20.0
+xxhash==3.2.0
+yake==0.4.8
@@ -0,0 +1,16 @@
+"""Reset the Marvin database and load the example Prefect docs into it."""
+
+import asyncio
+
+import marvin
+import marvin.examples.prefect
+import marvin.infra.db
+
+
+async def main():
+    """Drop and recreate every table, then hydrate the fresh database."""
+    # reset the DB; the DDL helpers live in `marvin.infra.db`
+    await marvin.infra.db.reset_db(confirm=True)
+
+    # hydrate with docs
+    # NOTE(review): `marvin.examples.prefect` is not visible in this change set;
+    # confirm the module exists before relying on this script.
+    await marvin.examples.prefect.load_prefect()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -0,0 +1,33 @@
+[tool:pytest]
+markers =
+    ai: mark a test as dependent on external AI APIs.
+norecursedirs = *.egg-info .git .mypy_cache node_modules .pytest_cache .vscode
+asyncio_mode = auto
+filterwarnings =
+    ignore:'crypt' is deprecated and slated for removal in Python 3.13:DeprecationWarning
+
+env =
+    MARVIN_TEST_MODE=1
+    D:MARVIN_DATABASE_CONNECTION_URL=sqlite+aiosqlite:////tmp/marvin-tests/test.sqlite
+    MARVIN_LOG_CONSOLE_WIDTH=120
+    MARVIN_LOG_LEVEL=DEBUG
+
+[isort]
+skip = __init__.py
+profile = black
+skip_gitignore = True
+multi_line_output = 3
+
+[flake8]
+# Match black line-length
+max-line-length = 88
+extend-ignore =
+    # See https://github.com/PyCQA/pycodestyle/issues/373
+    E203,
+
+[pycodestyle]
+# Match black line-length
+max-line-length = 88
+extend-ignore =
+    # See https://github.com/PyCQA/pycodestyle/issues/373
+    E203,
@@ -0,0 +1,23 @@
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+# Resolve files relative to this script so the build works from any directory.
+_HERE = Path(__file__).resolve().parent
+
+
+def _read_requirements(filename: str) -> list[str]:
+    """Return the requirement specifiers listed in *filename*.
+
+    Blank lines and `#` comment lines are skipped and surrounding whitespace is
+    stripped, so stray spaces in a requirements file never reach setuptools.
+    """
+    lines = (_HERE / filename).read_text(encoding="utf-8").splitlines()
+    stripped = (line.strip() for line in lines)
+    return [line for line in stripped if line and not line.startswith("#")]
+
+
+required_deps = _read_requirements("requirements.txt")
+dev_deps = _read_requirements("requirements-dev.txt")
+
+setup(
+    # Package metadata
+    name="marvin",
+    url="https://github.com/PrefectHQ/marvin",
+    version="0.3",
+    # the README contains non-ASCII characters, so the encoding must be explicit
+    long_description=(_HERE / "README.md").read_text(encoding="utf-8"),
+    long_description_content_type="text/markdown",
+    # Package setup
+    packages=find_packages(where="src"),
+    package_dir={"": "src"},
+    include_package_data=True,
+    # Requirements
+    python_requires=">=3.10",
+    install_requires=required_deps,
+    extras_require={
+        "dev": required_deps + dev_deps,
+    },
+)
@@ -0,0 +1,15 @@
+# load env vars
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# load nest_asyncio
+import nest_asyncio
+
+nest_asyncio.apply()
+
+# load marvin root objects
+from marvin.config import settings
+from marvin.utilities.logging import get_logger
+
+# load marvin
@@ -0,0 +1,116 @@
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Literal
+
+from pydantic import BaseSettings, Field, SecretStr, root_validator, validator
+from rich import print
+from rich.text import Text
+
+import marvin
+
+
+class Settings(BaseSettings):
+    """
+    Marvin's configuration, loaded from `MARVIN_`-prefixed environment variables
+    and an optional `.env` file.
+
+    Validating the settings has side effects: the home directory and the parent
+    directory of the embeddings cache are created if they do not exist.
+    """
+
+    class Config:
+        env_file = ".env"
+        env_prefix = "MARVIN_"
+        validate_assignment = True
+
+    home: Path = Path("~/.marvin").expanduser()
+    test_mode: bool = False
+
+    # LOGGING
+    verbose: bool = False
+    log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "DEBUG"
+    log_console_width: int | None = Field(
+        None,
+        description=(
+            "Marvin will auto-detect the console width when possible, but in deployed"
+            " settings logs will assume a console width of 80 characters unless"
+            " specified here."
+        ),
+    )
+    rich_tracebacks: bool = Field(False, description="Enable rich traceback formatting")
+
+    # EMBEDDINGS
+    # specify the path to the embeddings cache, relative to the home dir
+    embeddings_cache_path: Path = Path("cache/embeddings.sqlite")
+    embeddings_cache_warn_size: int = 4000000000  # 4GB
+
+    # OPENAI
+    openai_api_key: SecretStr = Field(
+        SecretStr(""), env=["MARVIN_OPENAI_API_KEY", "OPENAI_API_KEY"]
+    )
+
+    # DATABASE
+    database_echo: bool = False
+    # pydantic does not validate defaults, so secret defaults must already be
+    # `SecretStr` instances; `$HOME` is replaced with `home` during validation
+    database_connection_url: SecretStr = SecretStr(
+        "sqlite+aiosqlite:////$HOME/marvin.db"
+    )
+
+    # REDIS
+    redis_connection_url: SecretStr = SecretStr("")
+
+    # BOTS
+    bot_create_profile_picture: bool = False
+
+    # skip_on_failure: if a field failed validation its key is missing from
+    # `values`, and the lookups below would mask the real error with a KeyError
+    @root_validator(skip_on_failure=True)
+    def initial_setup(cls, values):
+        """Create the directories Marvin needs and resolve home-relative values."""
+        values["home"].mkdir(parents=True, exist_ok=True)
+
+        # prefix HOME to embeddings cache path
+        if not values["embeddings_cache_path"].is_absolute():
+            values["embeddings_cache_path"] = (
+                values["home"] / values["embeddings_cache_path"]
+            )
+        values["embeddings_cache_path"].parent.mkdir(parents=True, exist_ok=True)
+
+        # interpolate HOME into database connection URL
+        url = values["database_connection_url"]
+        if isinstance(url, SecretStr):
+            url = url.get_secret_value()
+        values["database_connection_url"] = SecretStr(
+            url.replace("$HOME", str(values["home"]))
+        )
+
+        # print if verbose = True
+        if values["verbose"]:
+            print(Text("Verbose mode enabled", style="green"))
+
+        return values
+
+    # always=True: without it the validator is skipped for the (empty) default,
+    # which is exactly the case the warning exists for
+    @validator("openai_api_key", always=True)
+    def warn_if_missing_api_keys(cls, v, field):
+        """Print a warning when the API key is empty; the value is returned as-is."""
+        if not v:
+            print(
+                Text(
+                    f"WARNING: `{field.name}` is not set. Some features may not work.",
+                    style="red",
+                )
+            )
+        return v
+
+    @root_validator(skip_on_failure=True)
+    def test_mode_settings(cls, values):
+        """Force DEBUG logging and verbose output when test mode is enabled."""
+        if values["test_mode"]:
+            print(Text("Marvin is running in test mode!", style="yellow"))
+            values["log_level"] = "DEBUG"
+            values["verbose"] = True
+        return values
+
+    def __setattr__(self, name, value):
+        """Assign the attribute, then re-run logging setup if `log_level` changed."""
+        result = super().__setattr__(name, value)
+        # update log level on assignment
+        if name == "log_level":
+            marvin.utilities.logging.setup_logging()
+        return result
+
+
+# Module-level singleton. Instantiating it reads the environment and runs the
+# validators on `Settings`, which create the home directory as a side effect.
+settings = Settings()
+
+
+@contextmanager
+def temporary_settings(**kwargs):
+    """
+    Temporarily override attributes of the global `settings` object.
+
+    The overrides are written straight into `settings.__dict__`, so they are not
+    validated and do not pass through `Settings.__setattr__`. The values captured
+    on entry are restored on exit, even if the body raises.
+    """
+    snapshot = settings.dict()
+    live_values = settings.__dict__
+    live_values.update(kwargs)
+    try:
+        yield
+    finally:
+        # drop everything first so keys introduced by the overrides disappear too
+        live_values.clear()
+        live_values.update(snapshot)
@@ -0,0 +1 @@
+from . import db
@@ -0,0 +1,134 @@
+import inspect
+from contextlib import asynccontextmanager
+from functools import wraps
+from typing import AsyncGenerator, Callable, Literal
+
+import sqlmodel
+from sqlalchemy.dialects.postgresql import JSONB as postgres_JSONB
+from sqlalchemy.dialects.sqlite import JSON as sqlite_JSON
+from sqlalchemy.ext.asyncio import create_async_engine
+from sqlalchemy.orm import sessionmaker
+from sqlmodel.ext.asyncio.session import AsyncSession
+
+import marvin
+
+# Extra keyword arguments for `create_async_engine`, filled in per dialect.
+engine_kwargs = {}
+# sqlite doesn't support pool configuration
+if marvin.settings.database_connection_url.get_secret_value().startswith("postgresql"):
+    engine_kwargs.update(
+        pool_size=50,
+        max_overflow=20,
+    )
+
+# Module-level engine, created at import time from the configured connection URL.
+engine = create_async_engine(
+    marvin.settings.database_connection_url.get_secret_value(),
+    echo=marvin.settings.database_echo,
+    **engine_kwargs,
+)
+# Factory for `AsyncSession` objects bound to the engine above; sessions are
+# configured not to expire objects on commit.
+async_session_maker = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
+
+
+def get_dialect() -> Literal["postgresql", "sqlite"]:
+    """Return the dialect name reported by the module-level engine."""
+    dialect = engine.dialect
+    return dialect.name
+
+
+async def get_session() -> AsyncSession:
+    """
+    Create a new session from the module-level `async_session_maker`.
+
+    The session is returned as-is; `session_context` enters it with `async with`.
+    """
+    return async_session_maker()
+
+
+@asynccontextmanager
+async def session_context(begin_transaction: bool = False):
+    """
+    Yield a SQLAlchemy session that is opened on entry and closed on exit.
+
+    Args:
+        begin_transaction: if True, the body runs inside `session.begin()`, which
+            COMMITs or ROLLBACKs on exit. If False, the session is committed after
+            the body finishes and rolled back (re-raising) if the body fails.
+    """
+    async with await get_session() as session:
+        if not begin_transaction:
+            try:
+                yield session
+                await session.commit()
+            except Exception:
+                await session.rollback()
+                raise
+        else:
+            async with session.begin():
+                yield session
+
+
+def provide_session(begin_transaction: bool = False) -> Callable:
+    """
+    Decorator that provides a database session to an async function.
+
+    The decorated function _must_ have a parameter annotated as `AsyncSession`.
+    When the caller omits that argument, or passes `None`, a session is opened
+    with `session_context` for the duration of the call and injected.
+
+    Args:
+        begin_transaction: forwarded to `session_context`.
+
+    Raises:
+        TypeError: if used without parentheses (`@provide_session`), or if the
+            decorated function has no `AsyncSession`-annotated parameter.
+    """
+    if callable(begin_transaction):
+        raise TypeError("provide_session() must be called when decorating a function.")
+
+    def wrapper(fn: Callable) -> Callable:
+        sig = inspect.signature(fn)
+        session_kwarg = next(
+            (
+                name
+                for name, param in sig.parameters.items()
+                if param.annotation is AsyncSession
+            ),
+            None,
+        )
+        if session_kwarg is None:
+            raise TypeError("No `AsyncSession` kwarg found in function signature.")
+
+        @wraps(fn)
+        async def async_wrapper(*args, **kwargs):
+            try:
+                bound = sig.bind_partial(*args, **kwargs)
+
+            # typeerror would indicate an illegal argument was passed;
+            # we'll let the function reraise for clarity
+            except TypeError:
+                bound = None
+
+            if bound is not None and bound.arguments.get(session_kwarg) is not None:
+                # the caller supplied a session; use it untouched
+                return await fn(*args, **kwargs)
+
+            async with session_context(begin_transaction=begin_transaction) as session:
+                if bound is None:
+                    kwargs[session_kwarg] = session
+                    return await fn(*args, **kwargs)
+                # Replace the argument in the bound call rather than adding a
+                # keyword: a session passed positionally as `None` would otherwise
+                # be supplied twice and raise "multiple values for argument".
+                bound.arguments[session_kwarg] = session
+                return await fn(*bound.args, **bound.kwargs)
+
+        return async_wrapper
+
+    return wrapper
+
+
+# Pick the JSON column type that matches the engine's dialect.
+JSONType = sqlite_JSON if get_dialect() == "sqlite" else postgres_JSONB
+
+
+async def destroy_db(confirm: bool = False):
+    """
+    Drop every table registered on `SQLModel.metadata` in a single transaction.
+
+    Tables are dropped in reverse dependency order; on PostgreSQL the drop
+    cascades.
+
+    Args:
+        confirm: must be True, as a guard against accidental data loss.
+
+    Raises:
+        ValueError: if `confirm` is not True.
+    """
+    if not confirm:
+        raise ValueError("You must confirm that you want to destroy the database.")
+
+    logger = marvin.get_logger("db")
+    # use this module's own dialect helper; the dialect cannot change mid-loop
+    cascade = " CASCADE" if get_dialect() == "postgresql" else ""
+
+    async with session_context(begin_transaction=True) as session:
+        for table in reversed(sqlmodel.SQLModel.metadata.sorted_tables):
+            await session.execute(f'DROP TABLE IF EXISTS "{table.name}"{cascade};')
+            logger.debug_style(f"Table {table.name!r} dropped.", "white on red")
+        logger.info_style("Database destroyed!", "white on red")
+
+
+async def create_db():
+    """Create every table registered on `SQLModel.metadata` using the module engine."""
+    async with engine.begin() as conn:
+        await conn.run_sync(sqlmodel.SQLModel.metadata.create_all)
+        marvin.get_logger("db").info_style("Database created!", "green")
+
+
+async def reset_db(confirm: bool = False):
+    """
+    Destroy the database, then create it again.
+
+    `confirm` is passed to `destroy_db`, which raises `ValueError` unless it is
+    True; in that case nothing is created.
+    """
+    for step in (lambda: destroy_db(confirm=confirm), create_db):
+        await step()
@@ -0,0 +1 @@
+from . import logging, async_utils, types, strings, collections, tests
@@ -0,0 +1,29 @@
+import asyncio
+import concurrent.futures
+import functools
+import multiprocessing as mp
+
+import cloudpickle
+
+import marvin
+
+# Process pool used by `run_async_process`; created at import time with the
+# "fork" start method.
+# NOTE(review): the "fork" start method does not exist on Windows, where
+# `mp.get_context("fork")` raises ValueError — confirm the supported platforms.
+process_pool = concurrent.futures.ProcessPoolExecutor(mp_context=mp.get_context("fork"))
+
+
+async def run_async(func, *args, **kwargs):
+    """
+    Run a blocking callable in the event loop's default executor.
+
+    Args:
+        func: the callable to run.
+        *args, **kwargs: forwarded to `func`.
+
+    Returns:
+        Whatever `func(*args, **kwargs)` returns.
+    """
+    # we are inside a coroutine, so a loop is guaranteed to be running
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(None, functools.partial(func, *args, **kwargs))
+
+
+def _cloudpickle_wrapper(pickle):
+    """Load a zero-argument callable from *pickle*, call it, and return the result."""
+    # NOTE: the parameter name shadows the stdlib `pickle` module in this scope.
+    task = cloudpickle.loads(pickle)
+    return task()
+
+
+async def run_async_process(func, *args, **kwargs):
+    """
+    Run a blocking callable in the module's process pool and await its result.
+
+    The call is serialized with cloudpickle before being handed to the pool. In
+    test mode no process is used; the call is delegated to `run_async` instead.
+
+    Args:
+        func: the callable to run.
+        *args, **kwargs: forwarded to `func`.
+    """
+    # in test mode, don't spawn processes
+    if marvin.settings.test_mode:
+        return await run_async(func, *args, **kwargs)
+
+    pickled_func = cloudpickle.dumps(functools.partial(func, *args, **kwargs))
+    # we are inside a coroutine, so a loop is guaranteed to be running
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(process_pool, _cloudpickle_wrapper, pickled_func)
@@ -0,0 +1,36 @@
+import itertools
+from typing import Any, Callable, Iterable, TypeVar
+
+T = TypeVar("T")
+
+
+def batched(
+    iterable: Iterable[T], size: int, size_fn: Callable[[Any], int] = None
+) -> Iterable[tuple[T, ...] | list[T]]:
+    """
+    Split an iterable into consecutive batches.
+
+    If size_fn is not provided, each batch is a tuple of at most `size` items.
+
+    If size_fn is provided, it is called on every item and each batch is a list
+    whose summed item sizes do not exceed `size`. Note that if a single item is
+    larger than the batch size, it will be returned as a batch of its own.
+
+    Args:
+        iterable: the items to batch.
+        size: the maximum number of items (or total item size) per batch.
+        size_fn: optional function returning the size of a single item.
+    """
+    if size_fn is None:
+        it = iter(iterable)
+        while True:
+            batch = tuple(itertools.islice(it, size))
+            if not batch:
+                break
+            yield batch
+    else:
+        batch = []
+        batch_size = 0
+        for item in iterable:
+            item_size = size_fn(item)
+            # close the current batch *before* adding an item that would push it
+            # over the limit, so no batch exceeds `size` unless one item alone does
+            if batch and batch_size + item_size > size:
+                yield batch
+                batch = []
+                batch_size = 0
+            batch.append(item)
+            batch_size += item_size
+        if batch:
+            yield batch
@@ -0,0 +1,81 @@
+import logging
+from functools import lru_cache, partial
+
+from rich.console import Console
+from rich.logging import RichHandler
+from rich.markup import escape
+from rich.traceback import install as install_rich_tracebacks
+
+import marvin
+
+
+@lru_cache()
+def get_logger(name: str = None) -> logging.Logger:
+    """
+    Return the "marvin" logger, or a logger beneath it, with the styled logging
+    helpers attached.
+
+    A name that already starts with "marvin." is used as given, so "marvin.test"
+    does not become "marvin.marvin.test"; any other name is treated as a child of
+    the "marvin" logger.
+    """
+    root = logging.getLogger("marvin")
+
+    if not name:
+        logger = root
+    elif name.startswith(root.name + "."):
+        logger = logging.getLogger(name)
+    else:
+        logger = root.getChild(name)
+
+    add_logging_methods(logger)
+    return logger
+
+
+def setup_logging():
+    """
+    Apply the configured log level to the "marvin" logger and make sure it has
+    exactly one `RichHandler` attached.
+    """
+    root = get_logger()
+    root.setLevel(marvin.settings.log_level)
+
+    # a handler from an earlier call is kept as-is
+    for existing in root.handlers:
+        if isinstance(existing, RichHandler):
+            return
+
+    rich_handler = RichHandler(
+        rich_tracebacks=True,
+        markup=False,
+        console=Console(width=marvin.settings.log_console_width),
+    )
+    rich_handler.setFormatter(logging.Formatter("%(name)s: %(message)s"))
+    root.addHandler(rich_handler)
+
+
+def add_logging_methods(logger):
+    """
+    Attach `<level>_style` and `<level>_kv` helpers to *logger* for each of the
+    debug, info, warning, error and critical levels.
+
+    Both helpers escape their text and wrap it in rich markup tags, logging the
+    record with `extra={"markup": True}`.
+    """
+
+    def log_style(level: int, message: str, style: str = None):
+        # a missing or empty style falls back to the terminal defaults
+        style = style or "default on default"
+        message = f"[{style}]{escape(str(message))}[/]"
+        logger.log(level, message, extra={"markup": True})
+
+    def log_kv(
+        level: int,
+        key: str,
+        value: str,
+        key_style: str = "default on default",
+        value_style: str = "default on default",
+        delimiter: str = ": ",
+    ):
+        rendered = f"[{key_style}]{escape(str(key))}{delimiter}[/][{value_style}]{escape(str(value))}[/]"
+        logger.log(level, rendered, extra={"markup": True})
+
+    for level_name in ("debug", "info", "warning", "error", "critical"):
+        level = getattr(logging, level_name.upper())
+        setattr(logger, f"{level_name}_style", partial(log_style, level))
+        setattr(logger, f"{level_name}_kv", partial(log_kv, level))
+
+
+# Configure the "marvin" logger as soon as this module is imported.
+setup_logging()
+# Rich tracebacks are opt-in through the `rich_tracebacks` setting.
+if marvin.settings.rich_tracebacks:
+    install_rich_tracebacks()
@@ -0,0 +1,197 @@
+import asyncio
+import re
+from functools import lru_cache
+from string import Formatter
+from typing import Any, Callable, Mapping, Sequence, Union
+
+import pendulum
+import tiktoken
+import xxhash
+from jinja2 import ChoiceLoader, Environment, StrictUndefined, select_autoescape
+
+import marvin
+
+# Shared Jinja environment. The loader list is currently empty (the package
+# loaders are commented out). `StrictUndefined` is set so that undefined
+# variables raise instead of rendering as empty strings.
+jinja_env = Environment(
+    loader=ChoiceLoader(
+        [
+            # PackageLoader("marvin", "prompts"),
+            # PackageLoader("marvin", "programs"),
+        ]
+    ),
+    autoescape=select_autoescape(default_for_string=False),
+    trim_blocks=True,
+    lstrip_blocks=True,
+    enable_async=True,
+    auto_reload=True,
+    undefined=StrictUndefined,
+)
+# Helpers exposed to every template as globals.
+jinja_env.globals.update(
+    zip=zip,
+    str=str,
+    len=len,
+    arun=asyncio.run,
+    pendulum=pendulum,
+    # current UTC time formatted as a day/datetime string
+    dt=lambda: pendulum.now("UTC").to_day_datetime_string(),
+)
+
+
+class StrictFormatter(Formatter):
+    """A `string.Formatter` that rejects keyword arguments the template never uses."""
+
+    def check_unused_args(
+        self,
+        used_args: Sequence[Union[int, str]],
+        args: Sequence,
+        kwargs: Mapping[str, Any],
+    ) -> None:
+        """Raise `KeyError` with the set of keyword names that were not consumed.
+
+        Unused positional arguments are not checked.
+        """
+        unused = {key for key in kwargs if key not in used_args}
+        if unused:
+            raise KeyError(unused)
+
+
+@lru_cache(maxsize=2000)
+def hash_text(*text: str) -> str:
+    """
+    Return the xxh3-128 hex digest of the concatenated inputs.
+
+    `str` arguments are encoded with `str.encode()`; `bytes` arguments are used
+    as they are. Results are memoized for the 2000 most recent argument tuples.
+    """
+    payload = b"".join(
+        piece if isinstance(piece, bytes) else piece.encode() for piece in text
+    )
+    return xxhash.xxh3_128_hexdigest(payload)
+
+
+# Matches version-like tokens: two or three dot-separated numbers with an
+# optional word-character suffix, e.g. `1.2`, `1.2.3` or `2.0.1rc1`.
+VERSION_NUMBERS = re.compile(r"\b\d+\.\d+(?:\.\d+)?\w*\b")
+
+
+def tokenize(text: str) -> list[int]:
+    """Encode *text* into token ids with the tiktoken encoding for `gpt-3.5-turbo`."""
+    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
+    token_ids = encoding.encode(text)
+    return token_ids
+
+
+def detokenize(tokens: list[int]) -> str:
+    """Decode token ids back into text with the `gpt-3.5-turbo` tiktoken encoding."""
+    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
+    decoded = encoding.decode(tokens)
+    return decoded
+
+
+def count_tokens(text: str) -> int:
+    """Return how many tokens `tokenize` produces for *text*."""
+    token_ids = tokenize(text)
+    return len(token_ids)
+
+
+def slice_tokens(text: str, n_tokens: int) -> str:
+    """Return the text decoded from the `tokens[:n_tokens]` slice of *text*'s tokens."""
+    kept = tokenize(text)[:n_tokens]
+    return detokenize(kept)
+
+
+def split_text(
+    text: str,
+    chunk_size: int,
+    chunk_overlap: float = None,
+    last_chunk_threshold: float = None,
+    return_index: bool = False,
+) -> list[str] | list[tuple[str, int]]:
+    """
+    Split a text into a list of strings. Chunks are split by tokens.
+
+    Args:
+        text (str): The text to split.
+        chunk_size (int): The number of tokens in each chunk.
+        chunk_overlap (float): The fraction of overlap between chunks
+            (defaults to 0.1).
+        last_chunk_threshold (float): If the last chunk is less than this fraction of
+            the chunk_size, it will be added to the prior chunk (defaults to 0.25).
+        return_index (bool): If True, return a tuple of (chunk, index) where index is the
+            character index of the start of the chunk in the original text.
+
+    Raises:
+        ValueError: if `chunk_size` is not positive, if `chunk_overlap` is outside
+            [0, 1], or if the overlap is so large that chunks would not advance.
+    """
+    if chunk_size < 1:
+        raise ValueError("chunk_size must be a positive integer")
+    if chunk_overlap is None:
+        chunk_overlap = 0.1
+    if chunk_overlap < 0 or chunk_overlap > 1:
+        raise ValueError("chunk_overlap must be between 0 and 1")
+    if last_chunk_threshold is None:
+        last_chunk_threshold = 0.25
+
+    # number of tokens between the starts of consecutive chunks
+    step = chunk_size - int(chunk_overlap * chunk_size)
+    if step < 1:
+        raise ValueError(
+            "chunk_overlap is too large for chunk_size: chunks would not advance"
+        )
+
+    tokens = tokenize(text)
+    starts = list(range(0, len(tokens), step))
+    chunks = [tokens[start : start + chunk_size] for start in starts]
+
+    # if the last chunk is too small, merge it with the previous chunk
+    if len(chunks) > 1 and len(chunks[-1]) < chunk_size * last_chunk_threshold:
+        chunks.pop()
+        starts.pop()
+        # Re-slice from the previous start instead of concatenating: the two
+        # chunks overlap, so concatenation would repeat the shared tokens.
+        chunks[-1] = tokens[starts[-1] :]
+
+    if return_index:
+        return [
+            (detokenize(chunk), len(detokenize(tokens[:start])))
+            for chunk, start in zip(chunks, starts)
+        ]
+    return [detokenize(chunk) for chunk in chunks]
+
+
+def _extract_keywords(text: str, n_keywords: int = None) -> list[str]:
+    """
+    Extract single-word English keywords from *text* with YAKE.
+
+    Args:
+        text: the text to analyze.
+        n_keywords: how many keywords to return. When omitted, the
+            `default_n_keywords` setting is used if it is defined, otherwise 20.
+
+    Returns:
+        The keywords only, without their scores.
+    """
+    # deferred import
+    import yake
+
+    # `Settings` does not declare `default_n_keywords`, so a plain attribute
+    # access would raise AttributeError; fall back to 20 when it is missing.
+    top = n_keywords or getattr(marvin.settings, "default_n_keywords", 20)
+
+    kw = yake.KeywordExtractor(
+        lan="en",
+        n=1,
+        dedupLim=0.9,
+        dedupFunc="seqm",
+        windowsSize=1,
+        top=top,
+        features=None,
+    )
+    keywords = kw.extract_keywords(text)
+    # return only keyword, not score
+    return [k[0] for k in keywords]
+
+
+async def extract_keywords(text: str, n_keywords: int = None) -> list[str]:
+    """Async wrapper that runs `_extract_keywords` through `run_async_process`."""
+    # keyword extraction can take a while and is blocking
+    run_in_process = marvin.utilities.async_utils.run_async_process
+    keywords = await run_in_process(
+        _extract_keywords, text=text, n_keywords=n_keywords
+    )
+    return keywords
+
+
+def create_minimap_fn(content: str) -> Callable[[int], str]:
+    """
+    Given a document with markdown headers, returns a function that outputs the current headers
+    for any character position in the document.
+
+    Headers of level 1 to 5 (`# ` to `##### `) are tracked; lines inside fenced
+    code blocks are ignored. The returned function takes a character index into
+    `content` and returns the headers in effect there, outermost first, joined by
+    newlines. It returns an empty string before the first header and raises
+    `ValueError` for a negative index.
+    """
+    header_pattern = re.compile(r"(#{1,5}) ")
+    minimap: dict[int, dict[int, str]] = {}
+    in_code_block = False
+    current_stack: dict[int, str] = {}
+    offset = 0
+    # iterate the lines twice in lockstep: once stripped, once with line endings,
+    # so offsets count the newline characters that `splitlines()` removes
+    stripped_lines = content.splitlines()
+    raw_lines = content.splitlines(keepends=True)
+    for line, raw_line in zip(stripped_lines, raw_lines):
+        line_start = offset
+        offset += len(raw_line)
+
+        if line.startswith("```"):
+            in_code_block = not in_code_block
+        if in_code_block:
+            continue
+
+        match = header_pattern.match(line)
+        if match is None:
+            continue
+
+        # A header replaces every header at its own level or deeper. Build a new
+        # dict each time so earlier minimap entries are never mutated afterwards.
+        level = len(match.group(1))
+        current_stack = {
+            lvl: header for lvl, header in current_stack.items() if lvl < level
+        }
+        current_stack[level] = line
+        minimap[line_start] = current_stack
+
+    def get_location_fn(n: int) -> str:
+        if n < 0:
+            raise ValueError("n must be >= 0")
+        # get the stack of headers that is closest to - but before - the current position
+        stack = minimap.get(max((k for k in minimap if k <= n), default=0), {})
+
+        ordered_stack = [stack.get(i) for i in range(1, 6)]
+        return "\n".join([s for s in ordered_stack if s is not None])
+
+    return get_location_fn
@@ -0,0 +1,13 @@
+import httpx
+
+
+def assert_status_code(response: httpx.Response, status_code: int):
+    """
+    Assert that *response* has the expected status code.
+
+    On failure the assertion message includes the response body: the decoded
+    JSON when the body can be parsed, otherwise the raw text.
+    """
+    try:
+        full_response = response.json()
+    # not a bare `except:`, which would also swallow KeyboardInterrupt/SystemExit
+    except Exception:
+        full_response = response.text
+    error_message = (
+        f"assert {response.status_code} == {status_code}"
+        f"\nFull response: {full_response}"
+    )
+    assert response.status_code == status_code, error_message
@@ -0,0 +1,174 @@
+import json
+import re
+from functools import lru_cache
+from typing import Any, Callable, Generic, TypeVar
+
+import pydantic
+import ulid
+from fastapi import APIRouter, Response, status
+from fastapi.encoders import jsonable_encoder
+from pydantic import BaseModel, constr
+from sqlalchemy import TypeDecorator
+
+from marvin.infra.db import JSONType
+
+T = TypeVar("T")
+# Hyphenated hexadecimal UUID (8-4-4-4-12). Unlike the patterns below, this one
+# is a compiled pattern object rather than a pattern string.
+UUID_REGEX = re.compile(
+    r"\b[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}\b"
+)
+# ulid: 26 characters drawn from digits and uppercase letters, excluding I, L, O, U
+ULID_REGEX = r"\b[0-9A-HJ-NP-TV-Z]{26}\b"
+# specific prefix: a template — call `.format(prefix=...)` to get the pattern
+# (the doubled braces become the literal `{26}` quantifier)
+PREFIXED_ULID_REGEX = r"\b{prefix}_[0-9A-HJ-NP-TV-Z]{{26}}\b"
+# any prefix: a letter followed by one or more letters/digits, then `_` and a ulid
+ANY_PREFIX_ULID_REGEX = r"\b[^\W0-9_][^\W_]+_[0-9A-HJ-NP-TV-Z]{26}\b"
+# optional prefix: a ulid with or without such a prefix
+ANY_ULID_REGEX = r"\b(?:[^\W0-9_][^\W_]+_)?[0-9A-HJ-NP-TV-Z]{26}\b"
+
+
+@lru_cache()
+def get_id_type(prefix: str = None) -> type:
+    """
+    Return a constrained string type for ULID-based ids.
+
+    Without a prefix the type matches `ULID_REGEX`; with a prefix it matches
+    `<prefix>_<ulid>`. The returned type also has a `new()` callable that
+    generates a fresh id of that form. Results are cached per prefix.
+
+    Raises:
+        ValueError: if the prefix contains an underscore.
+    """
+    if prefix is not None:
+        if "_" in prefix:
+            raise ValueError("Prefix must not contain underscores.")
+        pattern = PREFIXED_ULID_REGEX.format(prefix=prefix)
+        id_type = constr(regex=pattern)
+        id_type.new = lambda: f"{prefix}_{ulid.new()}"
+        id_type.regex = pattern
+        return id_type
+
+    id_type = constr(regex=ULID_REGEX)
+    id_type.new = lambda: str(ulid.new())
+    return id_type
+
+
+class MarvinBaseModel(BaseModel):
+    """Base pydantic model for Marvin: validates assignments and forbids extra fields."""
+
+    class Config:
+        copy_on_model_validation = "shallow"
+        validate_assignment = True
+        extra = "forbid"
+        json_encoders = {}
+
+    def dict(self, *args, json_compatible=False, **kwargs):
+        """
+        Return the model as a dict.
+
+        With `json_compatible=True` the model is serialized with `self.json()` and
+        parsed back, so the result contains only JSON-native values.
+        """
+        if json_compatible:
+            return json.loads(self.json(*args, **kwargs))
+        return super().dict(*args, **kwargs)
+
+    def copy_with_updates(self, exclude: set[str] = None, **updates):
+        """
+        Copies the current model and updates the copy with the provided updates,
+        which can be partial nested dictionaries.
+
+        Unlike `copy(update=updates)`, this method will properly validate
+        updates and apply nested updates.
+        """
+        updated = self.dict(exclude=exclude)
+
+        # Walk the updates iteratively: each stack entry is (target dict, key,
+        # new value). When both the existing and the new value are dicts, the
+        # update is merged key by key; otherwise the new value replaces the old.
+        stack = [(updated, k, v) for k, v in updates.items()]
+        while stack:
+            m, k, v = stack.pop()
+            mv = m.get(k)
+            if isinstance(mv, dict) and isinstance(v, dict):
+                stack.extend([(mv, vk, vv) for vk, vv in v.items()])
+            else:
+                m[k] = v
+
+        # Excluded fields are missing from `updated`; take them from this instance
+        # unless an update supplied a value for them.
+        excluded = set(self.__exclude_fields__ or []).union(exclude or [])
+        excluded_kwargs = {e: getattr(self, e) for e in excluded if e not in updated}
+        # constructing a new instance re-runs validation on the merged values
+        return type(self)(**updated, **excluded_kwargs)
+
+
+class MarvinRouter(APIRouter):
+    """
+    An `APIRouter` with a small convenience for No-Content routes.
+    """
+
+    def add_api_route(
+        self, path: str, endpoint: Callable[..., Any], **kwargs: Any
+    ) -> None:
+        """
+        Add an API route.
+
+        Routes registered with a 204 No-Content status code get an explicit
+        `response_class` of `Response`, so nothing is returned in the response
+        body. All other arguments are passed through unchanged.
+        """
+        returns_no_content = kwargs.get("status_code") == status.HTTP_204_NO_CONTENT
+        if returns_no_content:
+            # a plain `Response` handles the status code without writing a body
+            kwargs["response_class"] = Response
+        return super().add_api_route(path, endpoint, **kwargs)
+
+
+def pydantic_column_type(pydantic_type):
+    """
+    SA Column for converting pydantic models to and from JSON
+
+    Returns a `TypeDecorator` subclass, built on `JSONType`, that encodes values
+    with `jsonable_encoder` when writing and parses them back into
+    `pydantic_type` with `pydantic.parse_obj_as` when reading.
+    """
+
+    class PydanticJSONType(TypeDecorator, Generic[T]):
+        impl = JSONType()
+
+        def bind_processor(self, dialect):
+            """Return the function that converts Python values for the database."""
+            impl_processor = self.impl.bind_processor(dialect)
+            if impl_processor:
+
+                def process(value: T):
+                    if value is not None:
+                        if isinstance(pydantic_type, pydantic.main.ModelMetaclass):
+                            # This allows to assign non-InDB models and if they're
+                            # compatible, they're directly parsed into the InDB
+                            # representation, thus hiding the implementation in the
+                            # background. However, the InDB model will still be returned
+                            value_to_dump = pydantic_type.from_orm(value)
+                        else:
+                            value_to_dump = value
+                        value = jsonable_encoder(value_to_dump)
+                    return impl_processor(value)
+
+            else:
+
+                # NOTE(review): unlike the branch above, this one has no `None`
+                # check, so a None value is passed to `from_orm` / `json.dumps`
+                # — confirm whether that is intended.
+                def process(value):
+                    if isinstance(pydantic_type, pydantic.main.ModelMetaclass):
+                        # This allows to assign non-InDB models and if they're
+                        # compatible, they're directly parsed into the InDB
+                        # representation, thus hiding the implementation in the
+                        # background. However, the InDB model will still be returned
+                        value_to_dump = pydantic_type.from_orm(value)
+                    else:
+                        value_to_dump = value
+                    value = json.dumps(jsonable_encoder(value_to_dump))
+                    return value
+
+            return process
+
+        def result_processor(self, dialect, coltype) -> T:
+            """Return the function that converts database values to `pydantic_type`."""
+            impl_processor = self.impl.result_processor(dialect, coltype)
+            if impl_processor:
+
+                def process(value):
+                    value = impl_processor(value)
+                    if value is None:
+                        return None
+
+                    data = value
+                    # Explicitly use the generic directly, not type(T)
+                    full_obj = pydantic.parse_obj_as(pydantic_type, data)
+                    return full_obj
+
+            else:
+
+                def process(value):
+                    if value is None:
+                        return None
+
+                    # Explicitly use the generic directly, not type(T)
+                    full_obj = pydantic.parse_obj_as(pydantic_type, value)
+                    return full_obj
+
+            return process
+
+        def compare_values(self, x, y):
+            """Compare two values of this type with plain equality."""
+            return x == y
+
+    return PydanticJSONType
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from . import logging, async_utils, types, strings, collections, tests`