Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace btree with btree_gin indexes for PostgreSQL to allow larger metadata and faster nested search #588

Merged
merged 14 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,13 @@ jobs:
shell: bash -l {0}
run: |
set -vxeuo pipefail
coverage run -m pytest -v -m "not slow"
coverage report
env:
# Provide test suite with a PostgreSQL database to use.
export TILED_TEST_POSTGRESQL_URI=postgresql+asyncpg://postgres:secret@localhost:5432
TILED_TEST_POSTGRESQL_URI: postgresql+asyncpg://postgres:secret@localhost:5432
# Opt in to LDAPAuthenticator tests.
export TILED_TEST_LDAP=1
coverage run -m pytest -v
coverage report
TILED_TEST_LDAP: 1

windows_checks:
runs-on: windows-latest
Expand Down
3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,6 @@ log_cli = 1
log_cli_level = WARNING
log_cli_format = %(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)
log_cli_date_format=%Y-%m-%d %H:%M:%S
addopts = --strict-markers -m 'not slow'
markers =
slow: marks tests as slow (deselect with '-m "not slow"')
3 changes: 2 additions & 1 deletion tiled/_tests/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,9 +156,10 @@ async def test_search(a):
assert await d.search(Eq("number", 12)).keys_range(0, 5) == ["c"]


@pytest.mark.slow
@pytest.mark.asyncio
async def test_metadata_index_is_used(a):
for i in range(10):
for i in range(10000):
await a.create_node(
metadata={
"number": i,
Expand Down
36 changes: 34 additions & 2 deletions tiled/catalog/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import shutil
import sys
import uuid
from functools import partial
from functools import partial, reduce
from pathlib import Path
from urllib.parse import quote_plus, urlparse

Expand Down Expand Up @@ -938,9 +938,18 @@ def _prepare_structure(structure_family, structure):

def binary_op(query, tree, operation):
    """Build a SQL condition for a binary comparison query (Eq, Lt, ...).

    Parameters
    ----------
    query :
        A query object with a dotted ``key`` (path into the metadata JSON)
        and a ``value`` to compare against.
    tree :
        The catalog adapter; supplies the SQL dialect and accumulates
        conditions.
    operation :
        A binary callable from the ``operator`` module (e.g. ``operator.eq``).

    Returns
    -------
    A new variation of ``tree`` with the condition appended.
    """
    dialect_name = tree.engine.url.get_dialect().name
    keys = query.key.split(".")
    attr = orm.Node.metadata_[keys]
    if dialect_name == "sqlite":
        condition = operation(_get_value(attr, type(query.value)), query.value)
    # Specific case where the GIN-optimized index can be used to speed up
    # PostgreSQL equality queries: JSONB containment (@>) of a nested object.
    elif (dialect_name == "postgresql") and (operation == operator.eq):
        condition = orm.Node.metadata_.op("@>")(
            type_coerce(
                key_array_to_json(keys, query.value),
                orm.Node.metadata_.type,
            )
        )
    else:
        condition = operation(attr, type_coerce(query.value, orm.Node.metadata_.type))
    return tree.new_variation(conditions=tree.conditions + [condition])
Expand Down Expand Up @@ -1097,6 +1106,29 @@ def json_serializer(obj):
return safe_json_dump(obj).decode()


def key_array_to_json(keys, value):
    """Build a nested JSON object from a sequence of keys and a value.

    Used to construct the right-hand side of a PostgreSQL JSONB
    containment (``@>``) query.

    Parameters
    ----------
    keys : sequence
        Keys to nest in the object, outermost first.
    value :
        JSON-serializable value assigned to the innermost key.

    Returns
    -------
    dict
        Nested JSON object for use in PostgreSQL queries. If ``keys`` is
        empty, ``value`` itself is returned unchanged.

    Examples
    --------
    >>> key_array_to_json(['x', 'y', 'z'], 1)
    {'x': {'y': {'z': 1}}}
    """
    # Fold from the innermost key outward:
    # 1 -> {'z': 1} -> {'y': {'z': 1}} -> {'x': {'y': {'z': 1}}}
    return reduce(lambda nested, key: {key: nested}, reversed(keys), value)


STRUCTURES = {
StructureFamily.container: CatalogContainerAdapter,
StructureFamily.array: CatalogArrayAdapter,
Expand Down
7 changes: 5 additions & 2 deletions tiled/catalog/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,20 @@

# This is the alembic revision ID of the database revision
# required by this version of Tiled.
REQUIRED_REVISION = "0b033e7fbe30"
REQUIRED_REVISION = "3db11ff95b6c"

# This is list of all valid revisions (from current to oldest).
ALL_REVISIONS = ["0b033e7fbe30", "83889e049ddc", "6825c778aa3c"]
ALL_REVISIONS = ["3db11ff95b6c", "0b033e7fbe30", "83889e049ddc", "6825c778aa3c"]


async def initialize_database(engine):
# The definitions in .orm alter Base.metadata.
from . import orm # noqa: F401

async with engine.connect() as connection:
# Install extensions
if engine.dialect.name == "postgresql":
await connection.execute(text("create extension btree_gin;"))
# Create all tables.
await connection.run_sync(Base.metadata.create_all)
if engine.dialect.name == "sqlite":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Changing top_level_metadata to btree_gin

Revision ID: 3db11ff95b6c
Revises: 0b033e7fbe30
Create Date: 2023-11-01 15:16:48.554420

"""
import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "3db11ff95b6c"
down_revision = "0b033e7fbe30"
branch_labels = None
depends_on = None


def upgrade():
    """Replace the btree 'top_level_metadata' index with a btree_gin index.

    PostgreSQL only; other dialects (e.g. SQLite) are a no-op because they
    do not support GIN indexes.
    """
    connection = op.get_bind()
    if connection.engine.dialect.name == "postgresql":
        # CREATE EXTENSION cannot run inside the migration's surrounding
        # transaction, hence the autocommit block.
        # NOTE(review): assumes drop_index/create_index are also intended to
        # run inside this autocommit block — indentation was ambiguous; confirm.
        with op.get_context().autocommit_block():
            # btree_gin allows scalar (btree-able) columns such as
            # 'ancestors', 'time_created', and 'id' to participate in a
            # GIN index alongside the JSONB 'metadata' column.
            op.execute(sa.text("create extension IF NOT EXISTS btree_gin;"))
            op.drop_index("top_level_metadata", table_name="nodes")
            op.create_index(
                "top_level_metadata",
                "nodes",
                ["ancestors", "time_created", "id", "metadata"],
                postgresql_using="gin",
            )


def downgrade():
    """Downgrade is intentionally unsupported.

    This _could_ be implemented, but we will wait for a need since we are
    still in alpha releases.
    """
    raise NotImplementedError
2 changes: 1 addition & 1 deletion tiled/catalog/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class Node(Timestamped, Base):
"time_created",
"id",
"metadata",
postgresql_using="btree",
postgresql_using="gin",
),
# This is used by ORDER BY with the default sorting.
# Index("ancestors_time_created", "ancestors", "time_created"),
Expand Down