Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add unique constraints to the email and username fields in the galaxy_user table #18493

Open
wants to merge 13 commits into
base: dev
Choose a base branch
from
Open
2 changes: 1 addition & 1 deletion lib/galaxy/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,7 @@ class User(Base, Dictifiable, RepresentById):
id: Mapped[int] = mapped_column(primary_key=True)
create_time: Mapped[datetime] = mapped_column(default=now, nullable=True)
update_time: Mapped[datetime] = mapped_column(default=now, onupdate=now, nullable=True)
email: Mapped[str] = mapped_column(TrimmedString(255), index=True)
email: Mapped[str] = mapped_column(TrimmedString(255), index=True, unique=True)
username: Mapped[Optional[str]] = mapped_column(TrimmedString(255), index=True, unique=True)
password: Mapped[str] = mapped_column(TrimmedString(255))
last_password_change: Mapped[Optional[datetime]] = mapped_column(default=now)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Email column unique constraint

Revision ID: 1cf595475b58
Revises: d619fdfa6168
Create Date: 2024-07-03 19:53:22.443016
"""

from alembic import op

from galaxy.model.database_object_names import build_index_name
from galaxy.model.migrations.data_fixes.user_table_fixer import EmailDeduplicator
from galaxy.model.migrations.util import (
create_index,
drop_index,
index_exists,
transaction,
)

# revision identifiers, used by Alembic.
revision = "1cf595475b58"
down_revision = "d619fdfa6168"
branch_labels = None
depends_on = None


table_name = "galaxy_user"
column_name = "email"
index_name = build_index_name(table_name, [column_name])


def upgrade():
with transaction():
_fix_duplicate_emails()
# Existing databases may have an existing index we no longer need
# New databases will not have that index, so we must check.
if index_exists(index_name, table_name, False):
drop_index(index_name, table_name)
# Create a UNIQUE index
create_index(index_name, table_name, [column_name], unique=True)


def downgrade():
with transaction():
drop_index(index_name, table_name)
# Restore a non-unique index
create_index(index_name, table_name, [column_name])


def _fix_duplicate_emails():
"""Fix records with duplicate usernames"""
connection = op.get_bind()
EmailDeduplicator(connection).run()
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Username column unique constraint

Revision ID: d619fdfa6168
Revises: d2d8f51ebb7e
Create Date: 2024-07-02 13:13:10.325586
"""

from alembic import op

from galaxy.model.database_object_names import build_index_name
from galaxy.model.migrations.data_fixes.user_table_fixer import UsernameDeduplicator
from galaxy.model.migrations.util import (
create_index,
drop_index,
index_exists,
transaction,
)

# revision identifiers, used by Alembic.
revision = "d619fdfa6168"
down_revision = "d2d8f51ebb7e"
branch_labels = None
depends_on = None

table_name = "galaxy_user"
column_name = "username"
index_name = build_index_name(table_name, [column_name])


def upgrade():
with transaction():
_fix_duplicate_usernames()
# Existing databases may have an existing index we no longer need
# New databases will not have that index, so we must check.
if index_exists(index_name, table_name, False):
drop_index(index_name, table_name)
# Create a UNIQUE index
create_index(index_name, table_name, [column_name], unique=True)


def downgrade():
with transaction():
drop_index(index_name, table_name)
# Restore a non-unique index
create_index(index_name, table_name, [column_name])


def _fix_duplicate_usernames():
"""Fix records with duplicate usernames"""
connection = op.get_bind()
UsernameDeduplicator(connection).run()
4 changes: 4 additions & 0 deletions lib/galaxy/model/migrations/data_fixes/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""
Package contains code for fixing inconsistent data in the database that must be
run together with a migration script.
"""
112 changes: 112 additions & 0 deletions lib/galaxy/model/migrations/data_fixes/user_table_fixer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import uuid

from sqlalchemy import (
func,
Result,
select,
text,
update,
)

from galaxy.model import User


class UsernameDeduplicator:

def __init__(self, connection):
self.connection = connection

def run(self):
"""
Deduplicate usernames by generating a unique value for all duplicates, keeping
the username of the most recently created user unchanged.
Records updated with the generated value are marked as deleted.
"""
duplicates = self._get_duplicate_username_data()
prev_username = None
for id, username, _ in duplicates:
if username == prev_username:
new_username = self._generate_next_available_username(username)
stmt = update(User).where(User.id == id).values(username=new_username, deleted=True)
self.connection.execute(stmt)
else:
prev_username = username

def _get_duplicate_username_data(self) -> Result:
# Duplicate usernames
duplicates_stmt = select(User.username).group_by(User.username).having(func.count() > 1)
# User data for records with duplicate usernames (ordering: newest to oldest)
stmt = (
select(User.id, User.username, User.create_time)
.where(User.username.in_(duplicates_stmt))
.order_by(User.username, User.create_time.desc())
)
return self.connection.execute(stmt)

def _generate_next_available_username(self, username):
i = 1
while self.connection.execute(select(User).where(User.username == f"{username}-{i}")).first():
i += 1
return f"{username}-{i}"


class EmailDeduplicator:

def __init__(self, connection):
self.connection = connection

def run(self):
"""
Deduplicate user emails by generating a unique value for all duplicates, keeping
the email of the most recently created user that has one or more history unchanged.
If such a user does not exist, keep the oldest user.
Records updated with the generated value are marked as deleted (we presume them
to be invalid, and the user should not be able to login).
"""
stmt = select(User.email).group_by(User.email).having(func.count() > 1)
duplicate_emails = self.connection.scalars(stmt)
for email in duplicate_emails:
users = self._get_users_with_same_email(email)
user_with_history = self._find_oldest_user_with_history(users)
duplicates = self._get_users_to_deduplicate(users, user_with_history)
self._deduplicate_users(email, duplicates)

def _get_users_with_same_email(self, email: str):
sql = text(
"""
SELECT u.id, EXISTS(SELECT h.id FROM history h WHERE h.user_id = u.id)
FROM galaxy_user u
WHERE u.email = :email
ORDER BY u.create_time
"""
)
params = {"email": email}
return self.connection.execute(sql, params).all()

def _find_oldest_user_with_history(self, users):
for user_id, exists in users:
if exists:
return user_id
return None

def _get_users_to_deduplicate(self, users, user_with_history):
if user_with_history:
# Preserve the oldest user with a history
return [user_id for user_id, _ in users if user_id != user_with_history]
else:
# Preserve the oldest user
return [user_id for user_id, _ in users[1:]]

def _deduplicate_users(self, email, to_deduplicate):
for id in to_deduplicate:
new_email = self._generate_replacement_for_duplicate_email(email)
stmt = update(User).where(User.id == id).values(email=new_email, deleted=True)
self.connection.execute(stmt)

def _generate_replacement_for_duplicate_email(self, email: str) -> str:
"""
Generate a replacement for a duplicate email value. The new value consists of the original
email and a unique suffix. Since the original email is part of the new value, it will be
possible to retrieve the user record based on this value, if needed.
"""
return f"{email}-{uuid.uuid4()}"
13 changes: 13 additions & 0 deletions lib/galaxy/model/unittest_utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import random
import string


def random_str() -> str:
alphabet = string.ascii_lowercase + string.digits
size = random.randint(5, 10)
return "".join(random.choices(alphabet, k=size))


def random_email() -> str:
text = random_str()
return f"{text}@galaxy.testing"
2 changes: 1 addition & 1 deletion test/integration/test_celery_user_rate_limit.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def setup_users(dburl: str, num_users: int = 2):
for user_id in user_ids_to_add:
conn.execute(
text("insert into galaxy_user(id, active, email, password) values (:id, :active, :email, :pw)"),
[{"id": user_id, "active": True, "email": "e", "pw": "p"}],
[{"id": user_id, "active": True, "email": f"e{user_id}", "pw": "p"}],
)


Expand Down
2 changes: 1 addition & 1 deletion test/unit/app/jobs/test_rule_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def __setup_fixtures(app):
# user3 has no jobs.
user1 = model.User(email=USER_EMAIL_1, password="pass1")
user2 = model.User(email=USER_EMAIL_2, password="pass2")
user3 = model.User(email=USER_EMAIL_2, password="pass2")
user3 = model.User(email=USER_EMAIL_3, password="pass3")

app.add(user1, user2, user3)

Expand Down
10 changes: 10 additions & 0 deletions test/unit/data/model/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
PRIVATE_OBJECT_STORE_ID = "my_private_data"


class MockObjectStore:

def is_private(self, object):
if object.object_store_id == PRIVATE_OBJECT_STORE_ID:
return True
else:
return False
17 changes: 4 additions & 13 deletions test/unit/data/model/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import contextlib
import os
import random
import string
import tempfile
import uuid

Expand All @@ -10,6 +8,10 @@
from sqlalchemy.orm import Session

from galaxy import model as m
from galaxy.model.unittest_utils.utils import (
random_email,
random_str,
)


@pytest.fixture
Expand Down Expand Up @@ -449,17 +451,6 @@ def transaction(session):
yield


def random_str() -> str:
alphabet = string.ascii_lowercase + string.digits
size = random.randint(5, 10)
return "".join(random.choices(alphabet, k=size))


def random_email() -> str:
text = random_str()
return f"{text}@galaxy.testing"


def write_to_db(session, model) -> None:
with transaction(session):
session.add(model)
Expand Down
11 changes: 0 additions & 11 deletions test/unit/data/model/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,9 @@
namedtuple,
)

PRIVATE_OBJECT_STORE_ID = "my_private_data"

MockTransaction = namedtuple("MockTransaction", "user")


class MockObjectStore:

def is_private(self, object):
if object.object_store_id == PRIVATE_OBJECT_STORE_ID:
return True
else:
return False


def verify_items(items, expected_items):
"""
Assert that items and expected_items contain the same elements.
Expand Down
2 changes: 1 addition & 1 deletion test/unit/data/model/db/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from galaxy import model as m
from galaxy.datatypes.registry import Registry as DatatypesRegistry
from galaxy.model.triggers.update_audit_table import install as install_timestamp_triggers
from . import MockObjectStore
from .. import MockObjectStore

if TYPE_CHECKING:
from sqlalchemy.engine import Engine
Expand Down
6 changes: 2 additions & 4 deletions test/unit/data/model/db/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,8 @@

from galaxy import model as m
from galaxy.model.unittest_utils.db_helpers import get_hdca_by_name
from . import (
MockTransaction,
PRIVATE_OBJECT_STORE_ID,
)
from . import MockTransaction
from .. import PRIVATE_OBJECT_STORE_ID


def test_history_update(make_history, make_hda, session):
Expand Down
Empty file.
Loading
Loading