From da387d490cc2325b382d7e72680d33ec8c4b6ed1 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Sun, 26 Nov 2023 15:58:58 +0100 Subject: [PATCH] Remove rollback from ``__check_jobs_at_startup`` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I think the premise that we want to do a rollback on exceptions in this method is wrong (it **may** be correct apprach in other places in the codebase e.g. in `Tool.handle_single_execution()`). Here it prevents us from comitting anything inside the with statement (as the job_wrapper.fail method does). Here's the simplified issue: ```shell ❯ python -i scripts/db_shell.py -c config/galaxy.yml >>> with sa_session() as session, session.begin(): ... sa_session.execute(update(Job).where(Job.id == 1).values(state="error")) ... sa_session.commit() ... sa_session.execute(update(Job).where(Job.id == 1).values(state="ok")) ... sa_session.commit() ... Traceback (most recent call last): File "", line 4, in File "", line 2, in execute File "/Users/mvandenb/src/galaxy/.venv/lib/python3.11/site-packages/sqlalchemy/orm/session.py", line 1711, in execute conn = self._connection_for_bind(bind, close_with_result=True) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/mvandenb/src/galaxy/.venv/lib/python3.11/site-packages/sqlalchemy/orm/session.py", line 1552, in _connection_for_bind TransactionalContext._trans_ctx_check(self) File "/Users/mvandenb/src/galaxy/.venv/lib/python3.11/site-packages/sqlalchemy/engine/util.py", line 199, in _trans_ctx_check raise exc.InvalidRequestError( sqlalchemy.exc.InvalidRequestError: Can't operate on closed transaction inside context manager. Please complete the context manager before emitting further commands. ``` It is probably still worthwhile to have the job recovery be minimal and do things such as calling the job wrapper fail method that does actual work to the job handler as in https://github.com/galaxyproject/galaxy/pull/17083/, but that's refactoring that can be done on the dev branch and it still seems risky in the sense that we then need to be very careful in ensuring we don't commit anywhere else inside the scope of the begin() statement. Finally I don't think it makes sense that the startup check should ever cause the boot process to fail. This isn't a misconfiguration or even anything catastrophic for the remaining jobs and places unnecessary stress on admins and can basically break at any time and shouldn't cause a complete service failure. Fixes https://github.com/galaxyproject/galaxy/issues/17079 --- lib/galaxy/jobs/handler.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/galaxy/jobs/handler.py b/lib/galaxy/jobs/handler.py index 92beb0b4f1a0..3daeef1044e9 100644 --- a/lib/galaxy/jobs/handler.py +++ b/lib/galaxy/jobs/handler.py @@ -273,12 +273,13 @@ def __check_jobs_at_startup(self): In case the activation is enforced it will filter out the jobs of inactive users. """ stmt = self._build_check_jobs_at_startup_statement() - with self.sa_session() as session, session.begin(): - try: - for job in session.scalars(stmt): - with session.begin_nested(): - self._check_job_at_startup(job) - finally: + with self.sa_session() as session: + for job in session.scalars(stmt): + try: + self._check_job_at_startup(job) + except Exception: + log.exception("Error while recovering job %s during application startup.", job.id) + with transaction(session): session.commit() def _check_job_at_startup(self, job):