diff --git a/CHANGELOG.md b/CHANGELOG.md index d29e6491d..33fb0303a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ Deployed on October 14th, 2024 * `Woltka v0.1.7, paired-end` superseded `Woltka v0.1.6` in `qp-woltka`; [more information](https://qiita.ucsd.edu/static/doc/html/processingdata/woltka_pairedend.html). Thank you to @qiyunzhu for the benchmarks! * Other general fixes, like [#3424](https://github.com/qiita-spots/qiita/pull/3424), [#3425](https://github.com/qiita-spots/qiita/pull/3425), [#3439](https://github.com/qiita-spots/qiita/pull/3439), [#3440](https://github.com/qiita-spots/qiita/pull/3440). * General SPP improvements, like: [NuQC modified to preserve metadata in fastq files](https://github.com/biocore/mg-scripts/pull/155), [use squeue instead of sacct](https://github.com/biocore/mg-scripts/pull/152), , [job aborts if Qiita study contains sample metadata columns reserved for prep-infos](https://github.com/biocore/mg-scripts/pull/151), [metapool generates OverrideCycles value](https://github.com/biocore/metagenomics_pooling_notebook/pull/225). +* We updated the available parameters for `Filter features against reference [filter_features]`, `Non V4 16S sequence assessment [non_v4_16s]` and all the phylogenetic analytical commands so they can use `Greengenes2 2024.09`. 
diff --git a/qiita_db/handlers/processing_job.py b/qiita_db/handlers/processing_job.py index 6bb15cdf4..832d2407a 100644 --- a/qiita_db/handlers/processing_job.py +++ b/qiita_db/handlers/processing_job.py @@ -146,7 +146,9 @@ def post(self, job_id): cmd, values_dict={'job_id': job_id, 'payload': self.request.body.decode( 'ascii')}) - job = qdb.processing_job.ProcessingJob.create(job.user, params) + # complete_job are unique so it is fine to force them to be created + job = qdb.processing_job.ProcessingJob.create( + job.user, params, force=True) job.submit() self.finish() diff --git a/qiita_db/processing_job.py b/qiita_db/processing_job.py index 11145925b..a8844d181 100644 --- a/qiita_db/processing_job.py +++ b/qiita_db/processing_job.py @@ -582,10 +582,10 @@ def create(cls, user, parameters, force=False): TTRN = qdb.sql_connection.TRN with TTRN: command = parameters.command - - # check if a job with the same parameters already exists - sql = """SELECT processing_job_id, email, processing_job_status, - COUNT(aopj.artifact_id) + if not force: + # check if a job with the same parameters already exists + sql = """SELECT processing_job_id, email, + processing_job_status, COUNT(aopj.artifact_id) FROM qiita.processing_job LEFT JOIN qiita.processing_job_status USING (processing_job_status_id) @@ -596,41 +596,42 @@ def create(cls, user, parameters, force=False): GROUP BY processing_job_id, email, processing_job_status""" - # we need to use ILIKE because of booleans as they can be - # false or False - params = [] - for k, v in parameters.values.items(): - # this is necessary in case we have an Iterable as a value - # but that is string - if isinstance(v, Iterable) and not isinstance(v, str): - for vv in v: - params.extend([k, str(vv)]) + # we need to use ILIKE because of booleans as they can be + # false or False + params = [] + for k, v in parameters.values.items(): + # this is necessary in case we have an Iterable as a value + # but that is string + if isinstance(v, 
Iterable) and not isinstance(v, str): + for vv in v: + params.extend([k, str(vv)]) + else: + params.extend([k, str(v)]) + + if params: + # divided by 2 as we have key-value pairs + len_params = int(len(params)/2) + sql = sql.format(' AND ' + ' AND '.join( + ["command_parameters->>%s ILIKE %s"] * len_params)) + params = [command.id] + params + TTRN.add(sql, params) else: - params.extend([k, str(v)]) - - if params: - # divided by 2 as we have key-value pairs - len_params = int(len(params)/2) - sql = sql.format(' AND ' + ' AND '.join( - ["command_parameters->>%s ILIKE %s"] * len_params)) - params = [command.id] + params - TTRN.add(sql, params) - else: - # the sql variable expects the list of parameters but if there - # is no param we need to replace the {0} with an empty string - TTRN.add(sql.format(""), [command.id]) - - # checking that if the job status is success, it has children - # [2] status, [3] children count - existing_jobs = [r for r in TTRN.execute_fetchindex() - if r[2] != 'success' or r[3] > 0] - if existing_jobs and not force: - raise ValueError( - 'Cannot create job because the parameters are the same as ' - 'jobs that are queued, running or already have ' - 'succeeded:\n%s' % '\n'.join( - ["%s: %s" % (jid, status) - for jid, _, status, _ in existing_jobs])) + # the sql variable expects the list of parameters but if + # there is no param we need to replace the {0} with an + # empty string + TTRN.add(sql.format(""), [command.id]) + + # checking that if the job status is success, it has children + # [2] status, [3] children count + existing_jobs = [r for r in TTRN.execute_fetchindex() + if r[2] != 'success' or r[3] > 0] + if existing_jobs: + raise ValueError( + 'Cannot create job because the parameters are the ' + 'same as jobs that are queued, running or already ' + 'have succeeded:\n%s' % '\n'.join( + ["%s: %s" % (jid, status) + for jid, _, status, _ in existing_jobs])) sql = """INSERT INTO qiita.processing_job (email, command_id, command_parameters, 
diff --git a/qiita_db/support_files/patches/93.sql b/qiita_db/support_files/patches/93.sql new file mode 100644 index 000000000..81abc3331 --- /dev/null +++ b/qiita_db/support_files/patches/93.sql @@ -0,0 +1,57 @@ +-- Oct 18, 2024 +-- ProcessingJob.create can take up to 52 seconds if creating a complete_job; mainly +-- due to the number of jobs of this command and using json. The solution in the database +-- is to convert to jsonb and index the values of the database + +-- ### These are the stats before the change in a single example +-- GroupAggregate (cost=67081.81..67081.83 rows=1 width=77) (actual time=51859.962..51862.637 rows=1 loops=1) +-- Group Key: processing_job.processing_job_id, processing_job_status.processing_job_status +-- -> Sort (cost=67081.81..67081.81 rows=1 width=77) (actual time=51859.952..51862.627 rows=1 loops=1) +-- Sort Key: processing_job.processing_job_id, processing_job_status.processing_job_status +-- Sort Method: quicksort Memory: 25kB +-- -> Nested Loop Left Join (cost=4241.74..67081.80 rows=1 width=77) (actual time=51859.926..51862.604 rows=1 loops=1) +-- -> Nested Loop (cost=4237.30..67069.64 rows=1 width=69) (actual time=51859.889..51862.566 rows=1 loops=1) +-- Join Filter: (processing_job.processing_job_status_id = processing_job_status.processing_job_status_id) +-- Rows Removed by Join Filter: 1 +-- -> Gather (cost=4237.30..67068.50 rows=1 width=45) (actual time=51859.846..51862.522 rows=1 loops=1) +-- Workers Planned: 2 +-- Workers Launched: 2 +-- -> Parallel Bitmap Heap Scan on processing_job (cost=3237.30..66068.40 rows=1 width=45) (actual time=51785.317..51785.446 rows=0 loops=3) +-- Recheck Cond: (command_id = 83) +-- Filter: (((command_parameters ->> 'job_id'::text) ~~* '3432a908-f7b8-4e36-89fc-88f3310b84d5'::text) AND ((command_parameters ->> ' +-- payload'::text) ~~* '{"success": true, "error": "", "artifacts": {"alpha_diversity": {"artifact_type": "alpha_vector", "filepaths": [["/qmounts/qiita_test_data/tes +-- 
tlocal/working_dir/3432a908-f7b8-4e36-89fc-88f3310b84d5/alpha_phylogenetic/alpha_diversity/alpha-diversity.tsv", "plain_text"], ["/qmounts/qiita_test_data/testloca +-- l/working_dir/3432a908-f7b8-4e36-89fc-88f3310b84d5/alpha_phylogenetic/alpha_diversity.qza", "qza"]], "archive": {}}}}'::text)) +-- Rows Removed by Filter: 97315 +-- Heap Blocks: exact=20133 +-- -> Bitmap Index Scan on idx_processing_job_command_id (cost=0.00..3237.30 rows=294517 width=0) (actual time=41.569..41.569 rows= +-- 293054 loops=1) +-- Index Cond: (command_id = 83) +-- -> Seq Scan on processing_job_status (cost=0.00..1.09 rows=4 width=40) (actual time=0.035..0.035 rows=2 loops=1) +-- Filter: ((processing_job_status)::text = ANY ('{success,waiting,running,in_construction}'::text[])) +-- Rows Removed by Filter: 1 +-- -> Bitmap Heap Scan on artifact_output_processing_job aopj (cost=4.43..12.14 rows=2 width=24) (actual time=0.031..0.031 rows=0 loops=1) +-- Recheck Cond: (processing_job.processing_job_id = processing_job_id) +-- -> Bitmap Index Scan on idx_artifact_output_processing_job_job (cost=0.00..4.43 rows=2 width=0) (actual time=0.026..0.026 rows=0 loops=1) +-- Index Cond: (processing_job_id = processing_job.processing_job_id) +-- Planning Time: 1.173 ms +-- Execution Time: 51862.756 ms + +-- Note: for this to work you need to have created as admin the extension +-- CREATE EXTENSION pg_trgm; +CREATE EXTENSION IF NOT EXISTS "pg_trgm" WITH SCHEMA public; + +-- This alter table will take close to 11 min +ALTER TABLE qiita.processing_job + ALTER COLUMN command_parameters TYPE JSONB USING command_parameters::jsonb; + +-- This indexing will take like 5 min +CREATE INDEX IF NOT EXISTS processing_job_command_parameters_job_id ON qiita.processing_job + USING GIN((command_parameters->>'job_id') gin_trgm_ops); + +-- This indexing will take like an hour +CREATE INDEX IF NOT EXISTS processing_job_command_parameters_payload ON qiita.processing_job + USING GIN((command_parameters->>'payload') 
gin_trgm_ops); + +-- After the changes, the same example query's execution time dropped to: +-- 18710.404 ms