diff --git a/qp_woltka/__init__.py b/qp_woltka/__init__.py index 5d02462..68ae5bc 100644 --- a/qp_woltka/__init__.py +++ b/qp_woltka/__init__.py @@ -46,14 +46,15 @@ 'input': ('artifact', ['per_sample_FASTQ']) } opt_params = { - 'Database': [f'choice: ["{db_path}"]', db_path] + 'Database': [f'choice: ["{db_path}"]', db_path], + 'min_sample_counts': ('integer', '1') } outputs = { 'SynDNA hits': 'BIOM', 'reads without SynDNA': 'per_sample_FASTQ', } dflt_param_set = { - 'SynDNA': {'Database': db_path}, + 'SynDNA': {'Database': db_path, 'min_sample_counts': 1}, } syndna_cmd = QiitaCommand( 'SynDNA Woltka', "Process SynDNA reads using woltka", woltka_syndna, @@ -65,11 +66,21 @@ 'synDNA hits': ('artifact', ['BIOM']), 'Woltka per-genome': ('artifact', ['BIOM']) } -opt_params = {} +opt_params = { + 'min_coverage': ('integer', '1'), + 'read_length': ('integer', '150'), + 'min_rsquared': ('float', '0.8'), +} outputs = { 'Cell counts': 'BIOM' } -dflt_param_set = {} +dflt_param_set = { + '150bp @ min_coverage:1 R^2:0.8': { + 'min_coverage': 1, + 'read_length': 150, + 'min_rsquared': 0.8 + } +} calculate_cell_counts_cmd = QiitaCommand( 'Calculate Cell Counts', "Calculate cell counts per-genome", calculate_cell_counts, req_params, opt_params, outputs, dflt_param_set) diff --git a/qp_woltka/support_files/fit_syndna_models_log.txt b/qp_woltka/support_files/fit_syndna_models_log.txt new file mode 100644 index 0000000..e69de29 diff --git a/qp_woltka/support_files/lin_regress_by_sample_id.yaml b/qp_woltka/support_files/lin_regress_by_sample_id.yaml new file mode 100644 index 0000000..bd775bd --- /dev/null +++ b/qp_woltka/support_files/lin_regress_by_sample_id.yaml @@ -0,0 +1,14 @@ +1.SKB8.640193: + intercept: 9.755082803643 + intercept_stderr: 3.058237204847 + pvalue: 0.01197011187 + rvalue: -0.78628520649 + slope: -3.509994537266 + stderr: 1.04248546695 +1.SKD8.640184: + intercept: 8.352404328098 + intercept_stderr: 2.313971537139 + pvalue: 0.005567676743 + rvalue: -0.798913662914 + slope: -2.991479594754 + stderr: 0.79622836446 diff --git a/qp_woltka/support_files/syndna.biom b/qp_woltka/support_files/syndna.biom new file mode 100644 index 0000000..55199a4 Binary files /dev/null and b/qp_woltka/support_files/syndna.biom differ diff --git a/qp_woltka/tests/test_woltka.py b/qp_woltka/tests/test_woltka.py index 7b446d1..e46bfab 100644 --- a/qp_woltka/tests/test_woltka.py +++ b/qp_woltka/tests/test_woltka.py @@ -17,7 +17,8 @@ from qp_woltka import plugin from qp_woltka.woltka import ( - woltka_to_array, woltka, woltka_syndna_to_array, woltka_syndna) + woltka_to_array, woltka, woltka_syndna_to_array, woltka_syndna, + calculate_cell_counts) class WoltkaTests(PluginTestCase): @@ -415,8 +416,12 @@ def test_creation_error_no_unique_run_prefix(self): def test_woltka_syndna_to_array(self): # inserting new prep template prep_info_dict = { - 'SKB8.640193': {'run_prefix': 'S22205_S104'}, - 'SKD8.640184': {'run_prefix': 'S22282_S102'}} + 'SKB8.640193': { + 'run_prefix': 'S22205_S104', 'syndna_pool_number': 1, + 'raw_reads_r1r2': 10000, 'mass_syndna_input_ng': 120}, + 'SKD8.640184': { + 'run_prefix': 'S22282_S102', 'syndna_pool_number': 1, + 'raw_reads_r1r2': 10002, 'mass_syndna_input_ng': 120}} pid, aid, job_id = self._helper_woltka_bowtie(prep_info_dict) out_dir = mkdtemp() @@ -518,7 +523,7 @@ def test_woltka_syndna_to_array(self): # now let's test that if finished correctly sdir = 'qp_woltka/support_files/' - copyfile(f'{sdir}/none.biom', f'{out_dir}/syndna.biom') + copyfile(f'{sdir}/syndna.biom', f'{out_dir}/syndna.biom') # we just need the file to exists so is fine to use the biom as tar mkdir(f'{out_dir}/sams') mkdir(f'{out_dir}/sams/final') @@ -534,8 +539,10 @@ def test_woltka_syndna_to_array(self): copyfile(iname, jname) reads.append((jname, ft)) + params = self.params.copy() + params['min_sample_counts'] = 1 success, ainfo, msg = woltka_syndna( - self.qclient, job_id, self.params, out_dir) + self.qclient, job_id, params, out_dir) self.assertEqual("", msg) self.assertTrue(success) @@ -543,10 +550,69 @@ def test_woltka_syndna_to_array(self): exp = [ ArtifactInfo('SynDNA hits', 'BIOM', [(f'{out_dir}/syndna.biom', 'biom'), - (f'{out_dir}/sams/final/alignment.tar', 'log')]), + (f'{out_dir}/sams/final/alignment.tar', 'log'), + (f'{out_dir}/lin_regress_by_sample_id.yaml', 'log'), + (f'{out_dir}/fit_syndna_models_log.txt', 'log')]), ArtifactInfo('reads without SynDNA', 'per_sample_FASTQ', reads)] self.assertCountEqual(ainfo, exp) + def test_calculate_cell_counts(self): + params = {'synDNA hits': 5, 'Woltka per-genome': 6, + 'min_coverage': 1, 'read_length': 150, + 'min_rsquared': 0.8} + job_id = 'my-job-id' + out_dir = mkdtemp() + self._clean_up_files.append(out_dir) + + # this should fail cause we don't have valid data + success, ainfo, msg = calculate_cell_counts( + self.qclient, job_id, params, out_dir) + self.assertFalse(success) + self.assertEqual(msg, "No logs found, are you sure you selected the " + "correct artifact for 'synDNA hits'?") + + # this should fail too because but now we are getting deeper into + # the validation + prep_info_dict = { + 'SKB8.640193': { + 'run_prefix': 'S22205_S104', 'syndna_pool_number': 1, + 'raw_reads_r1r2': 10000, 'mass_syndna_input_ng': 120, + 'minipico_dna_concentration_ng_ul': 10, 'vol_elute_ul': 5}, + 'SKD8.640184': { + 'run_prefix': 'S22282_S102', 'syndna_pool_number': 1, + 'raw_reads_r1r2': 10002, 'mass_syndna_input_ng': 120, + 'minipico_dna_concentration_ng_ul': 11, 'vol_elute_ul': 6}} + data = {'prep_info': dumps(prep_info_dict), + 'study': 1, + 'data_type': 'Metagenomic'} + pid = self.qclient.post('/apitest/prep_template/', data=data)['prep'] + + sdir = 'qp_woltka/support_files/' + fp_to_cp = [ + 'fit_syndna_models_log.txt', 'lin_regress_by_sample_id.yaml', + 'syndna.biom'] + for fp in fp_to_cp: + copyfile(f'{sdir}/{fp}', f'{out_dir}/{fp}') + data = { + 'filepaths': dumps([ + (f'{out_dir}/syndna.biom', 'biom'), + (f'{out_dir}/lin_regress_by_sample_id.yaml', 'log'), + (f'{out_dir}/fit_syndna_models_log.txt', 'log')]), + 'type': "BIOM", + 'name': "SynDNA Hits - Test", + 'prep': pid} + params['synDNA hits'] = self.qclient.post( + '/apitest/artifact/', data=data)['artifact'] + + success, ainfo, msg = calculate_cell_counts( + self.qclient, job_id, params, out_dir) + self.assertFalse(success) + self.assertEqual(msg, "The selected 'Woltka per-genome' artifact " + "doesn't look like one, did you select the correct " + "file?") + + # Finally, adding a full test is close to impossible - too many steps. + if __name__ == '__main__': main() diff --git a/qp_woltka/woltka.py b/qp_woltka/woltka.py index 1787bc3..0309578 100644 --- a/qp_woltka/woltka.py +++ b/qp_woltka/woltka.py @@ -11,6 +11,11 @@ from glob import glob from shutil import copy2 from math import ceil +from biom import load_table +from biom.util import biom_open +import pandas as pd +from pysyndna import fit_linear_regression_models_for_qiita +from pysyndna import calc_ogu_cell_counts_per_g_of_sample_for_qiita from qp_woltka.util import search_by_filename @@ -356,6 +361,10 @@ def woltka_syndna_to_array(files, output, database_bowtie2, prep, url, name): db_folder = dirname(database_bowtie2) db_name = basename(database_bowtie2) + # storing the prep so we can use later + preparation_information = join(output, 'prep_info.tsv') + prep.set_index('sample_name').to_csv(preparation_information, sep='\t') + lookup = prep.set_index('run_prefix')['sample_name'].to_dict() n_files = 1 for i, (k, (f, r)) in enumerate(files.items()): @@ -476,7 +485,7 @@ def woltka_syndna(qclient, job_id, parameters, out_dir): job_id : str The job id parameters : dict - The parameter values to run split libraries + The parameter values to wolka syndna out_dir : str The path to the job's output directory @@ -490,15 +499,23 @@ def woltka_syndna(qclient, job_id, parameters, out_dir): fp_biom = f'{out_dir}/syndna.biom' fp_alng = f'{out_dir}/sams/final/alignment.tar' if exists(fp_biom) and exists(fp_alng): - # ToDo: - # add call and creation of synDNA regression - # job_info = qclient.get_job_info(job_id) - # artifact_id = parameters['input'] - # files, prep = qclient.artifact_and_preparation_files(artifact_id) - # print(files, prep) - + # if we got to this point a preparation file should exist in + # the output folder + prep = pd.read_csv( + f'{out_dir}/prep_info.tsv', index_col=None, sep='\t') + output = fit_linear_regression_models_for_qiita( + prep, load_table(fp_biom), parameters['min_sample_counts']) + # saving results to disk + lin_regress_results_fp = f'{out_dir}/lin_regress_by_sample_id.yaml' + fit_syndna_models_log_fp = f'{out_dir}/fit_syndna_models_log.txt' + with open(lin_regress_results_fp, 'w') as fp: + fp.write(output['lin_regress_by_sample_id']) + with open(fit_syndna_models_log_fp, 'w') as fp: + fp.write(output['fit_syndna_models_log']) ainfo = [ArtifactInfo('SynDNA hits', 'BIOM', [ - (fp_biom, 'biom'), (fp_alng, 'log')])] + (fp_biom, 'biom'), (fp_alng, 'log'), + (lin_regress_results_fp, 'log'), + (fit_syndna_models_log_fp, 'log')])] else: ainfo = [] errors.append('Missing files from the "SynDNA hits"; please ' @@ -533,6 +550,84 @@ def woltka_syndna(qclient, job_id, parameters, out_dir): def calculate_cell_counts(qclient, job_id, parameters, out_dir): + """Run calc_ogu_cell_counts_per_g_of_sample_for_qiita + + Parameters + ---------- + qclient : tgp.qiita_client.QiitaClient + The Qiita server client + job_id : str + The job id + parameters : dict + The parameter values to wolka syndna + out_dir : str + The path to the job's output directory + + Returns + ------- + bool, list, str + The results of the job """ - """ - raise ValueError('Not implemented') + error = '' + # let's get the syndna_id and prep in a single go + syndna_id = parameters['synDNA hits'] + syndna_files, prep = qclient.artifact_and_preparation_files(syndna_id) + if 'log' not in syndna_files.keys(): + error = ("No logs found, are you sure you selected the correct " + "artifact for 'synDNA hits'?") + else: + + lin_regress_by_sample_id_fp = [f for f in syndna_files['log'] + if 'lin_regress_by_sample_id' in f] + if not lin_regress_by_sample_id_fp: + error = ("No 'lin_regress_by_sample_id' log found, are you sure " + " you selected the correct artifact for 'synDNA hits'?") + else: + lin_regress_by_sample_id_fp = lin_regress_by_sample_id_fp[0] + + # for per_genome_id let's do it separately so we can also ge the + # sample information + per_genome_id = parameters['Woltka per-genome'] + ainfo = qclient.get("/qiita_db/artifacts/%s/" % per_genome_id) + aparams = ainfo['processing_parameters'] + ogu_fp = ainfo['files']['biom'][0]['filepath'] + + if 'Database' not in aparams or not ogu_fp.endswith('none.biom'): + error = ("The selected 'Woltka per-genome' artifact doesn't " + "look like one, did you select the correct file?") + else: + ogu_counts_per_sample = load_table(ogu_fp) + + db_files = _process_database_files(aparams['Database']) + ogu_lengths_fp = db_files["length.map"] + + sample_info = qclient.get( + '/qiita_db/prep_template/%s/data/?sample_information=true' + % ainfo['prep_information'][0]) + sample_info = pd.DataFrame.from_dict( + sample_info['data'], orient='index') + sample_info.reset_index(names='sample_name', inplace=True) + + if error: + return False, None, error + + try: + output = calc_ogu_cell_counts_per_g_of_sample_for_qiita( + sample_info, prep, lin_regress_by_sample_id_fp, + ogu_counts_per_sample, ogu_lengths_fp, + parameters['read_length'], parameters['min_rsquared'], + parameters['min_rsquared']) + except Exception as e: + return False, None, str(e) + + log_fp = f'{out_dir}/cell_counts.log' + with open(log_fp, 'w') as f: + f.write(output['calc_cell_counts_log']) + biom_fp = f'{out_dir}/cell_counts.biom' + with biom_open(biom_fp, 'w') as f: + output['cell_count_biom'].to_hdf5(f, f"Cell Counts - {job_id}") + ainfo = [ + ArtifactInfo( + 'Cell counts', 'BIOM', [(biom_fp, 'biom'), (log_fp, 'log')])] + + return True, ainfo, "" diff --git a/setup.py b/setup.py index 260f344..4f15419 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,8 @@ 'h5py >= 2.3.1', 'biom-format', # forcing 0.1.4 because Qiita uses this version 'woltka @ https://github.com/' - 'qiyunzhu/woltka/archive/refs/tags/v0.1.4.zip'], + 'qiyunzhu/woltka/archive/refs/tags/v0.1.4.zip', + 'pysyndna @ https://github.com/AmandaBirmingham/' + 'pysyndna/archive/refs/heads/main.zip'], classifiers=classifiers )