Merge pull request #23 from antgonza/pysyndna

pysyndna
qiita-spots · Dec 13, 2023 · 71fe32d · 71fe32d
2 parents 2c6070a + 0c323fd
commit 71fe32d
Show file tree

Hide file tree

Showing 7 changed files with 210 additions and 22 deletions.
diff --git a/qp_woltka/__init__.py b/qp_woltka/__init__.py
@@ -46,14 +46,15 @@
     'input': ('artifact', ['per_sample_FASTQ'])
 }
 opt_params = {
-    'Database': [f'choice: ["{db_path}"]', db_path]
+    'Database': [f'choice: ["{db_path}"]', db_path],
+    'min_sample_counts': ('integer', '1')
 }
 outputs = {
     'SynDNA hits': 'BIOM',
     'reads without SynDNA': 'per_sample_FASTQ',
 }
 dflt_param_set = {
-    'SynDNA': {'Database': db_path},
+    'SynDNA': {'Database': db_path, 'min_sample_counts': 1},
 }
 syndna_cmd = QiitaCommand(
     'SynDNA Woltka', "Process SynDNA reads using woltka", woltka_syndna,
@@ -65,11 +66,21 @@
     'synDNA hits': ('artifact', ['BIOM']),
     'Woltka per-genome': ('artifact', ['BIOM'])
 }
-opt_params = {}
+opt_params = {
+    'min_coverage': ('integer', '1'),
+    'read_length': ('integer', '150'),
+    'min_rsquared': ('float', '0.8'),
+}
 outputs = {
     'Cell counts': 'BIOM'
 }
-dflt_param_set = {}
+dflt_param_set = {
+    '150bp @ min_coverage:1 R^2:0.8': {
+        'min_coverage': 1,
+        'read_length': 150,
+        'min_rsquared': 0.8
+    }
+}
 calculate_cell_counts_cmd = QiitaCommand(
     'Calculate Cell Counts', "Calculate cell counts per-genome",
     calculate_cell_counts, req_params, opt_params, outputs, dflt_param_set)

diff --git a/qp_woltka/support_files/fit_syndna_models_log.txt b/qp_woltka/support_files/fit_syndna_models_log.txt
diff --git a/qp_woltka/support_files/lin_regress_by_sample_id.yaml b/qp_woltka/support_files/lin_regress_by_sample_id.yaml
@@ -0,0 +1,14 @@
+1.SKB8.640193:
+  intercept: 9.755082803643
+  intercept_stderr: 3.058237204847
+  pvalue: 0.01197011187
+  rvalue: -0.78628520649
+  slope: -3.509994537266
+  stderr: 1.04248546695
+1.SKD8.640184:
+  intercept: 8.352404328098
+  intercept_stderr: 2.313971537139
+  pvalue: 0.005567676743
+  rvalue: -0.798913662914
+  slope: -2.991479594754
+  stderr: 0.79622836446
diff --git a/qp_woltka/support_files/syndna.biom b/qp_woltka/support_files/syndna.biom
diff --git a/qp_woltka/tests/test_woltka.py b/qp_woltka/tests/test_woltka.py
@@ -17,7 +17,8 @@
 
 from qp_woltka import plugin
 from qp_woltka.woltka import (
-    woltka_to_array, woltka, woltka_syndna_to_array, woltka_syndna)
+    woltka_to_array, woltka, woltka_syndna_to_array, woltka_syndna,
+    calculate_cell_counts)
 
 
 class WoltkaTests(PluginTestCase):
@@ -415,8 +416,12 @@ def test_creation_error_no_unique_run_prefix(self):
     def test_woltka_syndna_to_array(self):
         # inserting new prep template
         prep_info_dict = {
-            'SKB8.640193': {'run_prefix': 'S22205_S104'},
-            'SKD8.640184': {'run_prefix': 'S22282_S102'}}
+            'SKB8.640193': {
+                'run_prefix': 'S22205_S104', 'syndna_pool_number': 1,
+                'raw_reads_r1r2': 10000, 'mass_syndna_input_ng': 120},
+            'SKD8.640184': {
+                'run_prefix': 'S22282_S102', 'syndna_pool_number': 1,
+                'raw_reads_r1r2': 10002, 'mass_syndna_input_ng': 120}}
         pid, aid, job_id = self._helper_woltka_bowtie(prep_info_dict)
 
         out_dir = mkdtemp()
@@ -518,7 +523,7 @@ def test_woltka_syndna_to_array(self):
 
         # now let's test that if finished correctly
         sdir = 'qp_woltka/support_files/'
-        copyfile(f'{sdir}/none.biom', f'{out_dir}/syndna.biom')
+        copyfile(f'{sdir}/syndna.biom', f'{out_dir}/syndna.biom')
         # we just need the file to exists so is fine to use the biom as tar
         mkdir(f'{out_dir}/sams')
         mkdir(f'{out_dir}/sams/final')
@@ -534,19 +539,80 @@ def test_woltka_syndna_to_array(self):
                 copyfile(iname, jname)
                 reads.append((jname, ft))
 
+        params = self.params.copy()
+        params['min_sample_counts'] = 1
         success, ainfo, msg = woltka_syndna(
-            self.qclient, job_id, self.params, out_dir)
+            self.qclient, job_id, params, out_dir)
 
         self.assertEqual("", msg)
         self.assertTrue(success)
 
         exp = [
             ArtifactInfo('SynDNA hits', 'BIOM',
                          [(f'{out_dir}/syndna.biom', 'biom'),
-                          (f'{out_dir}/sams/final/alignment.tar', 'log')]),
+                          (f'{out_dir}/sams/final/alignment.tar', 'log'),
+                          (f'{out_dir}/lin_regress_by_sample_id.yaml', 'log'),
+                          (f'{out_dir}/fit_syndna_models_log.txt', 'log')]),
             ArtifactInfo('reads without SynDNA', 'per_sample_FASTQ', reads)]
         self.assertCountEqual(ainfo, exp)
 
+    def test_calculate_cell_counts(self):
+        params = {'synDNA hits': 5, 'Woltka per-genome': 6,
+                  'min_coverage': 1, 'read_length': 150,
+                  'min_rsquared': 0.8}
+        job_id = 'my-job-id'
+        out_dir = mkdtemp()
+        self._clean_up_files.append(out_dir)
+
+        # this should fail cause we don't have valid data
+        success, ainfo, msg = calculate_cell_counts(
+            self.qclient, job_id, params, out_dir)
+        self.assertFalse(success)
+        self.assertEqual(msg, "No logs found, are you sure you selected the "
+                         "correct artifact for 'synDNA hits'?")
+
+        # this should fail too because but now we are getting deeper into
+        # the validation
+        prep_info_dict = {
+            'SKB8.640193': {
+                'run_prefix': 'S22205_S104', 'syndna_pool_number': 1,
+                'raw_reads_r1r2': 10000, 'mass_syndna_input_ng': 120,
+                'minipico_dna_concentration_ng_ul': 10, 'vol_elute_ul': 5},
+            'SKD8.640184': {
+                'run_prefix': 'S22282_S102', 'syndna_pool_number': 1,
+                'raw_reads_r1r2': 10002, 'mass_syndna_input_ng': 120,
+                'minipico_dna_concentration_ng_ul': 11, 'vol_elute_ul': 6}}
+        data = {'prep_info': dumps(prep_info_dict),
+                'study': 1,
+                'data_type': 'Metagenomic'}
+        pid = self.qclient.post('/apitest/prep_template/', data=data)['prep']
+
+        sdir = 'qp_woltka/support_files/'
+        fp_to_cp = [
+            'fit_syndna_models_log.txt', 'lin_regress_by_sample_id.yaml',
+            'syndna.biom']
+        for fp in fp_to_cp:
+            copyfile(f'{sdir}/{fp}', f'{out_dir}/{fp}')
+        data = {
+            'filepaths': dumps([
+                (f'{out_dir}/syndna.biom', 'biom'),
+                (f'{out_dir}/lin_regress_by_sample_id.yaml', 'log'),
+                (f'{out_dir}/fit_syndna_models_log.txt', 'log')]),
+            'type': "BIOM",
+            'name': "SynDNA Hits - Test",
+            'prep': pid}
+        params['synDNA hits'] = self.qclient.post(
+            '/apitest/artifact/', data=data)['artifact']
+
+        success, ainfo, msg = calculate_cell_counts(
+            self.qclient, job_id, params, out_dir)
+        self.assertFalse(success)
+        self.assertEqual(msg, "The selected 'Woltka per-genome' artifact "
+                         "doesn't look like one, did you select the correct "
+                         "file?")
+
+        # Finally, adding a full test is close to impossible - too many steps.
+
 
 if __name__ == '__main__':
     main()
diff --git a/qp_woltka/woltka.py b/qp_woltka/woltka.py
@@ -11,6 +11,11 @@
 from glob import glob
 from shutil import copy2
 from math import ceil
+from biom import load_table
+from biom.util import biom_open
+import pandas as pd
+from pysyndna import fit_linear_regression_models_for_qiita
+from pysyndna import calc_ogu_cell_counts_per_g_of_sample_for_qiita
 
 from qp_woltka.util import search_by_filename
 
@@ -356,6 +361,10 @@ def woltka_syndna_to_array(files, output, database_bowtie2, prep, url, name):
     db_folder = dirname(database_bowtie2)
     db_name = basename(database_bowtie2)
 
+    # storing the prep so we can use later
+    preparation_information = join(output, 'prep_info.tsv')
+    prep.set_index('sample_name').to_csv(preparation_information, sep='\t')
+
     lookup = prep.set_index('run_prefix')['sample_name'].to_dict()
     n_files = 1
     for i, (k, (f, r)) in enumerate(files.items()):
@@ -476,7 +485,7 @@ def woltka_syndna(qclient, job_id, parameters, out_dir):
     job_id : str
         The job id
     parameters : dict
-        The parameter values to run split libraries
+        The parameter values to wolka syndna
     out_dir : str
         The path to the job's output directory
 
@@ -490,15 +499,23 @@ def woltka_syndna(qclient, job_id, parameters, out_dir):
     fp_biom = f'{out_dir}/syndna.biom'
     fp_alng = f'{out_dir}/sams/final/alignment.tar'
     if exists(fp_biom) and exists(fp_alng):
-        # ToDo:
-        # add call and creation of synDNA regression
-        # job_info = qclient.get_job_info(job_id)
-        # artifact_id = parameters['input']
-        # files, prep = qclient.artifact_and_preparation_files(artifact_id)
-        # print(files, prep)
-
+        # if we got to this point a preparation file should exist in
+        # the output folder
+        prep = pd.read_csv(
+            f'{out_dir}/prep_info.tsv', index_col=None, sep='\t')
+        output = fit_linear_regression_models_for_qiita(
+            prep, load_table(fp_biom), parameters['min_sample_counts'])
+        # saving results to disk
+        lin_regress_results_fp = f'{out_dir}/lin_regress_by_sample_id.yaml'
+        fit_syndna_models_log_fp = f'{out_dir}/fit_syndna_models_log.txt'
+        with open(lin_regress_results_fp, 'w') as fp:
+            fp.write(output['lin_regress_by_sample_id'])
+        with open(fit_syndna_models_log_fp, 'w') as fp:
+            fp.write(output['fit_syndna_models_log'])
         ainfo = [ArtifactInfo('SynDNA hits', 'BIOM', [
-            (fp_biom, 'biom'), (fp_alng, 'log')])]
+            (fp_biom, 'biom'), (fp_alng, 'log'),
+            (lin_regress_results_fp, 'log'),
+            (fit_syndna_models_log_fp, 'log')])]
     else:
         ainfo = []
         errors.append('Missing files from the "SynDNA hits"; please '
@@ -533,6 +550,84 @@ def woltka_syndna(qclient, job_id, parameters, out_dir):
 
 
 def calculate_cell_counts(qclient, job_id, parameters, out_dir):
+    """Run calc_ogu_cell_counts_per_g_of_sample_for_qiita
+
+    Parameters
+    ----------
+    qclient : tgp.qiita_client.QiitaClient
+        The Qiita server client
+    job_id : str
+        The job id
+    parameters : dict
+        The parameter values to wolka syndna
+    out_dir : str
+        The path to the job's output directory
+
+    Returns
+    -------
+    bool, list, str
+        The results of the job
     """
-    """
-    raise ValueError('Not implemented')
+    error = ''
+    # let's get the syndna_id and prep in a single go
+    syndna_id = parameters['synDNA hits']
+    syndna_files, prep = qclient.artifact_and_preparation_files(syndna_id)
+    if 'log' not in syndna_files.keys():
+        error = ("No logs found, are you sure you selected the correct "
+                 "artifact for 'synDNA hits'?")
+    else:
+
+        lin_regress_by_sample_id_fp = [f for f in syndna_files['log']
+                                       if 'lin_regress_by_sample_id' in f]
+        if not lin_regress_by_sample_id_fp:
+            error = ("No 'lin_regress_by_sample_id' log found, are you sure "
+                     " you selected the correct artifact for 'synDNA hits'?")
+        else:
+            lin_regress_by_sample_id_fp = lin_regress_by_sample_id_fp[0]
+
+            # for per_genome_id let's do it separately so we can also ge the
+            # sample information
+            per_genome_id = parameters['Woltka per-genome']
+            ainfo = qclient.get("/qiita_db/artifacts/%s/" % per_genome_id)
+            aparams = ainfo['processing_parameters']
+            ogu_fp = ainfo['files']['biom'][0]['filepath']
+
+            if 'Database' not in aparams or not ogu_fp.endswith('none.biom'):
+                error = ("The selected 'Woltka per-genome' artifact doesn't "
+                         "look like one, did you select the correct file?")
+            else:
+                ogu_counts_per_sample = load_table(ogu_fp)
+
+                db_files = _process_database_files(aparams['Database'])
+                ogu_lengths_fp = db_files["length.map"]
+
+                sample_info = qclient.get(
+                    '/qiita_db/prep_template/%s/data/?sample_information=true'
+                    % ainfo['prep_information'][0])
+                sample_info = pd.DataFrame.from_dict(
+                    sample_info['data'], orient='index')
+                sample_info.reset_index(names='sample_name', inplace=True)
+
+    if error:
+        return False, None, error
+
+    try:
+        output = calc_ogu_cell_counts_per_g_of_sample_for_qiita(
+            sample_info, prep, lin_regress_by_sample_id_fp,
+            ogu_counts_per_sample, ogu_lengths_fp,
+            parameters['read_length'], parameters['min_rsquared'],
+            parameters['min_rsquared'])
+    except Exception as e:
+        return False, None, str(e)
+
+    log_fp = f'{out_dir}/cell_counts.log'
+    with open(log_fp, 'w') as f:
+        f.write(output['calc_cell_counts_log'])
+    biom_fp = f'{out_dir}/cell_counts.biom'
+    with biom_open(biom_fp, 'w') as f:
+        output['cell_count_biom'].to_hdf5(f, f"Cell Counts - {job_id}")
+    ainfo = [
+        ArtifactInfo(
+            'Cell counts', 'BIOM', [(biom_fp, 'biom'), (log_fp, 'log')])]
+
+    return True, ainfo, ""
diff --git a/setup.py b/setup.py
@@ -55,6 +55,8 @@
                         'h5py >= 2.3.1', 'biom-format',
                         # forcing 0.1.4 because Qiita uses this version
                         'woltka @ https://github.com/'
-                        'qiyunzhu/woltka/archive/refs/tags/v0.1.4.zip'],
+                        'qiyunzhu/woltka/archive/refs/tags/v0.1.4.zip',
+                        'pysyndna @ https://github.com/AmandaBirmingham/'
+                        'pysyndna/archive/refs/heads/main.zip'],
       classifiers=classifiers
       )