Skip to content

Commit

Permalink
Merge pull request #23 from antgonza/pysyndna
Browse files Browse the repository at this point in the history
pysyndna
  • Loading branch information
charles-cowart authored Dec 13, 2023
2 parents 2c6070a + 0c323fd commit 71fe32d
Show file tree
Hide file tree
Showing 7 changed files with 210 additions and 22 deletions.
19 changes: 15 additions & 4 deletions qp_woltka/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,15 @@
'input': ('artifact', ['per_sample_FASTQ'])
}
opt_params = {
'Database': [f'choice: ["{db_path}"]', db_path]
'Database': [f'choice: ["{db_path}"]', db_path],
'min_sample_counts': ('integer', '1')
}
outputs = {
'SynDNA hits': 'BIOM',
'reads without SynDNA': 'per_sample_FASTQ',
}
dflt_param_set = {
'SynDNA': {'Database': db_path},
'SynDNA': {'Database': db_path, 'min_sample_counts': 1},
}
syndna_cmd = QiitaCommand(
'SynDNA Woltka', "Process SynDNA reads using woltka", woltka_syndna,
Expand All @@ -65,11 +66,21 @@
'synDNA hits': ('artifact', ['BIOM']),
'Woltka per-genome': ('artifact', ['BIOM'])
}
opt_params = {}
opt_params = {
'min_coverage': ('integer', '1'),
'read_length': ('integer', '150'),
'min_rsquared': ('float', '0.8'),
}
outputs = {
'Cell counts': 'BIOM'
}
dflt_param_set = {}
dflt_param_set = {
'150bp @ min_coverage:1 R^2:0.8': {
'min_coverage': 1,
'read_length': 150,
'min_rsquared': 0.8
}
}
calculate_cell_counts_cmd = QiitaCommand(
'Calculate Cell Counts', "Calculate cell counts per-genome",
calculate_cell_counts, req_params, opt_params, outputs, dflt_param_set)
Expand Down
Empty file.
14 changes: 14 additions & 0 deletions qp_woltka/support_files/lin_regress_by_sample_id.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
1.SKB8.640193:
intercept: 9.755082803643
intercept_stderr: 3.058237204847
pvalue: 0.01197011187
rvalue: -0.78628520649
slope: -3.509994537266
stderr: 1.04248546695
1.SKD8.640184:
intercept: 8.352404328098
intercept_stderr: 2.313971537139
pvalue: 0.005567676743
rvalue: -0.798913662914
slope: -2.991479594754
stderr: 0.79622836446
Binary file added qp_woltka/support_files/syndna.biom
Binary file not shown.
78 changes: 72 additions & 6 deletions qp_woltka/tests/test_woltka.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@

from qp_woltka import plugin
from qp_woltka.woltka import (
woltka_to_array, woltka, woltka_syndna_to_array, woltka_syndna)
woltka_to_array, woltka, woltka_syndna_to_array, woltka_syndna,
calculate_cell_counts)


class WoltkaTests(PluginTestCase):
Expand Down Expand Up @@ -415,8 +416,12 @@ def test_creation_error_no_unique_run_prefix(self):
def test_woltka_syndna_to_array(self):
# inserting new prep template
prep_info_dict = {
'SKB8.640193': {'run_prefix': 'S22205_S104'},
'SKD8.640184': {'run_prefix': 'S22282_S102'}}
'SKB8.640193': {
'run_prefix': 'S22205_S104', 'syndna_pool_number': 1,
'raw_reads_r1r2': 10000, 'mass_syndna_input_ng': 120},
'SKD8.640184': {
'run_prefix': 'S22282_S102', 'syndna_pool_number': 1,
'raw_reads_r1r2': 10002, 'mass_syndna_input_ng': 120}}
pid, aid, job_id = self._helper_woltka_bowtie(prep_info_dict)

out_dir = mkdtemp()
Expand Down Expand Up @@ -518,7 +523,7 @@ def test_woltka_syndna_to_array(self):

# now let's test that if finished correctly
sdir = 'qp_woltka/support_files/'
copyfile(f'{sdir}/none.biom', f'{out_dir}/syndna.biom')
copyfile(f'{sdir}/syndna.biom', f'{out_dir}/syndna.biom')
# we just need the file to exists so is fine to use the biom as tar
mkdir(f'{out_dir}/sams')
mkdir(f'{out_dir}/sams/final')
Expand All @@ -534,19 +539,80 @@ def test_woltka_syndna_to_array(self):
copyfile(iname, jname)
reads.append((jname, ft))

params = self.params.copy()
params['min_sample_counts'] = 1
success, ainfo, msg = woltka_syndna(
self.qclient, job_id, self.params, out_dir)
self.qclient, job_id, params, out_dir)

self.assertEqual("", msg)
self.assertTrue(success)

exp = [
ArtifactInfo('SynDNA hits', 'BIOM',
[(f'{out_dir}/syndna.biom', 'biom'),
(f'{out_dir}/sams/final/alignment.tar', 'log')]),
(f'{out_dir}/sams/final/alignment.tar', 'log'),
(f'{out_dir}/lin_regress_by_sample_id.yaml', 'log'),
(f'{out_dir}/fit_syndna_models_log.txt', 'log')]),
ArtifactInfo('reads without SynDNA', 'per_sample_FASTQ', reads)]
self.assertCountEqual(ainfo, exp)

def test_calculate_cell_counts(self):
params = {'synDNA hits': 5, 'Woltka per-genome': 6,
'min_coverage': 1, 'read_length': 150,
'min_rsquared': 0.8}
job_id = 'my-job-id'
out_dir = mkdtemp()
self._clean_up_files.append(out_dir)

# this should fail cause we don't have valid data
success, ainfo, msg = calculate_cell_counts(
self.qclient, job_id, params, out_dir)
self.assertFalse(success)
self.assertEqual(msg, "No logs found, are you sure you selected the "
"correct artifact for 'synDNA hits'?")

# this should fail too because but now we are getting deeper into
# the validation
prep_info_dict = {
'SKB8.640193': {
'run_prefix': 'S22205_S104', 'syndna_pool_number': 1,
'raw_reads_r1r2': 10000, 'mass_syndna_input_ng': 120,
'minipico_dna_concentration_ng_ul': 10, 'vol_elute_ul': 5},
'SKD8.640184': {
'run_prefix': 'S22282_S102', 'syndna_pool_number': 1,
'raw_reads_r1r2': 10002, 'mass_syndna_input_ng': 120,
'minipico_dna_concentration_ng_ul': 11, 'vol_elute_ul': 6}}
data = {'prep_info': dumps(prep_info_dict),
'study': 1,
'data_type': 'Metagenomic'}
pid = self.qclient.post('/apitest/prep_template/', data=data)['prep']

sdir = 'qp_woltka/support_files/'
fp_to_cp = [
'fit_syndna_models_log.txt', 'lin_regress_by_sample_id.yaml',
'syndna.biom']
for fp in fp_to_cp:
copyfile(f'{sdir}/{fp}', f'{out_dir}/{fp}')
data = {
'filepaths': dumps([
(f'{out_dir}/syndna.biom', 'biom'),
(f'{out_dir}/lin_regress_by_sample_id.yaml', 'log'),
(f'{out_dir}/fit_syndna_models_log.txt', 'log')]),
'type': "BIOM",
'name': "SynDNA Hits - Test",
'prep': pid}
params['synDNA hits'] = self.qclient.post(
'/apitest/artifact/', data=data)['artifact']

success, ainfo, msg = calculate_cell_counts(
self.qclient, job_id, params, out_dir)
self.assertFalse(success)
self.assertEqual(msg, "The selected 'Woltka per-genome' artifact "
"doesn't look like one, did you select the correct "
"file?")

# Finally, adding a full test is close to impossible - too many steps.


if __name__ == '__main__':
main()
117 changes: 106 additions & 11 deletions qp_woltka/woltka.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
from glob import glob
from shutil import copy2
from math import ceil
from biom import load_table
from biom.util import biom_open
import pandas as pd
from pysyndna import fit_linear_regression_models_for_qiita
from pysyndna import calc_ogu_cell_counts_per_g_of_sample_for_qiita

from qp_woltka.util import search_by_filename

Expand Down Expand Up @@ -356,6 +361,10 @@ def woltka_syndna_to_array(files, output, database_bowtie2, prep, url, name):
db_folder = dirname(database_bowtie2)
db_name = basename(database_bowtie2)

# storing the prep so we can use later
preparation_information = join(output, 'prep_info.tsv')
prep.set_index('sample_name').to_csv(preparation_information, sep='\t')

lookup = prep.set_index('run_prefix')['sample_name'].to_dict()
n_files = 1
for i, (k, (f, r)) in enumerate(files.items()):
Expand Down Expand Up @@ -476,7 +485,7 @@ def woltka_syndna(qclient, job_id, parameters, out_dir):
job_id : str
The job id
parameters : dict
The parameter values to run split libraries
The parameter values to wolka syndna
out_dir : str
The path to the job's output directory
Expand All @@ -490,15 +499,23 @@ def woltka_syndna(qclient, job_id, parameters, out_dir):
fp_biom = f'{out_dir}/syndna.biom'
fp_alng = f'{out_dir}/sams/final/alignment.tar'
if exists(fp_biom) and exists(fp_alng):
# ToDo:
# add call and creation of synDNA regression
# job_info = qclient.get_job_info(job_id)
# artifact_id = parameters['input']
# files, prep = qclient.artifact_and_preparation_files(artifact_id)
# print(files, prep)

# if we got to this point a preparation file should exist in
# the output folder
prep = pd.read_csv(
f'{out_dir}/prep_info.tsv', index_col=None, sep='\t')
output = fit_linear_regression_models_for_qiita(
prep, load_table(fp_biom), parameters['min_sample_counts'])
# saving results to disk
lin_regress_results_fp = f'{out_dir}/lin_regress_by_sample_id.yaml'
fit_syndna_models_log_fp = f'{out_dir}/fit_syndna_models_log.txt'
with open(lin_regress_results_fp, 'w') as fp:
fp.write(output['lin_regress_by_sample_id'])
with open(fit_syndna_models_log_fp, 'w') as fp:
fp.write(output['fit_syndna_models_log'])
ainfo = [ArtifactInfo('SynDNA hits', 'BIOM', [
(fp_biom, 'biom'), (fp_alng, 'log')])]
(fp_biom, 'biom'), (fp_alng, 'log'),
(lin_regress_results_fp, 'log'),
(fit_syndna_models_log_fp, 'log')])]
else:
ainfo = []
errors.append('Missing files from the "SynDNA hits"; please '
Expand Down Expand Up @@ -533,6 +550,84 @@ def woltka_syndna(qclient, job_id, parameters, out_dir):


def calculate_cell_counts(qclient, job_id, parameters, out_dir):
"""Run calc_ogu_cell_counts_per_g_of_sample_for_qiita
Parameters
----------
qclient : tgp.qiita_client.QiitaClient
The Qiita server client
job_id : str
The job id
parameters : dict
The parameter values to wolka syndna
out_dir : str
The path to the job's output directory
Returns
-------
bool, list, str
The results of the job
"""
"""
raise ValueError('Not implemented')
error = ''
# let's get the syndna_id and prep in a single go
syndna_id = parameters['synDNA hits']
syndna_files, prep = qclient.artifact_and_preparation_files(syndna_id)
if 'log' not in syndna_files.keys():
error = ("No logs found, are you sure you selected the correct "
"artifact for 'synDNA hits'?")
else:

lin_regress_by_sample_id_fp = [f for f in syndna_files['log']
if 'lin_regress_by_sample_id' in f]
if not lin_regress_by_sample_id_fp:
error = ("No 'lin_regress_by_sample_id' log found, are you sure "
" you selected the correct artifact for 'synDNA hits'?")
else:
lin_regress_by_sample_id_fp = lin_regress_by_sample_id_fp[0]

# for per_genome_id let's do it separately so we can also ge the
# sample information
per_genome_id = parameters['Woltka per-genome']
ainfo = qclient.get("/qiita_db/artifacts/%s/" % per_genome_id)
aparams = ainfo['processing_parameters']
ogu_fp = ainfo['files']['biom'][0]['filepath']

if 'Database' not in aparams or not ogu_fp.endswith('none.biom'):
error = ("The selected 'Woltka per-genome' artifact doesn't "
"look like one, did you select the correct file?")
else:
ogu_counts_per_sample = load_table(ogu_fp)

db_files = _process_database_files(aparams['Database'])
ogu_lengths_fp = db_files["length.map"]

sample_info = qclient.get(
'/qiita_db/prep_template/%s/data/?sample_information=true'
% ainfo['prep_information'][0])
sample_info = pd.DataFrame.from_dict(
sample_info['data'], orient='index')
sample_info.reset_index(names='sample_name', inplace=True)

if error:
return False, None, error

try:
output = calc_ogu_cell_counts_per_g_of_sample_for_qiita(
sample_info, prep, lin_regress_by_sample_id_fp,
ogu_counts_per_sample, ogu_lengths_fp,
parameters['read_length'], parameters['min_rsquared'],
parameters['min_rsquared'])
except Exception as e:
return False, None, str(e)

log_fp = f'{out_dir}/cell_counts.log'
with open(log_fp, 'w') as f:
f.write(output['calc_cell_counts_log'])
biom_fp = f'{out_dir}/cell_counts.biom'
with biom_open(biom_fp, 'w') as f:
output['cell_count_biom'].to_hdf5(f, f"Cell Counts - {job_id}")
ainfo = [
ArtifactInfo(
'Cell counts', 'BIOM', [(biom_fp, 'biom'), (log_fp, 'log')])]

return True, ainfo, ""
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@
'h5py >= 2.3.1', 'biom-format',
# forcing 0.1.4 because Qiita uses this version
'woltka @ https://github.com/'
'qiyunzhu/woltka/archive/refs/tags/v0.1.4.zip'],
'qiyunzhu/woltka/archive/refs/tags/v0.1.4.zip',
'pysyndna @ https://github.com/AmandaBirmingham/'
'pysyndna/archive/refs/heads/main.zip'],
classifiers=classifiers
)

0 comments on commit 71fe32d

Please sign in to comment.