diff --git a/README.md b/README.md index 639884d6..228ccaaf 100644 --- a/README.md +++ b/README.md @@ -175,12 +175,10 @@ src/ ├── scenic │ ├── bin │ │ ├── grnboost2_without_dask.py -│ │ └── merge_SCENIC_motif_track_loom.py │ ├── processes │ │ ├── aucell.nf │ │ ├── cistarget.nf │ │ ├── grnboost2withoutDask.nf -│ │ └── mergeScenicLooms.nf │ ├── main.nf │ └── scenic.config │ diff --git a/nextflow.config b/nextflow.config index 3c67f394..123e39db 100644 --- a/nextflow.config +++ b/nextflow.config @@ -3,7 +3,7 @@ manifest { name = 'aertslab/SingleCellTxBenchmark' description = 'A repository of pipelines for single-cell data in Nextflow DSL2' homePage = 'https://github.com/aertslab/SingleCellTxBenchmark' - version = '0.3.1' + version = '0.3.3' mainScript = 'main.nf' defaultBranch = 'master' nextflowVersion = '!19.09.0-edge' // with ! prefix, stop execution if current version does not match required version. @@ -12,11 +12,8 @@ manifest { params { // These parameters are passed to all processes global { - // baseFilePath = "/opt/SingleCellTxBenchmark" - // project_name = "MCF7" project_name = "10x_PBMC" outdir = "out" - // tenx_folder = "/ddn1/vol1/staging/leuven/stg_00002/lcb/lcb_projects/TWE/MCF7/10x/ControlvsTSA/cellranger/TEW*/outs/" tenx_folder = "data/10x/1k_pbmc/1k_pbmc_*/outs/" tracedir = "${params.global.outdir}/pipeline_reports" qsubaccount = "" diff --git a/src/__init__.py b/src/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/scenic/README.md b/src/scenic/README.md index 25e0518e..d1584a6a 100644 --- a/src/scenic/README.md +++ b/src/scenic/README.md @@ -2,6 +2,63 @@ ## Running the pipeline +### Generate the config file + +- Single SCENIC run + +*Note*: The `qsub` profile if you are not running the pipeline on a cluster. + +```{bash} +nextflow config \ + -profile scenic,qsub,singularity aertslab/SingleCellTxBenchmark \ + > nextflow.config +``` + +- Multi-runs SCENIC + +*Note*: The `qsub` profile if you are not running the pipeline on a cluster. + +```{bash} +nextflow config \ + -profile scenic_multiruns,qsub,singularity aertslab/SingleCellTxBenchmark \ + > nextflow.config +``` + +### Update the config file + +Make sure the following parameters are correctly set: +- `params.global.project_name` +- `params.global.qsubaccount` if running on a cluster (SGE cluster) +- `params.sc.scenic.filteredloom` +- `params.sc.scenic.grn.TFs` +- `params.sc.scenic.cistarget.mtfDB` +- `params.sc.scenic.cistarget.mtfANN` +- `params.sc.scenic.cistarget.trkDB` if commented, track-based cisTarget won't run +- `params.sc.scenic.cistarget.trkDB` if commented, track-based cisTarget won't run +- `params.sc.scenic.numRuns` if running SCENIC in multi-runs mode +- `singularity.runOptions` Specify the paths to mount +- `params.sc.scope.tree` + +Additionally, you can update the other paraemeters for the different steps. + +### Run + +```{bash} +nextflow -C nextflow.config run \ + aertslab/SingleCellTxBenchmark \ + -entry scenic \ + -with-report report.html \ + -with-trace +``` + +### Miscellaneous + +Here is the DAG summarizing the multi-runs SCENIC workflow: + +![Multi-Runs Motif and Track based SCENIC](assets/multi_runs_motif_track_scenic.svg) + +## Testing the pipeline + ```{bash} -nextflow -C conf/test.config,scenic.config run main.nf --test +nextflow -C conf/test.config,conf/test_multi_runs.config run main.nf --test ``` \ No newline at end of file diff --git a/src/scenic/assets/multi_runs_motif_track_scenic.svg b/src/scenic/assets/multi_runs_motif_track_scenic.svg new file mode 100644 index 00000000..af1da043 --- /dev/null +++ b/src/scenic/assets/multi_runs_motif_track_scenic.svg @@ -0,0 +1,714 @@ + + +dag + + + +p0 + +Channel.from + + + +p3 + +SCENIC:GRNBOOST2_WITHOUT_DASK + + + +p0->p3 + + +runs + + + +p9 + +SCENIC:CISTARGET__MOTIF + + + +p3->p9 + + + + + +p15 + +SCENIC:CISTARGET__TRACK + + + +p3->p15 + + + + + +p1 + + + + +p1->p3 + + +filteredloom + + + +p2 + + + + +p2->p3 + + +tfs + + + +p18 + +SCENIC:AUCELL__MOTIF + + + +p9->p18 + + +ctx + + + +p22 + +map + + + +p9->p22 + + +ctx + + + +p4 + +Channel.fromPath + + + +p5 + +collect + + + +p4->p5 + + + + + +p5->p9 + + +motifDB + + + +p6 + + + + +p6->p9 + + +filteredloom + + + +p7 + + + + +p7->p9 + + +annotation + + + +p8 + + + + +p8->p9 + + +type + + + +p26 + +collect + + + +p18->p26 + + +auc + + + +p10 + +Channel.fromPath + + + +p11 + +collect + + + +p10->p11 + + + + + +p11->p15 + + +trackDB + + + +p21 + +SCENIC:AUCELL__TRACK + + + +p15->p21 + + +ctx + + + +p37 + +map + + + +p15->p37 + + +ctx + + + +p12 + + + + +p12->p15 + + +filteredloom + + + +p13 + + + + +p13->p15 + + +annotation + + + +p14 + + + + +p14->p15 + + +type + + + +p41 + +collect + + + +p21->p41 + + +auc + + + +p16 + + + + +p16->p18 + + +exprMat + + + +p17 + + + + +p17->p18 + + +type + + + +p28 + +SCENIC:MULTI_RUNS_TO_LOOM__MOTIF:AGGR_REGULONS + + + +p26->p28 + + + + + +p19 + + + + +p19->p21 + + +exprMat + + + +p20 + + + + +p20->p21 + + +type + + + +p43 + +SCENIC:MULTI_RUNS_TO_LOOM__TRACK:AGGR_REGULONS + + + +p41->p43 + + + + + +p23 + +collect + + + +p22->p23 + + + + + +p25 + +SCENIC:MULTI_RUNS_TO_LOOM__MOTIF:AGGR_FEATURES + + + +p23->p25 + + + + + +p33 + +SCENIC:MULTI_RUNS_TO_LOOM__MOTIF:FEATURES_TO_REGULONS + + + +p25->p33 + + + + + +p24 + + + + +p24->p25 + + +type + + + +p36 + +SCENIC:MULTI_RUNS_TO_LOOM__MOTIF:SAVE_TO_LOOM + + + +p33->p36 + + + + + +p28->p33 + + +multiRunsAggrRegulonsFolder + + + +p31 + +SCENIC:MULTI_RUNS_TO_LOOM__MOTIF:AUCELL + + + +p28->p31 + + + + + +p27 + + + + +p27->p28 + + +type + + + +p31->p36 + + + + + +p29 + + + + +p29->p31 + + +exprMat + + + +p30 + + + + +p30->p31 + + +type + + + +p52 + +SCENIC:MERGE_MOTIF_TRACK_LOOMS + + + +p36->p52 + + + + + +p32 + + + + +p32->p33 + + +type + + + +p34 + + + + +p34->p36 + + +exprMat + + + +p35 + + + + +p35->p36 + + +type + + + +p53 + +SCENIC:VISUALIZE + + + +p52->p53 + + + + + +p38 + +collect + + + +p37->p38 + + + + + +p40 + +SCENIC:MULTI_RUNS_TO_LOOM__TRACK:AGGR_FEATURES + + + +p38->p40 + + + + + +p48 + +SCENIC:MULTI_RUNS_TO_LOOM__TRACK:FEATURES_TO_REGULONS + + + +p40->p48 + + + + + +p39 + + + + +p39->p40 + + +type + + + +p51 + +SCENIC:MULTI_RUNS_TO_LOOM__TRACK:SAVE_TO_LOOM + + + +p48->p51 + + + + + +p43->p48 + + +multiRunsAggrRegulonsFolder + + + +p46 + +SCENIC:MULTI_RUNS_TO_LOOM__TRACK:AUCELL + + + +p43->p46 + + + + + +p42 + + + + +p42->p43 + + +type + + + +p46->p51 + + + + + +p44 + + + + +p44->p46 + + +exprMat + + + +p45 + + + + +p45->p46 + + +type + + + +p51->p52 + + + + + +p47 + + + + +p47->p48 + + +type + + + +p49 + + + + +p49->p51 + + +exprMat + + + +p50 + + + + +p50->p51 + + +type + + + +p54 + +SCENIC:PUBLISH_LOOM + + + +p53->p54 + + + + + +p55 + + + + +p54->p55 + + +f + + + \ No newline at end of file diff --git a/src/scenic/bin/add_visualization.py b/src/scenic/bin/add_visualization.py index 432af452..83e35c58 100755 --- a/src/scenic/bin/add_visualization.py +++ b/src/scenic/bin/add_visualization.py @@ -14,6 +14,8 @@ import umap from MulticoreTSNE import MulticoreTSNE as TSNE +import export_to_loom + ################################################################################ ################################################################################ @@ -39,43 +41,24 @@ args = parser.parse_args() -def df_to_named_matrix(df): - arr_ip = [tuple(i) for i in df.as_matrix()] - dtyp = np.dtype(list(zip(df.dtypes.index, df.dtypes))) - arr = np.array(arr_ip, dtype=dtyp) - return arr - - def visualize_AUCell(args): ################################################################################ - # load data from loom - ################################################################################ - - # scenic output - lf = lp.connect(args.loom_input, mode='r', validate=False) - meta = json.loads(zlib.decompress(base64.b64decode(lf.attrs.MetaData))) - auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID) - regulons = pd.DataFrame(lf.ra.Regulons, index=lf.ra.Gene) - lf.close() - - - ################################################################################ - # Fix regulon objects to display properly in SCope: + # Load the data from the loom and merge if needed ################################################################################ - # add underscore for SCope compatibility: - auc_mtx.columns = auc_mtx.columns.str.replace('\\(', '_(') - - # add underscore for SCope compatibility: - regulons.columns = regulons.columns.str.replace('\\(', '_(') - - # Rename regulons in the thresholds object, motif - rt = meta['regulonThresholds'] - for i, x in enumerate(rt): - tmp = x.get('regulon').replace("(", "_(") # + '-motif' - x.update({'regulon': tmp}) + with lp.connect(args.loom_input, mode='r', validate=False) as lf: + if "RegulonsAUC" in lf.ca.keys(): + auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID) + else: + print("Loom with motif & track regulons detected, merging the regulons AUC matrices...") + mtf_auc_mtx = pd.DataFrame(lf.ca.MotifRegulonsAUC, index=lf.ca.CellID) + trk_auc_mtx = pd.DataFrame(lf.ca.TrackRegulonsAUC, index=lf.ca.CellID) + # merge the AUC matrices: + auc_mtx = pd.concat([mtf_auc_mtx, trk_auc_mtx], sort=False, axis=1, join='outer') + # fill NAs (if any) with 0s: + auc_mtx.fillna(0, inplace=True) ################################################################################ # Visualize AUC matrix: @@ -84,65 +67,20 @@ def visualize_AUCell(args): # UMAP run_umap = umap.UMAP(n_neighbors=10, min_dist=0.4, metric='correlation').fit_transform dr_umap = run_umap(auc_mtx.dropna()) + # tSNE tsne = TSNE(n_jobs=args.num_workers) dr_tsne = tsne.fit_transform(auc_mtx.dropna()) - ################################################################################ - # embeddings + # Add visualization data ################################################################################ - default_embedding = pd.DataFrame(dr_umap, columns=['_X', '_Y'], index=auc_mtx.dropna().index) - - embeddings_x = pd.DataFrame(dr_tsne, columns=['_X', '_Y'], index=auc_mtx.dropna().index)[['_X']].astype('float32') - embeddings_y = pd.DataFrame(dr_tsne, columns=['_X', '_Y'], index=auc_mtx.dropna().index)[['_Y']].astype('float32') - - embeddings_x.columns = ['1'] - embeddings_y.columns = ['1'] - - - ################################################################################ - # copy loom - ################################################################################ - - copyfile(args.loom_input, args.loom_output) - - - ################################################################################ - # update scenic data - ################################################################################ - - lf = lp.connect(args.loom_output, mode='r+', validate=False) - - # write regulon information: - lf.ca['RegulonsAUC'] = df_to_named_matrix(auc_mtx) - lf.ra['Regulons'] = df_to_named_matrix(regulons) - - # write embeddings: - lf.ca['Embedding'] = df_to_named_matrix(default_embedding) - lf.ca['Embeddings_X'] = df_to_named_matrix(embeddings_x) - lf.ca['Embeddings_Y'] = df_to_named_matrix(embeddings_y) - - metaJson = {} - metaJson['embeddings'] = [ - { - "id": -1, - "name": "SCENIC AUC UMAP" - }, - { - "id": 1, - "name": "SCENIC AUC t-SNE" - }, - ] - - metaJson["regulonThresholds"] = rt - - lf.attrs['MetaData'] = base64.b64encode(zlib.compress(json.dumps(metaJson).encode('ascii'))).decode('ascii') - - lf.close() + scope_loom = export_to_loom.SCopeLoom.read_loom(filename=args.loom_input) + scope_loom.add_embedding(embedding=dr_umap, embedding_name="SCENIC AUC UMAP", is_default=True) + scope_loom.add_embedding(embedding=dr_tsne, embedding_name="SCENIC AUC t-SNE", is_default=False) + scope_loom.export(out_fname=args.loom_output) if __name__ == "__main__": visualize_AUCell(args) - diff --git a/src/scenic/bin/aggregate_SCENIC_multi_runs_features.py b/src/scenic/bin/aggregate_SCENIC_multi_runs_features.py deleted file mode 100755 index 91375018..00000000 --- a/src/scenic/bin/aggregate_SCENIC_multi_runs_features.py +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import os -import pandas as pd -import pickle -from pyscenic.transform import COLUMN_NAME_TYPE, COLUMN_NAME_CONTEXT -from pyscenic.utils import ACTIVATING_MODULE, REPRESSING_MODULE -import re -import sys -import time -import utils - -################################################################################ -################################################################################ - -parser_grn = argparse.ArgumentParser( - usage="usage: %prog [options] feature_enrichment_table_fnames", - description='Aggregate feature (motif or track) enrichment tables from multiple SCENIC runs. ' -) - -parser_grn.add_argument( - 'feature_enrichment_table_fnames', - nargs='+', - help='The looms resulting from the SCENIC AUCell step.' -) -parser_grn.add_argument( - '-o', '--output', - type=argparse.FileType('w'), - default=sys.stdout, - help='Output file/stream, i.e. a table of aggregated feature (motif or track) enrichment table (csv, pickle).' -) -parser_grn.add_argument( - '-f', '--output-format', - dest='output_format', - type=str, - default='pickle', - choices=['pickle', 'csv'], - help='Output format of the aggregated feature (motif or track) enrichment table.' -) - -args = parser_grn.parse_args() - -# Do stuff - - -def stack_feature_enrichment_tables(feature_enrichment_table_fnames): - multi_runs_feature_enrichment_table = None - - def get_type(row): - ctx = row[('Enrichment', COLUMN_NAME_CONTEXT)] - # Activating is the default! - return REPRESSING_MODULE if REPRESSING_MODULE in ctx else ACTIVATING_MODULE - - for i in range(0, len(feature_enrichment_table_fnames)): - fname = feature_enrichment_table_fnames[i] - - # Check if the file is empty - if os.stat(fname).st_size == 0: - raise Exception(f"The given file {fname} is empty. Please rerun the GRNboost and/or the cisTarget for this run.", flush=True) - - # Read the enrichment table - feature_enrichment_table = utils.read_feature_enrichment_table(fname=fname, sep=",") - - # Add the run ID - feature_enrichment_table[('', 'RunID')] = int( - re.search(r'run_([0-9]+)+__reg_(mtf|trk)\.csv$', str(fname)).group(1) - ) - feature_enrichment_table[('', COLUMN_NAME_TYPE)] = feature_enrichment_table.apply(get_type, axis=1) - - # Append the current enrichment table - if multi_runs_feature_enrichment_table is None: - multi_runs_feature_enrichment_table = feature_enrichment_table - else: - multi_runs_feature_enrichment_table = pd.concat( - [multi_runs_feature_enrichment_table, feature_enrichment_table] - ) - print("Reading {0}... Done! ({1})".format(fname, multi_runs_feature_enrichment_table.shape), flush=True) - return multi_runs_feature_enrichment_table - - -# Stack all the enrichment tables from the different SCENIC runs -print("Stacking all the enrichment tables from the different runs...", flush=True) -start = time.time() -multi_runs_feature_enrichment_table = stack_feature_enrichment_tables(args.feature_enrichment_table_fnames) -print(f"... took {time.time() - start} seconds to run.", flush=True) - -# Save to 'pickle' or csv format -print("Saving...", flush=True) -start = time.time() -if args.output_format == 'pickle': - with open(args.output.name, 'wb') as f: - pickle.dump(multi_runs_feature_enrichment_table, f) -elif args.output_format == 'csv': - multi_runs_feature_enrichment_table.to_csv(path_or_buf=args.output) -else: - raise Exception(f"The given output_format {args.output_format} has not been implemented") -print(f"... took {time.time() - start} seconds to run.", flush=True) -print("Done.", flush=True) diff --git a/src/scenic/bin/aggregate_multi_runs_features.py b/src/scenic/bin/aggregate_multi_runs_features.py new file mode 100755 index 00000000..23a5d65e --- /dev/null +++ b/src/scenic/bin/aggregate_multi_runs_features.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 + +import argparse +import os +import pandas as pd +import pickle +from pyscenic.transform import COLUMN_NAME_TYPE, COLUMN_NAME_CONTEXT +from pyscenic.utils import ACTIVATING_MODULE, REPRESSING_MODULE +from pyscenic.transform import COLUMN_NAME_CONTEXT, COLUMN_NAME_TARGET_GENES +import re +import sys +import time +import utils + +################################################################################ +################################################################################ + +parser_grn = argparse.ArgumentParser( + usage="usage: %prog [options] feature_enrichment_table_fnames", + description='Aggregate feature (motif or track) enrichment tables from multiple SCENIC runs. ' +) + +parser_grn.add_argument( + 'feature_enrichment_table_fnames', + nargs='+', + help='The looms resulting from the SCENIC AUCell step.' +) +parser_grn.add_argument( + '-o', '--output', + type=argparse.FileType('w'), + default=sys.stdout, + help='Output file/stream, i.e. a table of aggregated feature (motif or track) enrichment table.' +) +parser_grn.add_argument( + '-f', '--output-format', + dest='output_format', + type=str, + default='csv', + choices=['pickle', 'parquet', 'csv'], + help='Output format of the aggregated feature (motif or track) enrichment table.' +) + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + + +parser_grn.add_argument( + "-b", '--use-chunking', + type=str2bool, + nargs='?', + dest='use_chunking', + const=True, + default=True, + help="Use the chunking method (low memory usage). Output format is fixed to compressed csv (.csv.gz). If this is set to False, you will require a server w/ a lot of RAM! (> 60gb)") + +args = parser_grn.parse_args() + +# Utils + + +def get_type(row): + ctx = row[('Enrichment', COLUMN_NAME_CONTEXT)] + # Activating is the default! + return REPRESSING_MODULE if REPRESSING_MODULE in ctx else ACTIVATING_MODULE + + +# Do stuff + + +def save_aggregated_feature_enrichment_table(feature_enrichment_table_fnames, out_fname, chunksize=None): + + def to_csv(fname, df, count): + mode = 'a' if count > 0 else 'w' + header = False if count > 0 else True + print(f"Saving {f_name} {df.shape}... Done!", flush=True) + df.to_csv(fname, mode=mode, header=header) + + for f_name_idx in range(0, len(feature_enrichment_table_fnames)): + f_name = feature_enrichment_table_fnames[f_name_idx] + df = utils.read_feature_enrichment_table( + fname=f_name, + sep=",", + chunksize=chunksize + ) + + for df_chunk in df: + to_csv( + fname=out_fname, + df=add_run_id_to_feature_enrichment_table( + df=df_chunk, + fname=f_name + ), + count=f_name_idx + ) + + +def add_run_id_to_feature_enrichment_table(df, fname): + # Add the run ID + df[('', 'RunID')] = int( + re.search(r'run_([0-9]+)+__reg_(mtf|trk)\.csv$', str(fname)).group(1) + ) + df[('', COLUMN_NAME_TYPE)] = df.apply(get_type, axis=1) + return df + + +def read_and_add_run_id_to_feature_enrichment_table(fname): + # Check if the file is empty + if os.stat(fname).st_size == 0: + raise Exception(f"The given file {fname} is empty. Please rerun the GRNboost and/or the cisTarget for this run.", flush=True) + + print("Reading {0}...".format(fname), flush=True) + return add_run_id_to_feature_enrichment_table( + df=utils.read_feature_enrichment_table(fname=fname, sep=","), + fname=fname + ) + + +def stack_feature_enrichment_tables(feature_enrichment_table_fnames): + return pd.concat((read_and_add_run_id_to_feature_enrichment_table(fname=f) for f in feature_enrichment_table_fnames)) + + +# Stack all the enrichment tables from the different SCENIC runs +print("Stacking all the enrichment tables from the different runs...", flush=True) +start = time.time() + +# Use this method if memory is an issue +if args.use_chunking: + save_aggregated_feature_enrichment_table( + feature_enrichment_table_fnames=args.feature_enrichment_table_fnames, + out_fname=args.output.name, + chunksize=50000 + ) +else: + multi_runs_feature_enrichment_table = stack_feature_enrichment_tables(args.feature_enrichment_table_fnames) + print("Final aggregated enrichment table ({0})".format(multi_runs_feature_enrichment_table.shape), flush=True) + print(f"... took {time.time() - start} seconds to run.", flush=True) + + # Save + print("Saving...", flush=True) + start = time.time() + if args.output_format == 'pickle': + print("to pickle (pandas)...", flush=True) + multi_runs_feature_enrichment_table.to_pickle(path=args.output.name) + # Do not support MultiIndex + elif args.output_format == 'parquet': + print("to parquet...", flush=True) + multi_runs_feature_enrichment_table.to_parquet(fname=args.output.name) + elif args.output_format == 'csv': + print("to csv...", flush=True) + multi_runs_feature_enrichment_table.to_csv(path_or_buf=args.output) + else: + raise Exception(f"The given output_format {args.output_format} has not been implemented") +print(f"... took {time.time() - start} seconds to run.", flush=True) +print("Done.", flush=True) diff --git a/src/scenic/bin/aggregate_SCENIC_multi_runs_regulons.py b/src/scenic/bin/aggregate_multi_runs_regulons.py similarity index 71% rename from src/scenic/bin/aggregate_SCENIC_multi_runs_regulons.py rename to src/scenic/bin/aggregate_multi_runs_regulons.py index 2b279a50..a45fba1b 100755 --- a/src/scenic/bin/aggregate_SCENIC_multi_runs_regulons.py +++ b/src/scenic/bin/aggregate_multi_runs_regulons.py @@ -42,6 +42,19 @@ def from_loom_connection_regulon_incidence_matrix_to_long_df(loom): return df_ +def make_regulon_count_table(auc_looms): + regulon_names_all_runs = None + for i in range(0, len(auc_looms)): + loom = lp.connect(filename=auc_looms[i], validate=False) + df = from_loom_connection_regulon_incidence_matrix_to_long_df(loom=loom) + df = pd.DataFrame(df['regulon'].unique(), columns=['regulon']) + if regulon_names_all_runs is None: + regulon_names_all_runs = df + else: + regulon_names_all_runs = pd.concat([regulon_names_all_runs, df], axis=0) + return regulon_names_all_runs + + def stack_regulons(auc_looms): all_runs_regulons_stacked = None @@ -67,13 +80,13 @@ def aggregate_genes_by_regulons(all_runs_regulons_stacked): def save_aggregated_regulons(all_runs_regulons_aggregated, output_dir): os.mkdir(output_dir) for regulon_name in np.unique(all_runs_regulons_aggregated["regulon"]): - regulon_target_gene_occurence_df = all_runs_regulons_aggregated[ + regulon_target_gene_occurrence_df = all_runs_regulons_aggregated[ (all_runs_regulons_aggregated.regulon == regulon_name) ].sort_values( by=['count'], ascending=False )[["gene", "count"]] - regulon_target_gene_occurence_df.to_csv( + regulon_target_gene_occurrence_df.to_csv( path_or_buf=os.path.join(output_dir, regulon_name + ".tsv"), header=False, sep="\t", @@ -81,6 +94,19 @@ def save_aggregated_regulons(all_runs_regulons_aggregated, output_dir): ) +# Aggregate all target genes for each regulon and save it all_runs_regulons_stacked = stack_regulons(auc_looms=args.auc_looms) all_runs_regulons_aggregated = aggregate_genes_by_regulons(all_runs_regulons_stacked=all_runs_regulons_stacked) save_aggregated_regulons(all_runs_regulons_aggregated=all_runs_regulons_aggregated, output_dir=args.output) + +# Make the regulon count table and save to regulons.tsv +all_runs_regulon_count_table = make_regulon_count_table(auc_looms=args.auc_looms) +all_runs_regulon_count_table['regulon'].value_counts().to_frame( + name="count" +).to_csv( + path_or_buf=os.path.join(args.output, "regulons.tsv"), + sep="\t", + header=True, + index=True, + index_label="regulon" +) diff --git a/src/scenic/bin/append_SCENIC_results_to_existing_loom.py b/src/scenic/bin/append_SCENIC_results_to_existing_loom.py deleted file mode 100755 index 4ac07d7c..00000000 --- a/src/scenic/bin/append_SCENIC_results_to_existing_loom.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import base64 -import json -import sys -import zlib -from copy import deepcopy -from shutil import copyfile - -import loompy as lp -import numpy as np -import pandas as pd - -################################################################################ -################################################################################ - -parser = argparse.ArgumentParser( - description='Integrate output from pySCENIC with SCope loom from pipeline best-practices path' -) -parser.add_argument( - '--loom_scope', - help='Loom file for SCope from pipeline best-practices path', - required=True, - default='.loom' -) -parser.add_argument( - '--loom_scenic', - help='Loom file from pySCENIC run', - required=True, - default='pyscenic_motif.loom' -) -parser.add_argument( - '--loom_output', - help='Final loom file with pySCENIC results integrated', - required=True, - default='pyscenic.loom' -) -args = parser.parse_args() - - -################################################################################ -################################################################################ - -def dfToNamedMatrix(df): - arr_ip = [tuple(i) for i in df.as_matrix()] - dtyp = np.dtype(list(zip(df.dtypes.index, df.dtypes))) - arr = np.array(arr_ip, dtype=dtyp) - return arr - - -def integrateSCENICloom(args): - ################################################################################ - # load data from scenic loom - ################################################################################ - - # scenic output - lf = lp.connect(args.loom_scenic, mode='r', validate=False) - meta_scenic = json.loads(zlib.decompress(base64.b64decode(lf.attrs.MetaData))) - genes_scenic = lf.ra.Gene - cellid_scenic = lf.ca.CellID - regulonsAUC = lf.ca.RegulonsAUC - regulons = lf.ra.Regulons - - ### capture embeddings: - dr = [ - pd.DataFrame(lf.ca.Embedding, index=lf.ca.CellID) - ] - dr_names = [ - meta_scenic['embeddings'][0]['name'].replace(" ", "_") - ] - - # add other embeddings - drx = pd.DataFrame(lf.ca.Embeddings_X, index=lf.ca.CellID) - dry = pd.DataFrame(lf.ca.Embeddings_Y, index=lf.ca.CellID) - - for i in range(len(drx.columns)): - dr.append(pd.concat([drx.iloc[:, i], dry.iloc[:, i]], sort=False, axis=1, join='outer')) - dr_names.append(meta_scenic['embeddings'][i + 1]['name'].replace(" ", "_").replace('/', '-')) - - # rename columns: - for i, x in enumerate(dr): - x.columns = ['X', 'Y'] - - lf.close() - - ################################################################################ - # copy loom - ################################################################################ - - copyfile(args.loom_scope, args.loom_output) - - ################################################################################ - # add scenic data - ################################################################################ - - lf = lp.connect(args.loom_output, mode='r+', validate=False) - - if not all(lf.ca.CellID == cellid_scenic): - sys.exit(f"ERROR: Column attribute CellIDs does not match between {args.loom_scope} and {args.loom_scenic}") - if not all(lf.ra.Gene == genes_scenic): - sys.exit(f"ERROR: Row attribute 'Gene' does not match between {args.loom_scope} and {args.loom_scenic}") - - # write regulon information: - lf.ca['RegulonsAUC'] = regulonsAUC - lf.ra['Regulons'] = regulons - - # get existing metadata: - meta = json.loads(zlib.decompress(base64.b64decode(lf.attrs.MetaData))) - - # append regulon thresholds: - meta['regulonThresholds'] = meta_scenic['regulonThresholds'] - - embeddings_start_index = 1 - if len(meta['embeddings']) > 1: - embeddings_start_index = int(meta['embeddings'][-1]['id']) - - # append embedding labels: - id = deepcopy(embeddings_start_index) - for i, x in enumerate(dr_names): - id += 1 - meta['embeddings'].append({'id': id, 'name': x}) - - # get existing embeddings: - Embeddings_X = pd.DataFrame(lf.ca.Embeddings_X, index=lf.ca.CellID) - Embeddings_Y = pd.DataFrame(lf.ca.Embeddings_Y, index=lf.ca.CellID) - - # append scenic embeddings: - id = deepcopy(embeddings_start_index) - for i, x in enumerate(dr): - id += 1 - Embeddings_X[str(id)] = dr[i].iloc[:, 0] - Embeddings_Y[str(id)] = dr[i].iloc[:, 1] - - lf.ca.Embeddings_X = dfToNamedMatrix(Embeddings_X) - lf.ca.Embeddings_Y = dfToNamedMatrix(Embeddings_Y) - - lf.attrs['MetaData'] = base64.b64encode(zlib.compress(json.dumps(meta).encode('ascii'))).decode('ascii') - - lf.close() - - -if __name__ == "__main__": - integrateSCENICloom(args) diff --git a/src/scenic/bin/append_results_to_existing_loom.py b/src/scenic/bin/append_results_to_existing_loom.py new file mode 100755 index 00000000..8e19372b --- /dev/null +++ b/src/scenic/bin/append_results_to_existing_loom.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +import argparse +import base64 +import json +import sys +import zlib +from copy import deepcopy +from shutil import copyfile + +import loompy as lp +import numpy as np +import pandas as pd +import export_to_loom + +################################################################################ +################################################################################ + +parser = argparse.ArgumentParser( + description='Integrate output from pySCENIC with SCope loom from pipeline best-practices path' +) +parser.add_argument( + '--loom_scope', + help='Loom file for SCope from pipeline best-practices path', + required=True, + default='.loom' +) +parser.add_argument( + '--loom_scenic', + help='Loom file from pySCENIC run', + required=True, + default='pyscenic_motif.loom' +) +parser.add_argument( + '--loom_output', + help='Final loom file with pySCENIC results integrated', + required=True, + default='pyscenic.loom' +) +args = parser.parse_args() + + +################################################################################ +################################################################################ + +def append_to_existing_loom(args): + + ################################################################################ + # Merge looms + ################################################################################ + + scope_loom = export_to_loom.SCopeLoom.read_loom(filename=args.loom_scope) + scenic_loom = export_to_loom.SCopeLoom.read_loom(filename=args.loom_scenic) + scope_loom.merge(loom=scenic_loom) + scope_loom.export(out_fname=args.loom_output) + + +if __name__ == "__main__": + append_to_existing_loom(args) diff --git a/src/scenic/bin/aucell_genesigs_from_folder.py b/src/scenic/bin/aucell_from_folder.py similarity index 96% rename from src/scenic/bin/aucell_genesigs_from_folder.py rename to src/scenic/bin/aucell_from_folder.py index 662b1a6f..fb629a08 100755 --- a/src/scenic/bin/aucell_genesigs_from_folder.py +++ b/src/scenic/bin/aucell_from_folder.py @@ -51,10 +51,10 @@ ' of a feature as fraction of ranked genes (default: {}).'.format(0.05) ) parser_grn.add_argument( - '--gene-occurence-threshold', + '--min-regulon-gene-occurrence', type=int, default=5, - dest="gene_occurence_threshold", + dest="min_regulon_gene_occurrence", help='The threshold used for filtering the genes bases on their occurrence (default: {}).'.format(5) ) parser_grn.add_argument( @@ -90,7 +90,7 @@ signatures = utils.read_signatures_from_tsv_dir( dpath=args.signatures_fname, noweights=False, - weight_threshold=args.gene_occurence_threshold, + weight_threshold=args.min_regulon_gene_occurrence, min_genes=args.min_genes ) auc_threshold = args.auc_threshold diff --git a/src/scenic/bin/convert_multi_runs_features_to_regulons.py b/src/scenic/bin/convert_multi_runs_features_to_regulons.py new file mode 100755 index 00000000..f8e6d3cb --- /dev/null +++ b/src/scenic/bin/convert_multi_runs_features_to_regulons.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 + +import argparse +import re +import sys +import pandas as pd +import pickle +import gzip +from pyscenic import transform +from pyscenic.transform import COLUMN_NAME_NES +from pyscenic.utils import COLUMN_NAME_MOTIF_SIMILARITY_QVALUE, COLUMN_NAME_ORTHOLOGOUS_IDENTITY, \ + COLUMN_NAME_ANNOTATION +import time +import utils + +################################################################################ +# TODO: +# This implementation should be optimized: +# It's taking several hours to run +################################################################################ + +parser_grn = argparse.ArgumentParser(description='Transform aggregated motif enrichment table to regulons') + +parser_grn.add_argument( + 'motif_enrichment_table_fname', + type=argparse.FileType('r'), + help='The name of the file that contains the motif enrichments.' +) +parser_grn.add_argument( + 'signatures_fname', + help='The name of the folder containing the signatures as TSV files.' +) + +parser_grn.add_argument( + '-o', '--output', + type=argparse.FileType('w'), + default=sys.stdout, + help='Output file/stream, i.e. a pickle file containing regulons inferred from motif enrichment table.' +) +# parser_grn.add_argument( +# '--min-genes-regulon', +# type=int, +# default=5, +# dest="min_genes_regulon", +# help='The threshold used for filtering the regulons based on the number of targets (default: {}).'.format(5) +# ) +# parser_grn.add_argument( +# '--min-regulon-gene-occurrence', +# type=int, +# default=5, +# dest="min_regulon_gene_occurrence", +# help='The threshold used for filtering the genes bases on their occurrence (default: {}).'.format(5) +# ) + +args = parser_grn.parse_args() + +# Transform motif enrichment table (generated from the cisTarget step) to regulons +print(f"Reading aggregated motif enrichment table...", flush=True) +start = time.time() +f = args.motif_enrichment_table_fname.name +if f.endswith('.pkl') or f.endswith('.pkl.gz') or f.endswith('.pickle') or f.endswith('.pickle.gz'): + motif_enrichment_table = pd.read_pickle(path=f) +elif f.endswith('.csv') or f.endswith('.csv.gz'): + motif_enrichment_table = utils.read_feature_enrichment_table(fname=args.motif_enrichment_table_fname.name, sep=",") +else: + raise Exception("The aggregated feature enrichment table is in the wrong format. Expecting .pickle or .csv formats.") +print(f"... took {time.time() - start} seconds to run.", flush=True) + +print(f"Making the regulons...", flush=True) +start = time.time() +regulons = transform.df2regulons( + df=motif_enrichment_table, + save_columns=[ + COLUMN_NAME_NES, + COLUMN_NAME_ORTHOLOGOUS_IDENTITY, + COLUMN_NAME_MOTIF_SIMILARITY_QVALUE, + COLUMN_NAME_ANNOTATION + ] +) +print(f"{len(regulons)} regulons from df2regulons.") + +# Read the signatures saved in out/multi_runs_regulons_[mtf|trk] +# Keep all regulons and targets (so that all can be visualized in SCope) +signatures = utils.read_signatures_from_tsv_dir( + dpath=args.signatures_fname, + noweights=False, + weight_threshold=0, + min_genes=0 +) +print(f"{len(signatures)} all regulons from out/multi_runs_regulons_[mtf|trk].") + +# Filter regulons (regulons from motifs enrichment table) by the filtered signatures +regulons = list(filter(lambda x: x.name in list(map(lambda x: x.name, signatures)), regulons)) +# Add gene2occurrence from filtered signatures to regulons +regulons = list( + map( + lambda x: + x.copy(gene2occurrence=list(filter(lambda y: y.name == x.name, signatures))[0].gene2weight), regulons + ) +) +print(f"{len(regulons)} final regulons.") +print(f"Saving...") +with gzip.open(args.output.name, 'wb') as file_handler: + pickle.dump(regulons, file_handler) +print(f"... took {time.time() - start} seconds to run.", flush=True) diff --git a/src/scenic/bin/export_to_loom.py b/src/scenic/bin/export_to_loom.py new file mode 100644 index 00000000..c4160060 --- /dev/null +++ b/src/scenic/bin/export_to_loom.py @@ -0,0 +1,713 @@ +import os +import numpy as np +import pandas as pd +import loompy as lp +from sklearn.manifold.t_sne import TSNE +from pyscenic.aucell import aucell +from pyscenic.genesig import Regulon, GeneSignature +from typing import List, Mapping, Sequence, Optional, Dict +from operator import attrgetter +from multiprocessing import cpu_count +from pyscenic.binarization import binarize +from itertools import chain, repeat, islice +import networkx as nx +import zlib +import base64 +import json +import re +import sys + +from collections import OrderedDict + + +class Embedding: + + def __init__(self, embedding: pd.DataFrame, embedding_name: str, is_default: bool = False): + self.embedding = embedding + self.embedding_name = embedding_name + self._is_default = is_default + + def get_embedding_name(self): + return self.embedding_name + + def get_embedding(self): + return self.embedding + + def is_default(self): + return self._is_default + + +class SCopeLoom: + + def __init__( + self, + filename: str = None, + out_fname: str = None, + ex_mtx: pd.DataFrame = None, + regulons: List[Regulon] = None, + cell_annotations: Optional[Mapping[str, str]] = None, + tree_structure: Sequence[str] = None, + title: Optional[str] = None, + nomenclature: str = "Unknown", + num_workers: int = cpu_count(), + auc_mtx=None, + auc_regulon_weights_key='gene2weight', + auc_thresholds=None, + compress: bool = False, + save_additional_regulon_meta_data: bool = False, # Should be set to true only for multi-runs SCENIC + set_generic_loom: bool = False, + tag: str = None, # Used when merging track and motif-based SCENIC run + col_attrs: Mapping[str, np.ndarray] = None, + row_attrs: Mapping[str, np.ndarray] = None, + global_attrs: Dict = None, + embeddings: Mapping[str, pd.DataFrame] = None + ): + self.filename = filename + self.out_fname = out_fname + self.ex_mtx = ex_mtx + self.regulons = regulons + self.cell_annotations = cell_annotations + self.tree_structure = tree_structure if tree_structure else () + self.title = title + self.nomenclature = nomenclature + self.num_workers = num_workers + self.auc_mtx = auc_mtx + self.auc_regulon_weights_key = auc_regulon_weights_key + self.auc_thresholds = auc_thresholds + self.compress = compress + self.save_additional_regulon_meta_data = save_additional_regulon_meta_data + self.tag = tag + # Loom representation + self.col_attrs = col_attrs if col_attrs else {} + self.row_attrs = row_attrs if row_attrs else {} + self.global_attrs = global_attrs if global_attrs else {} + # Internal representation + self.embeddings = embeddings if embeddings else {} + # Utils + self.id2name = OrderedDict() + # Set Base Loom + if set_generic_loom: + self.set_generic_loom() + + def set_generic_loom(self): + if(self.regulons[0].name.find('_') == -1): + print( + "Regulon name does not seem to be compatible with SCOPE. It should include a space to allow selection of the TF.", + "\nPlease run: \n regulons = [r.rename(r.name.replace('(+)','_('+str(len(r))+'g)')) for r in regulons]", + "\nor:\n regulons = [r.rename(r.name.replace('(','_(')) for r in regulons]" + ) + self.set_cell_annotations() + if self.auc_mtx is None: + self.auc_mtx = self.calculate_regulon_enrichment() + if self.auc_thresholds is None: + self.auc_thresholds = self.binarize_regulon_enrichment() + if len(self.embeddings.keys()) == 0: + self.create_loom_default_embedding() + self.regulon_gene_assignment = self.create_loom_regulon_gene_assignment() + self.ngenes = self.calculate_nb_genes_per_cell() + self.clusterings = self.create_loom_clusterings() + self.regulon_thresholds = self.create_loom_md_regulon_thresholds() + self.set_generic_col_attrs() + self.set_generic_row_attrs() + self.set_generic_global_attrs() + self.set_tree() + + ####### + # I/O # + ####### + + @staticmethod + def read_loom(filename: str, tag: str = None): + with lp.connect(filename, mode='r', validate=False) as loom: + + # Load the content into memory + # Set the main matrix + ex_mtx = pd.DataFrame(loom[:, :], index=loom.ra.Gene, columns=loom.ca.CellID).T + # Set the column, row and global attribute using the underlying Dict of the AttributeManager + col_attrs = {k: v for k, v in loom.ca.items()} + row_attrs = {k: v for k, v in loom.ra.items()} + global_attrs = {k: v for k, v in loom.attrs.items()} + # Decompress and decode the MetaData global attribute + try: + global_attrs["MetaData"] = SCopeLoom.decompress_decode(value=global_attrs["MetaData"]) + except Exception: + # MetaData is uncompressed + global_attrs["MetaData"] = json.loads(global_attrs["MetaData"]) + + scope_loom = SCopeLoom( + filename=filename, + ex_mtx=ex_mtx, + col_attrs=col_attrs, + row_attrs=row_attrs, + global_attrs=global_attrs, + tag=tag + ) + if 'embeddings' in scope_loom.get_meta_data(): + scope_loom.convert_loom_embeddings_repr_to_internal_repr() + return scope_loom + + ############# + # Meta Data # + ############# + + def add_meta_data(self, _dict): + md = self.global_attrs["MetaData"] + meta_data = json.loads(md) + meta_data.update(_dict) + self.global_attrs["MetaData"] = meta_data + + def get_meta_data(self): + return self.global_attrs["MetaData"] + + def set_tree(self): + assert len(self.tree_structure) <= 3, "" + self.global_attrs.update(("SCopeTreeL{}".format(idx + 1), category) for idx, category in enumerate(list(islice(chain(self.tree_structure, repeat("")), 3)))) + + ############# + # Features # + ############# + + def get_genes(self): + return self.row_attrs['Gene'] + + ################ + # Observations # + ################ + + def get_cell_ids(self): + return self.col_attrs['CellID'] + + ############### + # Annotations # + ############### + + def set_cell_annotations(self): + if self.cell_annotations is None: + self.cell_annotations = dict(zip(self.ex_mtx.index.astype(str), ['-'] * self.ex_mtx.shape[0])) + + ########### + # Metrics # + ########### + + def calculate_nb_genes_per_cell(self): + # Calculate the number of genes per cell. + binary_mtx = self.ex_mtx.copy() + binary_mtx[binary_mtx != 0] = 1.0 + return binary_mtx.sum(axis=1).astype(int) + + def add_metrics(self, metrics: List[str]): + md_metrics = [] + for metric in metrics: + md_metrics.append({"name": metric}) + self.global_attrs["MetaData"].update({'metrics': md_metrics}) + + ############## + # Embeddings # + ############## + + @staticmethod + def get_embedding_id(embedding: Embedding, _list): + """ Returns the appropriate index as a string given the _list + + Parameters: + None + + Returns: + str: Returns -1 if the given embedding is the default, 0 if the given _list is empty and length of the given _list minus 1 + + """ + if embedding.is_default(): + return '-1' + elif len(_list) == '0': + return '0' + else: + return str(len(_list) - 1) + + def convert_loom_embeddings_repr_to_internal_repr(self): + for embedding in self.get_meta_data()['embeddings']: + self.add_embedding( + embedding=self.get_embedding_by_id(embedding_id=embedding['id']), + embedding_name=embedding['name'], + is_default=True if str(embedding['id']) == '-1' else False + ) + + def get_embedding_by_id(self, embedding_id): + if str(embedding_id) == '-1': + return self.col_attrs['Embedding'] + x = self.col_attrs['Embeddings_X'][str(embedding_id)] + y = self.col_attrs['Embeddings_Y'][str(embedding_id)] + return np.column_stack((x, y)) + + def add_embedding(self, embedding: np.ndarray, embedding_name, is_default: bool = False): + df_embedding = pd.DataFrame(embedding, columns=['_X', '_Y'], index=self.ex_mtx.index) + _embedding = Embedding(embedding=df_embedding, embedding_name=embedding_name, is_default=is_default) + if is_default: + self.default_embedding = _embedding + self.embeddings[embedding_name] = _embedding + + def create_loom_default_embedding(self): + # Create an embedding based on tSNE. + # Name of columns should be "_X" and "_Y". + self.add_embedding(embedding=TSNE().fit_transform(self.auc_mtx), embedding_name="tSNE (default)", is_default=True) + + def create_loom_md_embeddings_repr(self): + """ Returns a Dictionary (Dict) for the global meta data embeddings with preformated data to be stored in Loom file format and compatible with SCope. + + Parameters: + None + + Returns: + dict: A Dictionary (Dict) for the global meta data embeddings with preformated data to be stored in Loom file format and compatible with SCope. + + """ + md_embeddings = [] + for _, embedding in self.embeddings.items(): + if embedding.is_default(): + md_embeddings = md_embeddings + [{'id': '-1', 'name': embedding.get_embedding_name()}] + else: + md_embeddings = md_embeddings + [ + { + 'id': SCopeLoom.get_embedding_id(embedding=embedding, _list=md_embeddings), + 'name': embedding.get_embedding_name() + } + ] + return {"embeddings": md_embeddings} + + def create_loom_ca_embeddings_repr(self): + """ Returns a Dictionary (Dict) for the embeddings with preformated data to be stored in Loom file format and compatible with SCope. + + Parameters: + None + + Returns: + dict: A Dictionary (Dict) for the embeddings with preformated data to be stored in Loom file format and compatible with SCope. + + """ + default_embedding = None + embeddings_X = pd.DataFrame(index=self.ex_mtx.index) + embeddings_Y = pd.DataFrame(index=self.ex_mtx.index) + + for _, embedding in self.embeddings.items(): + if(embedding.get_embedding().shape[1] != 2): + raise Exception('The embedding should have two columns.') + # Set the default embedding + if embedding.is_default(): + default_embedding = embedding.get_embedding() + # Update the Embeddings_[X|Y] + embedding_id = str(SCopeLoom.get_embedding_id(embedding, embeddings_X.columns)) + embedding = embedding.get_embedding().copy() + embedding.columns = ['_X', '_Y'] + embeddings_X = pd.merge( + embeddings_X, + embedding['_X'].to_frame().rename( + columns={'_X': embedding_id} + ).astype('float32'), left_index=True, right_index=True) + embeddings_Y = pd.merge( + embeddings_Y, + embedding['_Y'].to_frame().rename( + columns={'_Y': embedding_id} + ).astype('float32'), left_index=True, right_index=True) + return { + "Embedding": SCopeLoom.df_to_named_matrix(df=default_embedding), + "Embeddings_X": SCopeLoom.df_to_named_matrix(df=embeddings_X), + "Embeddings_Y": SCopeLoom.df_to_named_matrix(df=embeddings_Y) + } + + ############### + # Clusterings # + ############### + + def create_loom_clusterings(self): + # Encode cell type clusters. + # The name of the column should match the identifier of the clustering. + return pd.DataFrame( + data=self.ex_mtx.index.values, + index=self.ex_mtx.index, + columns=['0'] + ).replace(self.cell_annotations).replace(self.name2idx()) + + ########## + # SCENIC # + ########## + + def has_scenic_multi_runs_data(self): + return 'RegulonGeneOccurrences' in self.row_attrs.keys() and 'RegulonGeneWeights' in self.row_attrs.keys() + + def scopify_md_regulon_data(self): + # Fix regulon objects to display properly in SCope: + # Rename regulons in the thresholds object, motif + md_regulon_thresholds = self.global_attrs['MetaData']['regulonThresholds'] + for _, x in enumerate(md_regulon_thresholds): + tmp = re.sub(r"(_?)\(", '_(', x.get('regulon')) # + '-motif' + x.update({'regulon': tmp}) + return { + 'regulonThresholds': md_regulon_thresholds + } + + def scopify_loom_ca_regulon_data(self): + regulons_auc_col_attrs = list(filter(lambda col_attrs_key: 'RegulonsAUC' in col_attrs_key, self.col_attrs.keys())) + + def fix(col_attrs_key): + regulons = pd.DataFrame(self.col_attrs[col_attrs_key], index=self.col_attrs['CellID']) + # Add underscore for SCope compatibility: + regulons.columns = regulons.columns.str.replace('_?\\(', '_(') + return { + col_attrs_key: SCopeLoom.df_to_named_matrix(regulons) + } + # Update the keys + regulons_auc_col_attrs_update = map(fix, regulons_auc_col_attrs) + # Convert list of dict to dict + return {next(iter(x)): x.get(next(iter(x))) for x in regulons_auc_col_attrs_update} + + def scopify_loom_ra_regulon_data(self): + regulons_row_attrs = list(filter(lambda row_attrs_key: 'Regulons' in row_attrs_key, self.row_attrs.keys())) + + def fix(row_attrs_key): + regulons = pd.DataFrame(self.row_attrs[row_attrs_key], index=self.row_attrs['Gene']) + # Add underscore for SCope compatibility: + regulons.columns = regulons.columns.str.replace('_?\\(', '_(') + return { + row_attrs_key: SCopeLoom.df_to_named_matrix(regulons) + } + # Update the keys + regulons_row_attrs_update = map(fix, regulons_row_attrs) + # Convert list of dict to dict + return {next(iter(x)): x.get(next(iter(x))) for x in regulons_row_attrs_update} + + def calculate_regulon_enrichment(self): + # Calculate regulon enrichment per cell using AUCell. + # Create regulons with weight based on given key + print("Using {} to weight the genes when running AUCell.".format(self.auc_regulon_weights_key)) + regulon_signatures = list(map(lambda x: GeneSignature(name=x.name, gene2weight=self.get_regulon_gene_data(x, self.auc_regulon_weights_key)), self.regulons)) + auc_mtx = aucell(self.ex_mtx, regulon_signatures, num_workers=self.num_workers) # (n_cells x n_regulons) + auc_mtx = auc_mtx.loc[self.ex_mtx.index] + return auc_mtx + + def binarize_regulon_enrichment(self): + _, auc_thresholds = binarize(self.auc_mtx) + return auc_thresholds + + def merge_regulon_data(self, scope_loom): + # Check if SCENIC has been run in multi-runs mode + is_multi_runs_mode = self.has_scenic_multi_runs_data() and scope_loom.has_scenic_multi_runs_data() + + # RegulonsAUC + # Relabel columns with suffix indicating the regulon source + auc_mtx = pd.DataFrame(data=self.col_attrs['RegulonsAUC'], index=self.col_attrs['CellID']) + auc_mtx.columns = auc_mtx.columns + '-' + self.tag + + scope_loom_auc_mtx = pd.DataFrame(data=scope_loom.col_attrs['RegulonsAUC'], index=scope_loom.col_attrs['CellID']) + scope_loom_auc_mtx.columns = scope_loom_auc_mtx.columns + '-' + scope_loom.tag + + # Regulons (regulon assignment matrices) + regulons = pd.DataFrame(self.row_attrs['Regulons'], index=self.row_attrs['Gene']) + regulons.columns = regulons.columns + '-' + self.tag + + scope_loom_regulons = pd.DataFrame(scope_loom.row_attrs['Regulons'], index=scope_loom.row_attrs['Gene']) + scope_loom_regulons.columns = scope_loom_regulons.columns + '-' + scope_loom.tag + + # If multi-runs SCENIC + # Rename Regulons Gene Occurrences + if is_multi_runs_mode: + regulon_gene_occurrences = pd.DataFrame(self.row_attrs['RegulonGeneOccurrences'], index=self.row_attrs['Gene']) + regulon_gene_occurrences.columns = regulon_gene_occurrences.columns + '-' + self.tag + + scope_loom_regulon_gene_occurrences = pd.DataFrame(scope_loom.row_attrs['RegulonGeneOccurrences'], index=scope_loom.row_attrs['Gene']) + scope_loom_regulon_gene_occurrences.columns = scope_loom_regulon_gene_occurrences.columns + '-' + scope_loom.tag + + # If multi-runs SCENIC + # Rename Regulons Gene Weights + if is_multi_runs_mode: + regulon_gene_weights = pd.DataFrame(self.row_attrs['RegulonGeneWeights'], index=self.row_attrs['Gene']) + regulon_gene_weights.columns = regulon_gene_weights.columns + '-' + self.tag + + scope_loom_regulon_gene_weights = pd.DataFrame(scope_loom.row_attrs['RegulonGeneWeights'], index=scope_loom.row_attrs['Gene']) + scope_loom_regulon_gene_weights.columns = scope_loom_regulon_gene_weights.columns + '-' + scope_loom.tag + + # Combine meta data regulons + # Rename regulons in the thresholds object, motif + rt = self.global_attrs["MetaData"]["regulonThresholds"] + for _, x in enumerate(rt): + tmp = x.get('regulon') + '-' + self.tag + x.update({'regulon': tmp}) + + # Rename regulons in the thresholds object, track + scope_loom_rt = scope_loom.global_attrs["MetaData"]["regulonThresholds"] + for _, x in enumerate(scope_loom_rt): + tmp = x.get('regulon') + '-' + scope_loom.tag + x.update({'regulon': tmp}) + # blank out the "motifData" field for track-based regulons: + x.update({'mofitData': 'NA.png'}) + + # merge regulon threshold dictionaries: + rt_merged = rt + scope_loom_rt + + # Remove because we will save them separately + del self.row_attrs["Regulons"] + del self.col_attrs["RegulonsAUC"] + + if is_multi_runs_mode: + del self.row_attrs["RegulonGeneOccurrences"] + del self.row_attrs["RegulonGeneWeights"] + + # Update the attributes + self.row_attrs.update({f'{self.tag.capitalize()}Regulons': SCopeLoom.df_to_named_matrix(regulons)}) + self.col_attrs.update({f'{self.tag.capitalize()}RegulonsAUC': SCopeLoom.df_to_named_matrix(auc_mtx)}) + + if is_multi_runs_mode: + self.row_attrs.update({f'{self.tag.capitalize()}RegulonGeneOccurrences': SCopeLoom.df_to_named_matrix(regulon_gene_occurrences)}) + self.row_attrs.update({f'{self.tag.capitalize()}RegulonGeneWeights': SCopeLoom.df_to_named_matrix(regulon_gene_weights)}) + + self.row_attrs.update({f'{scope_loom.tag.capitalize()}Regulons': SCopeLoom.df_to_named_matrix(scope_loom_regulons)}) + self.col_attrs.update({f'{scope_loom.tag.capitalize()}RegulonsAUC': SCopeLoom.df_to_named_matrix(scope_loom_auc_mtx)}) + + if is_multi_runs_mode: + self.row_attrs.update({f'{scope_loom.tag.capitalize()}RegulonGeneOccurrences': SCopeLoom.df_to_named_matrix(scope_loom_regulon_gene_occurrences)}) + self.row_attrs.update({f'{scope_loom.tag.capitalize()}RegulonGeneWeights': SCopeLoom.df_to_named_matrix(scope_loom_regulon_gene_weights)}) + + self.global_attrs["MetaData"].update({'regulonThresholds': rt_merged}) + + def get_regulon_gene_data(self, regulon, key): + if key == 'gene2occurrence': + return regulon.gene2occurrence + if key == 'gene2weight': + return regulon.gene2weight + raise Exception('Cannot retrieve {} from given regulon. Not implemented.'.format(key)) + + def get_regulon_meta_data(self, name, threshold): + + def fetch_logo(context): + for elem in context: + if elem.endswith('.png'): + return elem + return "" + + name2logo = {reg.name: fetch_logo(reg.context) for reg in self.regulons} + + regulon = list(filter(lambda x: x.name == name, self.regulons))[0] + regulon_meta_data = { + "regulon": name, + "defaultThresholdValue": (threshold if isinstance(threshold, float) else threshold[0]), + "defaultThresholdName": "guassian_mixture_split", + "allThresholds": { + "guassian_mixture_split": (threshold if isinstance(threshold, float) else threshold[0]) + }, + "motifData": name2logo.get(name, "") + } + if self.save_additional_regulon_meta_data: + regulon_meta_data.update({ + "tf": regulon.transcription_factor, + "score": regulon.score if hasattr(regulon, 'score') else 0.0, + "orthologousIdentity": regulon.orthologous_identity if hasattr(regulon, 'orthologous_identity') else 0.0, + "annotation": regulon.annotation if hasattr(regulon, 'annotation') else '', + "similarityQValue": regulon.score if hasattr(regulon, 'similarity_qvalue') else 0.0 + }) + return regulon_meta_data + + def create_loom_md_regulon_thresholds(self): + return [self.get_regulon_meta_data(name, threshold) for name, threshold in self.auc_thresholds.iteritems()] + + def create_loom_regulon_gene_assignment(self): + # Encode genes in regulons as "binary" membership matrix. + genes = np.array(self.ex_mtx.columns) + n_genes = len(genes) + n_regulons = len(self.regulons) + data = np.zeros(shape=(n_genes, n_regulons), dtype=int) + for idx, regulon in enumerate(self.regulons): + data[:, idx] = np.isin(genes, regulon.genes).astype(int) + regulon_gene_assignment = pd.DataFrame( + data=data, + index=self.ex_mtx.columns, + columns=list(map(attrgetter('name'), self.regulons)) + ) + return regulon_gene_assignment + + # Multi-runs SCENIC additional data + + def encode_regulon_gene_data(self, key): + genes = np.array(self.ex_mtx.columns) + n_genes = len(genes) + n_regulons = len(self.regulons) + data = np.zeros(shape=(n_genes, n_regulons), dtype=float) + for idx, regulon in enumerate(self.regulons): + regulon_data = pd.DataFrame.from_dict(self.get_regulon_gene_data(regulon, key), orient='index') + data[np.isin(genes, regulon_data.index), idx] = regulon_data[0][genes[np.isin(genes, regulon_data.index)]] + return data + + def create_loom_regulon_gene_data(self, key): + return pd.DataFrame( + data=self.encode_regulon_gene_data(key=key), + index=self.ex_mtx.columns, + columns=list(map(attrgetter('name'), self.regulons)) + ) + + def add_row_attr_regulon_gene_weights(self): + regulon_gene_weights = self.create_loom_regulon_gene_data(key='gene2weight') + if 'RegulonGeneWeights' not in self.row_attrs.keys(): + print("Added 'RegulonGeneWeights' to the row attributes.") + self.row_attrs['RegulonGeneWeights'] = SCopeLoom.df_to_named_matrix(regulon_gene_weights) + + def add_row_attr_regulon_gene_occurrences(self): + regulon_gene_occurrences = self.create_loom_regulon_gene_data(key='gene2occurrence') + if 'RegulonGeneOccurrences' not in self.row_attrs.keys(): + print("Added 'RegulonGeneOccurrences' to the row attributes.") + self.row_attrs['RegulonGeneOccurrences'] = SCopeLoom.df_to_named_matrix(regulon_gene_occurrences) + + ########### + # Generic # + ########### + + def name2idx(self): + return dict(map(reversed, enumerate(sorted(set(self.cell_annotations.values()))))) + + def set_generic_col_attrs(self): + self.col_attrs = { + "CellID": self.ex_mtx.index.values.astype('str'), + "nGene": self.ngenes.values, + "Embedding": SCopeLoom.df_to_named_matrix(self.default_embedding.get_embedding()), + "RegulonsAUC": SCopeLoom.df_to_named_matrix(self.auc_mtx), + "Clusterings": SCopeLoom.df_to_named_matrix(self.clusterings), + "ClusterID": self.clusterings.values + } + + def set_generic_row_attrs(self): + self.row_attrs = { + "Gene": self.ex_mtx.columns.values.astype('str'), + "Regulons": SCopeLoom.df_to_named_matrix(self.regulon_gene_assignment) + } + + def set_generic_global_attrs(self): + self.global_attrs = { + "title": os.path.splitext(os.path.basename(self.out_fname))[0] if self.title is None else self.title, + "MetaData": json.dumps({ + "embeddings": [{'id': identifier, 'name': name} for identifier, name in self.id2name.items()], + "annotations": [{ + "name": "", + "values": [] + }], + "clusterings": [{ + "id": 0, + "group": "celltype", + "name": "Cell Type", + "clusters": [{"id": idx, "description": name} for name, idx in self.name2idx().items()] + }], + "regulonThresholds": self.regulon_thresholds + }), + "Genome": self.nomenclature + } + + def merge(self, loom): + # Check all the cells and genes are in the same order + if not all(self.get_cell_ids() == loom.get_cell_ids()): + sys.exit(f"ERROR: Column attribute CellIDs does not match between {os.path.basename(self.filename)} and {os.path.basename(loom.filename)}") + if not all(self.get_genes() == loom.get_genes()): + sys.exit(f"ERROR: Row attribute 'Gene' does not match between {os.path.basename(self.filename)} and {os.path.basename(loom.filename)}") + + # Add the embeddings of the given loom (SCopeLoom) to this SCopeLoom + if 'embeddings' in loom.get_meta_data(): + + print(f"Merging embeddings from {os.path.basename(loom.filename)} into {os.path.basename(self.filename)}...") + for embedding in loom.get_meta_data()['embeddings']: + self.add_embedding( + embedding_name=embedding['name'], + embedding=loom.get_embedding_by_id(embedding_id=embedding['id']) + ) + print("Done.") + + # Add SCENIC results if present in the given loom + if any('Regulon' in s for s in loom.row_attrs.keys()) and any('Regulon' in s for s in loom.col_attrs.keys()): + + print(f"Merging SCENIC results from {os.path.basename(loom.filename)} into {os.path.basename(self.filename)}...") + + # Regulon information from row attributes + # All the row attributes containing the substring 'Regulon' within the given SCopeLoom + # will be merged with this SCopeLoom + regulons_row_attrs_keys = list(filter(lambda row_attrs_key: 'Regulon' in row_attrs_key, loom.row_attrs.keys())) + regulons_row_attrs = { + regulons_row_attr_key: loom.row_attrs[regulons_row_attr_key] for regulons_row_attr_key in regulons_row_attrs_keys + } + self.row_attrs.update(regulons_row_attrs) + # Regulon information from column attributes + # All the column attributes containing the substring 'Regulon' within the given SCopeLoom + # will be merged with this SCopeLoom + regulons_auc_col_attrs_keys = list(filter(lambda col_attrs_key: 'Regulon' in col_attrs_key, loom.col_attrs.keys())) + regulons_auc_col_attrs = { + regulons_auc_col_attr_key: loom.col_attrs[regulons_auc_col_attr_key] for regulons_auc_col_attr_key in regulons_auc_col_attrs_keys + } + self.col_attrs.update(regulons_auc_col_attrs) + # regulonThresholds MetaData global attribute + self.global_attrs["MetaData"].update( + { + 'regulonThresholds': loom.get_meta_data()['regulonThresholds'] + } + ) + + print("Done.") + + def export(self, out_fname: str, save_embeddings: bool = True, compress_meta_data: bool = False): + + if out_fname is None: + raise ValueError("The given out_fname cannot be None.") + + ############## + # Embeddings # + ############## + if save_embeddings: + self.col_attrs.update(self.create_loom_ca_embeddings_repr()) + self.global_attrs["MetaData"].update(self.create_loom_md_embeddings_repr()) + + ########## + # SCENIC # + ########## + if any('Regulons' in s for s in self.row_attrs.keys()): + self.row_attrs.update(self.scopify_loom_ra_regulon_data()) + if any('RegulonsAUC' in s for s in self.col_attrs.keys()): + self.col_attrs.update(self.scopify_loom_ca_regulon_data()) + if 'regulonThresholds' in self.global_attrs["MetaData"].keys(): + self.global_attrs["MetaData"].update(self.scopify_md_regulon_data()) + + # Compress MetaData global attribute + # (Should be compressed if Loompy version 2) + self.global_attrs["MetaData"] = json.dumps(self.global_attrs["MetaData"]) + if compress_meta_data: + self.global_attrs["MetaData"] = SCopeLoom.compress_encode(value=self.global_attrs["MetaData"]) + + # Create loom file for use with the SCope tool. + # The loom file format opted for rows as genes to facilitate growth along the column axis (i.e add more cells) + # PySCENIC chose a different orientation because of limitation set by the feather format: selectively reading + # information from disk can only be achieved via column selection. For the ranking databases this is of utmost + # importance. + lp.create( + filename=out_fname, + layers=self.ex_mtx.T.values, + row_attrs=self.row_attrs, + col_attrs=self.col_attrs, + file_attrs=self.global_attrs + ) + + ######### + # Utils # + ######### + + @staticmethod + def df_to_named_matrix(df: pd.DataFrame): + # Create meta-data structure. + # Create a numpy structured array + return np.array([tuple(row) for row in df.values], + dtype=np.dtype(list(zip(df.columns, df.dtypes)))) + + @staticmethod + def compress_encode(value): + ''' + Compress using ZLIB algorithm and encode the given value in base64. + Taken from: https://github.com/aertslab/SCopeLoomPy/blob/5438da52c4bcf48f483a1cf378b1eaa788adefcb/src/scopeloompy/utils/__init__.py#L7 + ''' + return base64.b64encode(zlib.compress(value.encode('ascii'))).decode('ascii') + + @staticmethod + def decompress_decode(value): + try: + value = value.decode('ascii') + return json.loads(zlib.decompress(base64.b64decode(value))) + except AttributeError: + return json.loads(zlib.decompress(base64.b64decode(value.encode('ascii'))).decode('ascii')) diff --git a/src/scenic/bin/merge_SCENIC_motif_track_loom.py b/src/scenic/bin/merge_SCENIC_motif_track_loom.py deleted file mode 100755 index 18f414e5..00000000 --- a/src/scenic/bin/merge_SCENIC_motif_track_loom.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import base64 -import json -import zlib - -import loompy as lp -import numpy as np -import pandas as pd - -################################################################################ -################################################################################ - -parser = argparse.ArgumentParser(description='Integrate output from pySCENIC motif- and track-based runs') -parser.add_argument( - '--loom_motif', - help='Loom file from pySCENIC motif run', - required=True, - default='pyscenic_motif.loom' -) -parser.add_argument( - '--loom_track', - help='Loom file from pySCENIC track run', - required=True, - default='pyscenic_track.loom' -) -parser.add_argument( - '--loom_output', - help='Final loom file with pySCENIC motif and track results integrated', - required=True, - default='pyscenic.loom' -) -args = parser.parse_args() - - -################################################################################ -################################################################################ - - -def df_to_named_matrix(df): - arr_ip = [tuple(i) for i in df.as_matrix()] - dtyp = np.dtype(list(zip(df.dtypes.index, df.dtypes))) - arr = np.array(arr_ip, dtype=dtyp) - return arr - - -def integrate_motif_track(args): - ################################################################################ - # load data from loom - ################################################################################ - - # scenic motif output - lf = lp.connect(args.loom_motif, mode='r', validate=False) - meta_mtf = json.loads(zlib.decompress(base64.b64decode(lf.attrs.MetaData))) - auc_mtx_mtf = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID) - regulons_mtf = pd.DataFrame(lf.ra.Regulons, index=lf.ra.Gene) - lf.close() - - # scenic track output - lf = lp.connect(args.loom_track, mode='r', validate=False) - meta_trk = json.loads(zlib.decompress(base64.b64decode(lf.attrs.MetaData))) - auc_mtx_trk = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID) - regulons_trk = pd.DataFrame(lf.ra.Regulons, index=lf.ra.Gene) - lf.close() - - ################################################################################ - # Fix track auc mtx names: - ################################################################################ - - # relabel columns with suffix indicating the regulon source - auc_mtx_trk.columns = auc_mtx_trk.columns + '-track' - auc_mtx_mtf.columns = auc_mtx_mtf.columns + '-motif' - - # merge the AUC matrices: - auc_mtx = pd.concat([auc_mtx_mtf, auc_mtx_trk], sort=False, axis=1, join='outer') - - # fill NAs (if any) with 0s: - auc_mtx.fillna(0, inplace=True) - - ################################################################################ - # Combine track and motif - ################################################################################ - - # combine regulon assignment matrices: - regulons_trk.columns = regulons_trk.columns + '-track' - regulons_mtf.columns = regulons_mtf.columns + '-motif' - - # merge the regulon assignment matrices: - regulons = pd.concat([regulons_mtf, regulons_trk], sort=False, axis=1, join='outer') - - # replace NAs with 0s: - regulons.fillna(0, inplace=True) - - # Rename regulons in the thresholds object, motif - rt_mtf = meta_mtf['regulonThresholds'] - for i, x in enumerate(rt_mtf): - tmp = x.get('regulon') + '-motif' - x.update({'regulon': tmp}) - - # Rename regulons in the thresholds object, track - rt_trk = meta_trk['regulonThresholds'] - for i, x in enumerate(rt_trk): - tmp = x.get('regulon') + '-track' - x.update({'regulon': tmp}) - # blank out the "motifData" field for track-based regulons: - x.update({'mofitData': 'NA.png'}) - - # merge regulon threshold dictionaries: - rt = rt_mtf + rt_trk - - ################################################################################ - # metadata - ################################################################################ - - metadata = {} - - metadata["metrics"] = [ - { - "name": "nUMI" - }, { - "name": "nGene" - }, { - "name": "Percent_mito" - } - ] - - # SCENIC regulon thresholds: - metadata["regulonThresholds"] = rt - - ################################################################################ - # row/column attributes - ################################################################################ - - # re-open the connection to the loom file to copy the original expression data - lf = lp.connect(args.loom_motif, mode='r', validate=False) - - col_attrs = { - "CellID": lf.ca.CellID, # np.array(adata.obs.index), - "RegulonsAUC": df_to_named_matrix(auc_mtx), - } - - row_attrs = { - "Gene": lf.ra.Gene, - "Regulons": df_to_named_matrix(regulons), - } - - attrs = { - "MetaData": base64.b64encode(zlib.compress(json.dumps(metadata).encode('ascii'))).decode('ascii') - } - - if "SCopeTreeL1" in attrs.keys(): - attrs['SCopeTreeL1'] = lf.attrs.SCopeTreeL1 - if "SCopeTreeL2" in attrs.keys(): - attrs['SCopeTreeL2'] = lf.attrs.SCopeTreeL2 - if "SCopeTreeL3" in attrs.keys(): - attrs['SCopeTreeL3'] = lf.attrs.SCopeTreeL3 - - lp.create( - filename=args.loom_output, - layers=lf[:, :], - row_attrs=row_attrs, - col_attrs=col_attrs, - file_attrs=attrs - ) - lf.close() - - -if __name__ == "__main__": - integrate_motif_track(args) - diff --git a/src/scenic/bin/merge_motif_track_loom.py b/src/scenic/bin/merge_motif_track_loom.py new file mode 100755 index 00000000..16ba519e --- /dev/null +++ b/src/scenic/bin/merge_motif_track_loom.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 + +import argparse +import base64 +import json +import zlib + +import loompy as lp +import numpy as np +import pandas as pd + +import export_to_loom + +################################################################################ +################################################################################ + +parser = argparse.ArgumentParser(description='Integrate output from pySCENIC motif- and track-based runs') +parser.add_argument( + '--loom_motif', + help='Loom file from pySCENIC motif run', + required=True, + default='pyscenic_motif.loom' +) +parser.add_argument( + '--loom_track', + help='Loom file from pySCENIC track run', + required=True, + default='pyscenic_track.loom' +) +parser.add_argument( + '--loom_output', + help='Final loom file with pySCENIC motif and track results integrated', + required=True, + default='pyscenic.loom' +) +args = parser.parse_args() + + +################################################################################ +################################################################################ + +def integrate_motif_track(args): + + ################################################################################ + # load data from loom + ################################################################################ + + # scenic motif output + mtf_scope_loom = export_to_loom.SCopeLoom.read_loom(filename=args.loom_motif, tag='motif') + # scenic track output + trk_scope_loom = export_to_loom.SCopeLoom.read_loom(filename=args.loom_track, tag='track') + + ################################################################################ + # merge scenic outputs + ################################################################################ + + # Merge regulon data from scenic track output into scenic motif output + mtf_scope_loom.merge_regulon_data(scope_loom=trk_scope_loom) + mtf_scope_loom.add_metrics(metrics=["nUMI", "nGene", "Percent_mito"]) + mtf_scope_loom.export(out_fname=args.loom_output, save_embeddings=False) + + +if __name__ == "__main__": + integrate_motif_track(args) diff --git a/src/scenic/bin/reports/scenic_report.ipynb b/src/scenic/bin/reports/scenic_report.ipynb index 59676e73..879ab7e7 100644 --- a/src/scenic/bin/reports/scenic_report.ipynb +++ b/src/scenic/bin/reports/scenic_report.ipynb @@ -149,9 +149,18 @@ "source": [ "# scenic output\n", "lf = lp.connect( FILE, mode='r', validate=False )\n", - "meta = json.loads(zlib.decompress(base64.b64decode( lf.attrs.MetaData )))\n", + "meta = json.loads(lf.attrs[\"MetaData\"])\n", "#exprMat = pd.DataFrame( lf[:,:], index=lf.ra.Gene, columns=lf.ca.CellID).T\n", - "auc_mtx = pd.DataFrame( lf.ca.RegulonsAUC, index=lf.ca.CellID)" + "if \"RegulonsAUC\" in lf.ca.keys():\n", + " auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID)\n", + "else:\n", + " print(\"Loom with motif & track regulons detected, merging the regulons AUC matrices...\")\n", + " mtf_auc_mtx = pd.DataFrame(lf.ca.MotifRegulonsAUC, index=lf.ca.CellID)\n", + " trk_auc_mtx = pd.DataFrame(lf.ca.TrackRegulonsAUC, index=lf.ca.CellID)\n", + " # merge the AUC matrices:\n", + " auc_mtx = pd.concat([mtf_auc_mtx, trk_auc_mtx], sort=False, axis=1, join='outer')\n", + " # fill NAs (if any) with 0s:\n", + " auc_mtx.fillna(0, inplace=True)" ] }, { @@ -162,8 +171,19 @@ "source": [ "# create a dictionary of regulons:\n", "regulons = {}\n", - "for i,r in pd.DataFrame(lf.ra.Regulons,index=lf.ra.Gene).iteritems():\n", - " regulons[i] = list(r[r==1].index.values)" + "if \"Regulons\" in lf.ra.keys():\n", + " for i,r in pd.DataFrame(lf.ra.Regulons,index=lf.ra.Gene).iteritems():\n", + " regulons[i] = list(r[r==1].index.values)\n", + "elif \"MotifRegulons\" in lf.ra.keys() and \"TrackRegulons\" in lf.ra.keys():\n", + " if \"MotifRegulons\" in lf.ra.keys():\n", + " for i,r in pd.DataFrame(lf.ra.MotifRegulons,index=lf.ra.Gene).iteritems():\n", + " regulons[i] = list(r[r==1].index.values)\n", + " if \"TrackRegulons\" in lf.ra.keys():\n", + " for i,r in pd.DataFrame(lf.ra.TrackRegulons,index=lf.ra.Gene).iteritems():\n", + " regulons[i] = list(r[r==1].index.values)\n", + "else:\n", + " print(f\"No regulon information found in the loom: {FILE}...\")\n", + " exit(1)" ] }, { @@ -185,8 +205,16 @@ "dry = pd.DataFrame( lf.ca.Embeddings_Y, index=lf.ca.CellID )\n", "\n", "for i in range( len(drx.columns) ):\n", - " dr.append( pd.concat( [ drx.iloc[:,i], dry.iloc[:,i] ], sort=False, axis=1, join='outer' ))\n", - " dr_names.append( meta['embeddings'][i+1]['name'].replace(\" \",\"_\").replace('/','-') )\n", + " if drx.columns[i] == '-1':\n", + " continue\n", + " dr.append(\n", + " pd.concat(\n", + " [ drx.iloc[:,i], dry.iloc[:,i] ],\n", + " sort=False, axis=1, join='outer' )\n", + " )\n", + " dr_names.append(\n", + " list(filter(lambda e: str(e['id']) == drx.columns[i], meta['embeddings']))[0]['name']\n", + " )\n", "\n", "# rename columns:\n", "for i,x in enumerate( dr ):\n", @@ -219,8 +247,14 @@ "source": [ "adata = sc.read( FILE, validate=False)\n", "\n", + "adata.obs.drop( ['Embedding','Embeddings_X','Embeddings_Y'], axis=1, inplace=True )\n", "# drop the embeddings and extra attributes from the obs object\n", - "adata.obs.drop( ['Embedding','Embeddings_X','Embeddings_Y','RegulonsAUC'], axis=1, inplace=True )" + "if \"Regulons\" in adata.obs.keys():\n", + " adata.obs.drop( ['RegulonsAUC'], axis=1, inplace=True )\n", + "elif \"MotifRegulons\" in adata.obs.keys() and \"TrackRegulons\" in adata.obs.keys():\n", + " adata.obs.drop( ['MotifRegulonsAUC', 'TrackRegulonsAUC'], axis=1, inplace=True )\n", + "else:\n", + " exit(1)" ] }, { @@ -503,7 +537,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/src/scenic/bin/save_SCENIC_multi_runs_to_loom.py b/src/scenic/bin/save_multi_runs_to_loom.py similarity index 56% rename from src/scenic/bin/save_SCENIC_multi_runs_to_loom.py rename to src/scenic/bin/save_multi_runs_to_loom.py index e026fb66..96852fa5 100755 --- a/src/scenic/bin/save_SCENIC_multi_runs_to_loom.py +++ b/src/scenic/bin/save_multi_runs_to_loom.py @@ -5,19 +5,10 @@ import sys import pandas as pd import pickle -from pyscenic import transform -from pyscenic.export import export2loom -from pyscenic.transform import COLUMN_NAME_NES -from pyscenic.utils import COLUMN_NAME_MOTIF_SIMILARITY_QVALUE, COLUMN_NAME_ORTHOLOGOUS_IDENTITY, \ - COLUMN_NAME_ANNOTATION +import gzip import time import utils - -################################################################################ -# TODO: -# This implementation should be optimized: -# It's taking several hours to run (~5h for ~9k genes and ~13k cells) -################################################################################ +import export_to_loom parser_grn = argparse.ArgumentParser(description='Run AUCell on gene signatures saved as TSV in folder.') @@ -28,13 +19,9 @@ ' Two file formats are supported: csv (rows=cells x columns=genes) or loom (rows=genes x columns=cells).' ) parser_grn.add_argument( - 'motif_enrichment_table_fname', + 'aggregated_regulons_fname', type=argparse.FileType('r'), - help='The name of the file that contains the motif enrichments.' -) -parser_grn.add_argument( - 'signatures_fname', - help='The name of the folder containing the signatures as TSV files.' + help='The name of the file (.pkl.gz aka compressed pickle format) that contains the regulons resulting from the aggregated motif enrichment table.' ) parser_grn.add_argument( 'auc_mtx_fname', @@ -55,11 +42,11 @@ help='The threshold used for filtering the regulons based on the number of targets (default: {}).'.format(5) ) parser_grn.add_argument( - '--gene-occurence-threshold', + '--min-regulon-gene-occurrence', type=int, default=5, - dest="gene_occurence_threshold", - help='The threshold used for filtering the genes bases on their occurence (default: {}).'.format(5) + dest="min_regulon_gene_occurrence", + help='The threshold used for filtering the genes bases on their occurrence (default: {}).'.format(5) ) parser_grn.add_argument( '--cell-id-attribute', @@ -117,48 +104,10 @@ ) print(f"... took {time.time() - start} seconds", flush=True) -# Transform motif enrichment table (generated from the cisTarget step) to regulons -print(f"Reading aggregated motif enrichment table...", flush=True) +print(f"Reading the aggregated regulons...", flush=True) start = time.time() -f = args.motif_enrichment_table_fname.name -if f.endswith('.pickle'): - with open(f, 'rb') as handle: - motif_enrichment_table = pickle.load(handle) -elif f.endswith('.csv'): - motif_enrichment_table = utils.read_feature_enrichment_table(fname=args.motif_enrichment_table_fname.name, sep=",") -else: - raise Exception("The aggregated feature enrichment table is in the wrong format. Expecting .pickle or .csv formats.") -print(f"... took {time.time() - start} seconds to run.", flush=True) - -print(f"Making the regulons...", flush=True) -start = time.time() -regulons = transform.df2regulons( - df=motif_enrichment_table, - save_columns=[ - COLUMN_NAME_NES, - COLUMN_NAME_ORTHOLOGOUS_IDENTITY, - COLUMN_NAME_MOTIF_SIMILARITY_QVALUE, - COLUMN_NAME_ANNOTATION - ] -) - -# Read the signatures (regulons extracted from AUCell looms generated by AUCell step) -signatures = utils.read_signatures_from_tsv_dir( - dpath=args.signatures_fname, - noweights=False, - weight_threshold=args.gene_occurence_threshold, - min_genes=args.min_genes_regulon -) - -# Filter regulons (regulons from motifs enrichment table) by the filtered signatures -regulons = list(filter(lambda x: x.name in list(map(lambda x: x.name, signatures)), regulons)) -# Add gene2occurrence from filtered signatures to regulons -regulons = list( - map( - lambda x: - x.copy(gene2occurrence=list(filter(lambda y: y.name == x.name, signatures))[0].gene2weight), regulons - ) -) +with gzip.open(args.aggregated_regulons_fname.name, 'rb') as file_handler: + regulons = pickle.load(file_handler) print(f"... took {time.time() - start} seconds to run.", flush=True) print(f"Reading AUCell matrix...", flush=True) @@ -171,15 +120,32 @@ # Create loom print(f"Exporting to loom...", flush=True) start = time.time() -export2loom( +# Create the basic loom +scope_loom = export_to_loom.SCopeLoom( ex_mtx=ex_matrix_df, regulons=regulons, - out_fname=args.output.name, title=args.title, nomenclature=args.nomenclature, auc_mtx=auc_mtx, - tree_structure=[args.scope_tree_level_1, args.scope_tree_level_2, args.scope_tree_level_3], - compress=True -) + tree_structure=[ + args.scope_tree_level_1, + args.scope_tree_level_2, + args.scope_tree_level_3 + ], + compress=True, + save_additional_regulon_meta_data=True +) +scope_loom.set_generic_loom() +# Add additional stuff specific to multi-runs SCENIC +scope_loom.add_row_attr_regulon_gene_weights() +scope_loom.add_row_attr_regulon_gene_occurrences() +scope_loom.add_meta_data(_dict={ + "regulonSettings": { + "min_genes_regulon": args.min_genes_regulon, + "min_regulon_gene_occurrence": args.min_regulon_gene_occurrence + } +}) +scope_loom.export(out_fname=args.output.name, save_embeddings=False) + print(f"... took {time.time() - start} seconds to run.", flush=True) print(f"Done.", flush=True) diff --git a/src/scenic/bin/utils.py b/src/scenic/bin/utils.py index 2f8564c3..76aa964e 100755 --- a/src/scenic/bin/utils.py +++ b/src/scenic/bin/utils.py @@ -3,7 +3,7 @@ import os import warnings from typing import List - +import gzip import loompy as lp import pandas as pd from pyscenic.genesig import GeneSignature @@ -22,26 +22,25 @@ def read_signatures_from_tsv_dir(dpath: str, noweights=False, weight_threshold=0 def signatures(): for gene_sig_file_path in gene_sig_file_paths: + gene_sig = pd.read_csv(gene_sig_file_path, sep='\t', header=None, index_col=None) fname = ntpath.basename(gene_sig_file_path) regulon = os.path.splitext(fname)[0] - gene_sig = pd.read_csv(gene_sig_file_path, sep='\t', header=None, index_col=None) + + # Check if the file is the regulon frequency file + if regulon == 'regulons': + continue + # Do some sanity checks if len(gene_sig.columns) == 0: - assert ( - os.path.exists(gene_sig_file_path), - "{} has 0 columns. Requires TSV with 1 or 2 columns. First column should be genes (required)," - " second (optional) are weight for the given genes.".format(gene_sig_file_path) - ) + raise Exception(f"{gene_sig_file_path} has 0 columns. Requires TSV with 1 or 2 columns. First column should be genes (required), second (optional) are weight for the given genes.") if len(gene_sig.columns) > 2: - assert ( - os.path.exists(gene_sig_file_path), - "{} has more than 2 columns. Requires TSV with 1 or 2 columns. First column should be genes," - " second (optional) are weight for the given genes.".format(gene_sig_file_path) - ) + raise Exception(f"{gene_sig_file_path} has more than 2 columns. Requires TSV with 1 or 2 columns. First column should be genes, second (optional) are weight for the given genes.") if len(gene_sig.columns) == 1 or noweights: gene2weight = gene_sig[0] if len(gene_sig.columns) == 2 and not noweights: # Filter the genes based on the given weight_threshold + # 1st column: genes + # 2nd column: weights gene_sig = gene_sig[gene_sig[1] > weight_threshold] if len(gene_sig.index) == 0: if show_warnings: @@ -57,16 +56,36 @@ def signatures(): signatures = list(signatures()) # Filter regulons with less than min_genes (>= min_genes) signatures = list(filter(lambda x: len(x.gene2weight) >= min_genes, signatures)) - print("Signatures passed filtering {0} out of {1}".format(len(signatures), len(gene_sig_file_paths))) + # Subtract 1 because regulons.tsv should not be counted + print("Signatures passed filtering {0} out of {1}".format(len(signatures), len(gene_sig_file_paths) - 1)) return signatures -def read_feature_enrichment_table(fname, sep): - # ext = os.path.splitext(fname,)[1] - df = pd.read_csv(fname, sep=sep, index_col=[0, 1], header=[0, 1], skipinitialspace=True) - df[('Enrichment', COLUMN_NAME_CONTEXT)] = df[('Enrichment', COLUMN_NAME_CONTEXT)].apply(lambda s: eval(s)) - df[('Enrichment', COLUMN_NAME_TARGET_GENES)] = df[('Enrichment', COLUMN_NAME_TARGET_GENES)].apply(lambda s: eval(s)) - return df +def read_feature_enrichment_table(fname, sep, chunksize=None, raw=False): + converters = None + + if not raw and chunksize is None: + with gzip.open(fname, 'r') as fh: + fh.readline() + header_line2 = str(fh.readline()).rstrip('\n').split(',') + # Get the index column of COLUMN_NAME_CONTEXT and COLUMN_NAME_TARGET_GENES + column_name_context_idx = header_line2.index(COLUMN_NAME_CONTEXT) + column_name_target_genes_idx = header_line2.index(COLUMN_NAME_TARGET_GENES) + # Use converters to evaluate the expression a the given columns + converters = { + column_name_context_idx: eval, + column_name_target_genes_idx: eval, + } + return pd.read_csv( + fname, + sep=sep, + index_col=[0, 1], + header=[0, 1], + skipinitialspace=True, + chunksize=chunksize, + converters=converters, + engine='c' + ) def get_matrix(loom_file_path, gene_attribute, cell_id_attribute): diff --git a/src/scenic/conf/multi_runs.config b/src/scenic/conf/multi_runs.config index a231e5b7..7c705b36 100644 --- a/src/scenic/conf/multi_runs.config +++ b/src/scenic/conf/multi_runs.config @@ -1,14 +1,26 @@ params { sc { scenic { - container = "dweemx/sctx-pyscenic:0.9.18-2" + container = "docker://dweemx/sctx-pyscenic:0.9.19-4" numRuns = 2 // AUCell parameters aucell { // only used when running in multi-runs mode // percentile_threshold = 0.01 // will override auc_threshold if uncommented min_genes_regulon = 5 - gene_occurence_threshold = 5 + min_regulon_gene_occurrence = 5 + } + aggregate_features { + use_chunking = true + output_format = 'csv' + compression = 'gzip' + } + motifs_to_regulons { + // Best: (max RAM memory (in Mb) of the node - 4*1024) / number_of_cores + pmem = '6gb' + } + save_to_loom { + pmem = '2gb' } } } diff --git a/src/scenic/conf/test.config b/src/scenic/conf/test.config index e416099f..b91f16d6 100644 --- a/src/scenic/conf/test.config +++ b/src/scenic/conf/test.config @@ -17,13 +17,14 @@ params { sc { scenic { - container = "/ddn1/vol1/staging/leuven/stg_00002/lcb/lcb_projects/Pipeline_Dev/aertslab-pyscenic-0.9.18.sif" + container = "docker://dweemx/sctx-pyscenic:0.9.19-4" scenicoutdir = "/ddn1/vol1/staging/leuven/stg_00002/lcb/dwmax/documents/aertslab/GitHub/SingleCellTxBenchmark/src/scenic/out" filteredloom = "data/expr_mat.loom" // for testing - scenicoutputloom = "SCENIC_output.loom" + scenicOutputLoom = "SCENIC_output.loom" + scenicScopeOutputLoom = "SCENIC_SCope_output.loom" // Computation arguments: - numRuns = 2 numWorkers = 2 + maxForks = 1 mode = "dask_multiprocessing" client_or_address = "" // Loom file arguments: @@ -33,6 +34,8 @@ params { grn { TFs = "data/allTFs_hg38.txt" seed = "" + // Resources settings + pmem = '2gb' } // cisTarget parameters cistarget { @@ -44,9 +47,9 @@ params { mtfANN = "data/motifs.tbl" // for testing //mtfANN = "/staging/leuven/res_00001/databases/cistarget/motif2tf/motifs-v9-nr.hgnc-m0.001-o0.0.tbl" // track feather format databases - trkDB = "/staging/leuven/res_00001/databases/cistarget/databases/homo_sapiens/hg38/refseq_r80/tc_v1/gene_based/encode_20190621__ChIP_seq_transcription_factor.hg38__refseq-r80__*feather" + //trkDB = "/staging/leuven/res_00001/databases/cistarget/databases/homo_sapiens/hg38/refseq_r80/tc_v1/gene_based/encode_20190621__ChIP_seq_transcription_factor.hg38__refseq-r80__*feather" // track annotations - trkANN = "/staging/leuven/stg_00002/lcb/icistarget/data/annotations/homo_sapiens/hg38/track_annotations/encode_project_20190621__ChIP-seq_transcription_factor.homo_sapiens.hg38.bigwig_signal_pvalue.track_to_tf_in_motif_to_tf_format.tsv" + //trkANN = "/staging/leuven/stg_00002/lcb/icistarget/data/annotations/homo_sapiens/hg38/track_annotations/encode_project_20190621__ChIP-seq_transcription_factor.homo_sapiens.hg38.bigwig_signal_pvalue.track_to_tf_in_motif_to_tf_format.tsv" type = "" output = "reg.csv" @@ -71,6 +74,8 @@ params { top_n_regulators = "5,10,50" min_genes = "20" // expression_mtx_fname = "" // uses params.scenic.filteredloom + // Resources settings + pmem = '2gb' } // AUCell parameters @@ -81,9 +86,8 @@ params { auc_threshold = "0.05" // percentile_threshold = 0.01 nes_threshold = "3.0" - // only used when running in multi-runs mode - min_genes_regulon = 5 - gene_occurence_threshold = 0 + // Resources settings + pmem = '2gb' } } @@ -96,3 +100,9 @@ params { } } } + +singularity { + enabled = true + autoMounts = true + runOptions = '-B /ddn1/vol1/staging/leuven/stg_00002/:/ddn1/vol1/staging/leuven/stg_00002/ -B /staging/leuven/stg_00002/:/staging/leuven/stg_00002/ -B /staging/leuven/res_00001/:/staging/leuven/res_00001/ -B /ddn1/vol1/staging/leuven/res_00001/:/ddn1/vol1/staging/leuven/res_00001/' +} diff --git a/src/scenic/conf/test_multi_runs.config b/src/scenic/conf/test_multi_runs.config new file mode 100644 index 00000000..583c7212 --- /dev/null +++ b/src/scenic/conf/test_multi_runs.config @@ -0,0 +1,27 @@ +params { + sc { + scenic { + container = "docker://dweemx/sctx-pyscenic:0.9.19-4" + numRuns = 2 + // AUCell parameters + aucell { + // only used when running in multi-runs mode + // percentile_threshold = 0.01 // will override auc_threshold if uncommented + min_genes_regulon = 1 + min_regulon_gene_occurrence = 1 + } + aggregate_features { + use_chunking = true + output_format = 'csv' + compression = 'gzip' + } + motifs_to_regulons { + // Best: (max RAM memory (in Mb) of the node - 4*1024) / number_of_cores + pmem = '6gb' + } + save_to_loom { + pmem = '2gb' + } + } + } +} diff --git a/src/scenic/main.nf b/src/scenic/main.nf index 151f6f54..27440c5e 100644 --- a/src/scenic/main.nf +++ b/src/scenic/main.nf @@ -10,33 +10,21 @@ * Steps considered: */ -import static groovy.json.JsonOutput.* - nextflow.preview.dsl=2 -// print all parameters: -// println(prettyPrint(toJson( params ))) -// println(prettyPrint(toJson( "$workflow" ))) - ////////////////////////////////////////////////////// // Define the parameters for current testing proces -include SC__SCENIC__GRNBOOST2_WITHOUT_DASK from './processes/grnboost2withoutDask' params(params) -include SC__SCENIC__CISTARGET as SC__SCENIC__CISTARGET__MOTIF from './processes/cistarget' params(params) -include SC__SCENIC__CISTARGET as SC__SCENIC__CISTARGET__TRACK from './processes/cistarget' params(params) -include SC__SCENIC__AUCELL as SC__SCENIC__AUCELL__MOTIF from './processes/aucell' params(params) -include SC__SCENIC__AUCELL as SC__SCENIC__AUCELL__TRACK from './processes/aucell' params(params) -include SC__SCENIC__AGGR_MULTI_RUNS_FEATURES as SC__SCENIC__AGGR_MULTI_RUNS_FEATURES__MOTIF from './processes/aggregateMultiRunsFeatures' params(params) -include SC__SCENIC__AGGR_MULTI_RUNS_FEATURES as SC__SCENIC__AGGR_MULTI_RUNS_FEATURES__TRACK from './processes/aggregateMultiRunsFeatures' params(params) -include SC__SCENIC__AGGR_MULTI_RUNS_REGULONS as SC__SCENIC__AGGR_MULTI_RUNS_REGULONS__MOTIF from './processes/aggregateMultiRunsRegulons' params(params) -include SC__SCENIC__AGGR_MULTI_RUNS_REGULONS as SC__SCENIC__AGGR_MULTI_RUNS_REGULONS__TRACK from './processes/aggregateMultiRunsRegulons' params(params) -include SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER as SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER__MOTIF from './processes/aucellGeneSigsFromFolder' params(params) -include SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER as SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER__TRACK from './processes/aucellGeneSigsFromFolder' params(params) -include SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM as SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM_MOTIF from './processes/saveScenicMultiRunsToLoom' params(params) -include SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM as SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM_TRACK from './processes/saveScenicMultiRunsToLoom' params(params) -include SC__SCENIC__PUBLISH_LOOM from './processes/scenicLoomHandler' params(params) -include SC__SCENIC__MERGE_MOTIF_TRACK_LOOMS from './processes/scenicLoomHandler' params(params) -include SC__SCENIC__APPEND_SCENIC_LOOM from './processes/scenicLoomHandler' params(params) -include SC__SCENIC__VISUALIZE from './processes/scenicLoomHandler' params(params) +include GRNBOOST2_WITHOUT_DASK from './processes/grnboost2withoutDask' params(params) +include CISTARGET as CISTARGET__MOTIF from './processes/cistarget' params(params) +include CISTARGET as CISTARGET__TRACK from './processes/cistarget' params(params) +include AUCELL as AUCELL__MOTIF from './processes/aucell' params(params) +include AUCELL as AUCELL__TRACK from './processes/aucell' params(params) +include AGGREGATE_MULTI_RUNS_TO_LOOM as MULTI_RUNS_TO_LOOM__MOTIF from './workflows/aggregateMultiRuns' params(params) +include AGGREGATE_MULTI_RUNS_TO_LOOM as MULTI_RUNS_TO_LOOM__TRACK from './workflows/aggregateMultiRuns' params(params) +include PUBLISH_LOOM from './processes/loomHandler' params(params) +include MERGE_MOTIF_TRACK_LOOMS from './processes/loomHandler' params(params) +include APPEND_SCENIC_LOOM from './processes/loomHandler' params(params) +include VISUALIZE from './processes/loomHandler' params(params) // reporting: include './processes/reports.nf' params(params + params.global) @@ -61,7 +49,7 @@ workflow SCENIC { main: /* GRN */ tfs = file(params.sc.scenic.grn.TFs) - grn = SC__SCENIC__GRNBOOST2_WITHOUT_DASK( runs, filteredloom, tfs ) + grn = GRNBOOST2_WITHOUT_DASK( runs, filteredloom, tfs ) /* cisTarget motif analysis */ // channel for SCENIC databases resources: @@ -69,7 +57,7 @@ workflow SCENIC { .fromPath( params.sc.scenic.cistarget.mtfDB ) .collect() // use all files together in the ctx command motifANN = file(params.sc.scenic.cistarget.mtfANN) - ctx_mtf = SC__SCENIC__CISTARGET__MOTIF( runs, filteredloom, grn, motifDB, motifANN, 'mtf' ) + ctx_mtf = CISTARGET__MOTIF( grn, filteredloom, motifDB, motifANN, 'mtf' ) /* cisTarget track analysis */ if(params.sc.scenic.cistarget.trkDB) { @@ -77,105 +65,55 @@ workflow SCENIC { .fromPath( params.sc.scenic.cistarget.trkDB ) .collect() // use all files together in the ctx command trackANN = file(params.sc.scenic.cistarget.trkANN) - ctx_trk = SC__SCENIC__CISTARGET__TRACK( runs, filteredloom, grn, trackDB, trackANN, 'trk' ) + ctx_trk = CISTARGET__TRACK( grn, filteredloom, trackDB, trackANN, 'trk' ) } /* AUCell, motif regulons */ - auc_mtf = SC__SCENIC__AUCELL__MOTIF( runs, filteredloom, ctx_mtf, 'mtf' ) + auc_mtf = AUCELL__MOTIF( ctx_mtf, filteredloom, 'mtf' ) if(params.sc.scenic.cistarget.trkDB) { /* AUCell, track regulons */ - auc_trk = SC__SCENIC__AUCELL__TRACK( runs, filteredloom, ctx_trk, 'trk' ) + auc_trk = AUCELL__TRACK( ctx_trk, filteredloom, 'trk' ) } // multi-runs aggregation: if(params.sc.scenic.containsKey("numRuns") && params.sc.scenic.numRuns > 1) { if(params.sc.scenic.numRuns > 2 && params.global.qsubaccount.length() == 0) throw new Exception("Consider to run SCENIC in multi-runs mode as jobs. Specify the qsubaccount parameter accordingly.") - // Aggregate features (motifs and tracks) - /* Aggregate motifs from multiple runs */ - aggr_features_mtf = SC__SCENIC__AGGR_MULTI_RUNS_FEATURES__MOTIF( - ctx_mtf.collect(), - 'mtf' - ) - if(params.sc.scenic.cistarget.trkDB) { - /* Aggregate tracks from multiple runs */ - aggr_features_trk = SC__SCENIC__AGGR_MULTI_RUNS_FEATURES__TRACK( - ctx_trk.collect(), - 'trk' - ) - } - - // Aggregate regulons (motifs and tracks) - /* Aggregate motif regulons from multiple runs */ - regulons_folder_mtf = SC__SCENIC__AGGR_MULTI_RUNS_REGULONS__MOTIF( - auc_mtf.collect(), - 'mtf' - ) - if(params.sc.scenic.cistarget.trkDB) { - /* Aggregate track regulons from multiple runs */ - regulons_folder_trk = SC__SCENIC__AGGR_MULTI_RUNS_REGULONS__TRACK( - auc_trk.collect(), - 'trk' - ) - } - - // Run AUCell on aggregated regulons - /* Aggregate motif regulons from multiple runs */ - regulons_auc_mtf = SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER__MOTIF( + + scenic_loom_mtf = MULTI_RUNS_TO_LOOM__MOTIF( filteredloom, - regulons_folder_mtf, + ctx_mtf, + auc_mtf, 'mtf' ) if(params.sc.scenic.cistarget.trkDB) { - /* Aggregate track regulons from multiple runs */ - regulons_auc_trk = SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER__TRACK( + scenic_loom_trk = MULTI_RUNS_TO_LOOM__TRACK( filteredloom, - regulons_folder_trk, + ctx_trk, + auc_trk, 'trk' ) - } - - // Save to loom - /* Save multiple motif SCENIC runs to loom*/ - scenic_loom_mtf = SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM_MOTIF( - filteredloom, - aggr_features_mtf, - regulons_folder_mtf, - regulons_auc_mtf, - 'mtf' - ) - if(params.sc.scenic.cistarget.trkDB) { - /* Save multiple track SCENIC runs to loom*/ - scenic_loom_trk = SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM_TRACK( - filteredloom, - aggr_features_trk, - regulons_folder_trk, - regulons_auc_trk, - 'trk' - ) - SC__SCENIC__MERGE_MOTIF_TRACK_LOOMS( + MERGE_MOTIF_TRACK_LOOMS( scenic_loom_mtf, scenic_loom_trk ) - } - if(params.sc.scenic.cistarget.trkDB) { - out = SC__SCENIC__VISUALIZE(SC__SCENIC__MERGE_MOTIF_TRACK_LOOMS.out) + out = VISUALIZE(MERGE_MOTIF_TRACK_LOOMS.out) } else { - out = SC__SCENIC__VISUALIZE(scenic_loom_mtf) + out = VISUALIZE(scenic_loom_mtf) } } else { if(params.sc.scenic.cistarget.trkDB) { - out = SC__SCENIC__VISUALIZE( - SC__SCENIC__MERGE_MOTIF_TRACK_LOOMS( + out = VISUALIZE( + MERGE_MOTIF_TRACK_LOOMS( auc_mtf, auc_trk )) } else { - out = SC__SCENIC__VISUALIZE(auc_mtf) + out = VISUALIZE(auc_mtf) } } - SC__SCENIC__PUBLISH_LOOM(out) + PUBLISH_LOOM(out) emit: out } @@ -187,15 +125,15 @@ workflow SCENIC_append { scopeloom main: scenicloom = SCENIC( filteredloom ) - SC__SCENIC__APPEND_SCENIC_LOOM( scopeloom, scenicloom ) - report_notebook = SC__SCENIC__GENERATE_REPORT( + APPEND_SCENIC_LOOM( scopeloom, scenicloom ) + report_notebook = GENERATE_REPORT( file(workflow.projectDir + params.sc.scenic.report_ipynb), - SC__SCENIC__APPEND_SCENIC_LOOM.out, + APPEND_SCENIC_LOOM.out, "SCENIC_report" ) - SC__SCENIC__REPORT_TO_HTML(report_notebook) + REPORT_TO_HTML(report_notebook) emit: - SC__SCENIC__APPEND_SCENIC_LOOM.out + APPEND_SCENIC_LOOM.out } diff --git a/src/scenic/main.test.nf b/src/scenic/main.test.nf index 008a139f..05a3a2e0 100644 --- a/src/scenic/main.test.nf +++ b/src/scenic/main.test.nf @@ -2,40 +2,40 @@ // Tests // -// Test 1: SC__SCENIC__GRNBOOST2WITHOUTDASK (from processes/) +// Test 1: GRNBOOST2WITHOUTDASK (from processes/) // Time: ~2min // Command: -// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf -profile singularity --test SC__SCENIC__GRNBOOST2WITHOUTDASK +// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf -profile singularity --test GRNBOOST2WITHOUTDASK -// Test 2: SC__SCENIC__CISTARGET (from processes/) +// Test 2: CISTARGET (from processes/) // Time: ~10min // Command: -// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf -profile singularity --test SC__SCENIC__CISTARGET +// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf -profile singularity --test CISTARGET -// Test 3: SC__SCENIC__AUCELL (from processes/) +// Test 3: AUCELL (from processes/) // Time: ~1min // Command: -// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf -profile singularity --test SC__SCENIC__AUCELL +// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf -profile singularity --test AUCELL -// Test 4: SC__SCENIC__AGGR_MULTI_RUNS_FEATURES (from processes/) +// Test 4: AGGR_MULTI_RUNS_FEATURES (from processes/) // Time: ~1min // Command: -// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf -profile singularity --test SC__SCENIC__AGGR_MULTI_RUNS_FEATURES +// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf -profile singularity --test AGGR_MULTI_RUNS_FEATURES -// Test 5: SC__SCENIC__AGGR_MULTI_RUNS_REGULONS (from processes/) +// Test 5: AGGR_MULTI_RUNS_REGULONS (from processes/) // Time: <1min // Command: -// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf -profile singularity --test SC__SCENIC__AGGR_MULTI_RUNS_REGULONS +// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf -profile singularity --test AGGR_MULTI_RUNS_REGULONS -// Test 6: SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER (from processes/) +// Test 6: AUCELL_FROM_FOLDER (from processes/) // Time: ~?min // Command: -// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf -profile singularity --test SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER +// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf -profile singularity --test AUCELL_FROM_FOLDER -// Test 7: SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM (from processes/) +// Test 7: SAVE_SCENIC_MULTI_RUNS_TO_LOOM (from processes/) // Time: ~?min // Command: -// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf --test SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM +// nextflow -C conf/multi_runs.config,conf/test.config run main.test.nf --test SAVE_SCENIC_MULTI_RUNS_TO_LOOM nextflow.preview.dsl=2 @@ -43,36 +43,39 @@ nextflow.preview.dsl=2 /////////////////////////////////////////// // Define the parameters for all processes -include SC__SCENIC__GRNBOOST2WITHOUTDASK from './processes/grnboost2withoutDask' params(params) -include SC__SCENIC__CISTARGET as SC__SCENIC__CISTARGET__MOTIF from './processes/cistarget' params(params) -include SC__SCENIC__CISTARGET as SC__SCENIC__CISTARGET__TRACK from './processes/cistarget' params(params) -include SC__SCENIC__AUCELL as SC__SCENIC__AUCELL__MOTIF from './processes/aucell' params(params) -include SC__SCENIC__AUCELL as SC__SCENIC__AUCELL__TRACK from './processes/aucell' params(params) -include SC__SCENIC__AGGR_MULTI_RUNS_FEATURES as SC__SCENIC__AGGR_MULTI_RUNS_FEATURES__MOTIF from './processes/aggregateMultiRunsFeatures' params(params) -include SC__SCENIC__AGGR_MULTI_RUNS_FEATURES as SC__SCENIC__AGGR_MULTI_RUNS_FEATURES__TRACK from './processes/aggregateMultiRunsFeatures' params(params) -include SC__SCENIC__AGGR_MULTI_RUNS_REGULONS as SC__SCENIC__AGGR_MULTI_RUNS_REGULONS__MOTIF from './processes/aggregateMultiRunsRegulons' params(params) -include SC__SCENIC__AGGR_MULTI_RUNS_REGULONS as SC__SCENIC__AGGR_MULTI_RUNS_REGULONS__TRACK from './processes/aggregateMultiRunsRegulons' params(params) -include SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER as SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER__MOTIF from './processes/aucellGeneSigsFromFolder' params(params) -include SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER as SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER__TRACK from './processes/aucellGeneSigsFromFolder' params(params) -include SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM as SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM_MOTIF from './processes/saveScenicMultiRunsToLoom' params(params) -include SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM as SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM_TRACK from './processes/saveScenicMultiRunsToLoom' params(params) +include GRNBOOST2_WITHOUT_DASK from './processes/grnboost2withoutDask' params(params) +include CISTARGET as CISTARGET__MOTIF from './processes/cistarget' params(params) +include CISTARGET as CISTARGET__TRACK from './processes/cistarget' params(params) +include AUCELL as AUCELL__MOTIF from './processes/aucell' params(params) +include AUCELL as AUCELL__TRACK from './processes/aucell' params(params) +include AGGR_MULTI_RUNS_FEATURES as AGGR_MULTI_RUNS_FEATURES__MOTIF from './processes/multiruns/aggregateFeatures' params(params) +include AGGR_MULTI_RUNS_FEATURES as AGGR_MULTI_RUNS_FEATURES__TRACK from './processes/multiruns/aggregateFeatures' params(params) +include AGGR_MULTI_RUNS_REGULONS as AGGR_MULTI_RUNS_REGULONS__MOTIF from './processes/multiruns/aggregateMultiRunsRegulons' params(params) +include AGGR_MULTI_RUNS_REGULONS as AGGR_MULTI_RUNS_REGULONS__TRACK from './processes/multiruns/aggregateMultiRunsRegulons' params(params) +include AUCELL_FROM_FOLDER as AUCELL_FROM_FOLDER__MOTIF from './processes/aucellFromFolder' params(params) +include AUCELL_FROM_FOLDER as AUCELL_FROM_FOLDER__TRACK from './processes/aucellFromFolder' params(params) +include SAVE_SCENIC_MULTI_RUNS_TO_LOOM as SAVE_SCENIC_MULTI_RUNS_TO_LOOM_MOTIF from './processes/saveScenicMultiRunsToLoom' params(params) +include SAVE_SCENIC_MULTI_RUNS_TO_LOOM as SAVE_SCENIC_MULTI_RUNS_TO_LOOM_TRACK from './processes/saveScenicMultiRunsToLoom' params(params) +include MERGE_MOTIF_TRACK_LOOMS from './processes/loomHandler' params(params) +include PUBLISH_LOOM from './processes/loomHandler' params(params) +include VISUALIZE from './processes/loomHandler' params(params) // Create channel for the different runs runs = Channel.from( 1..params.sc.scenic.numRuns ) // Make the test workflow -workflow test_SC__SCENIC__GRNBOOST2WITHOUTDASK { +workflow test_GRNBOOST2WITHOUTDASK { get: loom main: tfs = file(params.sc.scenic.grn.TFs) - SC__SCENIC__GRNBOOST2WITHOUTDASK( runs, loom, tfs ) + GRNBOOST2WITHOUTDASK( runs, loom, tfs ) emit: - SC__SCENIC__GRNBOOST2WITHOUTDASK.out + GRNBOOST2WITHOUTDASK.out } // Make the test workflow -workflow test_SC__SCENIC__CISTARGET { +workflow test_CISTARGET { get: filteredloom grn @@ -82,7 +85,7 @@ workflow test_SC__SCENIC__CISTARGET { .fromPath( params.sc.scenic.cistarget.mtfDB ) .collect() // use all files together in the ctx command motifANN = file(params.sc.scenic.cistarget.mtfANN) - ctx_mtf = SC__SCENIC__CISTARGET__MOTIF( runs, filteredloom, grn, motifDB, motifANN, 'mtf' ) + ctx_mtf = CISTARGET__MOTIF( runs, filteredloom, grn, motifDB, motifANN, 'mtf' ) /* cisTarget track analysis @@ -91,57 +94,62 @@ workflow test_SC__SCENIC__CISTARGET { .fromPath( params.sc.scenic.cistarget.trkDB ) .collect() // use all files together in the ctx command trackANN = file(params.sc.scenic.cistarget.trkANN) - ctx_trk = SC__SCENIC__CISTARGET__TRACK( runs, filteredloom, grn, trackDB, trackANN, 'trk' ) + ctx_trk = CISTARGET__TRACK( runs, filteredloom, grn, trackDB, trackANN, 'trk' ) emit: ctx_mtf ctx_trk } // Make the test workflow -workflow test_SC__SCENIC__AUCELL { +workflow test_AUCELL { get: filteredloom ctx_mtf ctx_trk main: /* AUCell, motif regulons */ - auc_mtf = SC__SCENIC__AUCELL__MOTIF( runs, filteredloom, ctx_mtf, 'mtf' ) + auc_mtf = AUCELL__MOTIF( runs, filteredloom, ctx_mtf, 'mtf' ) /* AUCell, track regulons */ - auc_trk = SC__SCENIC__AUCELL__TRACK( runs, filteredloom, ctx_trk, 'trk' ) + auc_trk = AUCELL__TRACK( runs, filteredloom, ctx_trk, 'trk' ) emit: auc_mtf auc_trk } // Make the test workflow -workflow test_SC__SCENIC__AGGR_MULTI_RUNS_REGULONS { +workflow test_SINGLE_RUN_BY_ID { get: - auc_mtf_looms - auc_trk_looms + runId main: - /* Aggregate motif regulons from multiple runs */ - aggr_regulons_mtf = SC__SCENIC__AGGR_MULTI_RUNS_REGULONS__MOTIF( auc_mtf_looms, 'mtf' ) - - /* Aggregate track regulons from multiple runs */ - aggr_regulons_trk = SC__SCENIC__AGGR_MULTI_RUNS_REGULONS__TRACK( auc_trk_looms, 'trk' ) + filteredloom = file( params.sc.scenic.filteredloom ) + tfs = file(params.sc.scenic.grn.TFs) + run = Channel.from( runId..runId ) + grn = GRNBOOST2WITHOUTDASK( run, filteredloom, tfs ) + // channel for SCENIC databases resources: + motifDB = Channel + .fromPath( params.sc.scenic.cistarget.mtfDB ) + .collect() // use all files together in the ctx command + motifANN = file(params.sc.scenic.cistarget.mtfANN) + ctx_mtf = CISTARGET__MOTIF( run, filteredloom, grn, motifDB, motifANN, 'mtf' ) + /* AUCell, motif regulons */ + auc_mtf = AUCELL__MOTIF( run, filteredloom, ctx_mtf, 'mtf' ) emit: - aggr_regulons_mtf - aggr_regulons_trk + auc_mtf } // Make the test workflow -workflow test_SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER { +workflow test_AUCELL_FROM_FOLDER { get: filteredloom regulons_folder_mtf regulons_folder_trk main: /* Aggregate motif regulons from multiple runs */ - regulons_auc_mtf = SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER__MOTIF( filteredloom, regulons_folder_mtf, 'mtf' ) + regulons_auc_mtf = AUCELL_FROM_FOLDER__MOTIF( filteredloom, regulons_folder_mtf, 'mtf' ) /* Aggregate track regulons from multiple runs */ - regulons_auc_trk = SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER__TRACK( filteredloom, regulons_folder_trk, 'trk' ) + regulons_auc_trk = AUCELL_FROM_FOLDER__TRACK( filteredloom, regulons_folder_trk, 'trk' ) emit: regulons_auc_mtf regulons_auc_trk @@ -150,65 +158,92 @@ workflow test_SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER { workflow { main: switch(params.test) { - case "SC__SCENIC__GRNBOOST2WITHOUTDASK": - test_SC__SCENIC__GRNBOOST2WITHOUTDASK( file( params.sc.scenic.filteredloom ) ) + case "SC__SCENIC_SINGLE_RUN_BY_ID": + test_SINGLE_RUN_BY_ID( params.runId ) break; - case "SC__SCENIC__CISTARGET": + case "GRNBOOST2WITHOUTDASK": + test_GRNBOOST2WITHOUTDASK( file( params.sc.scenic.filteredloom ) ) + break; + case "CISTARGET": grn = Channel.fromPath(params.sc.scenic.scenicoutdir + "/grnboost2withoutDask/run_*/run_*__adj.tsv") - test_SC__SCENIC__CISTARGET( file( params.sc.scenic.filteredloom ), grn ) + test_CISTARGET( file( params.sc.scenic.filteredloom ), grn ) break; - case "SC__SCENIC__AUCELL": + case "AUCELL": ctx_mtf = Channel.fromPath(params.sc.scenic.scenicoutdir + "/cistarget/run_*/run_*__reg_mtf.csv") ctx_trk = Channel.fromPath(params.sc.scenic.scenicoutdir + "/cistarget/run_*/run_*__reg_trk.csv") - test_SC__SCENIC__AUCELL( file( params.sc.scenic.filteredloom ), ctx_mtf, ctx_trk ) + test_AUCELL( file( params.sc.scenic.filteredloom ), ctx_mtf, ctx_trk ) break; - case "SC__SCENIC__AGGR_MULTI_RUNS_FEATURES": + case "AGGR_MULTI_RUNS_FEATURES": /* Aggregate motifs from multiple runs */ reg_mtf = Channel.fromPath(params.sc.scenic.scenicoutdir + "/cistarget/run_*/run_*__reg_mtf.csv") - SC__SCENIC__AGGR_MULTI_RUNS_FEATURES__MOTIF( reg_mtf.collect(), 'mtf' ) + AGGR_MULTI_RUNS_FEATURES__MOTIF( reg_mtf.collect(), 'mtf' ) if(params.sc.scenic.cistarget.trkDB) { /* Aggregate tracks from multiple runs */ reg_trk = Channel.fromPath(params.sc.scenic.scenicoutdir + "/cistarget/run_*/run_*__reg_trk.csv") - SC__SCENIC__AGGR_MULTI_RUNS_FEATURES__TRACK( reg_trk.collect(), 'trk' ) + AGGR_MULTI_RUNS_FEATURES__TRACK( reg_trk.collect(), 'trk' ) } break; - case "SC__SCENIC__AGGR_MULTI_RUNS_REGULONS": + case "AGGR_MULTI_RUNS_REGULONS": + /* Aggregate motif regulons from multiple runs */ auc_mtf_looms = Channel.fromPath(params.sc.scenic.scenicoutdir + "/aucell/run_*/run_*__auc_mtf.loom") - auc_trk_looms = Channel.fromPath(params.sc.scenic.scenicoutdir + "/aucell/run_*/run_*__auc_trk.loom") - test_SC__SCENIC__AGGR_MULTI_RUNS_REGULONS(auc_mtf_looms.collect(), auc_trk_looms.collect()) + AGGR_MULTI_RUNS_REGULONS__MOTIF( auc_mtf_looms.collect(), 'mtf' ) + if(params.sc.scenic.cistarget.trkDB) { + /* Aggregate track regulons from multiple runs */ + auc_trk_looms = Channel.fromPath(params.sc.scenic.scenicoutdir + "/aucell/run_*/run_*__auc_trk.loom") + AGGR_MULTI_RUNS_REGULONS__TRACK( auc_trk_looms.collect(), 'trk' ) + } break; - case "SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER": - mtf_regulons = file(params.sc.scenic.scenicoutdir + "/multi_runs_regulons_mtf") - trk_regulons = file(params.sc.scenic.scenicoutdir + "/multi_runs_regulons_trk") - test_SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER(file(params.sc.scenic.filteredloom), mtf_regulons, trk_regulons) + case "AUCELL_FROM_FOLDER": + /* Aggregate motif regulons from multiple runs */ + regulons_folder_mtf = file(params.sc.scenic.scenicoutdir + "/multi_runs_regulons_mtf") + AUCELL_FROM_FOLDER__MOTIF( file(params.sc.scenic.filteredloom), regulons_folder_mtf, 'mtf' ) + if(params.sc.scenic.cistarget.trkDB) { + /* Aggregate track regulons from multiple runs */ + regulons_folder_trk = file(params.sc.scenic.scenicoutdir + "/multi_runs_regulons_trk") + AUCELL_FROM_FOLDER__TRACK( file(params.sc.scenic.filteredloom), regulons_folder_trk, 'trk' ) + } break; - case "SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM": + case "SAVE_SCENIC_MULTI_RUNS_TO_LOOM_MOTIF": filteredloom = file(params.sc.scenic.filteredloom) - aggr_features_mtf = file(params.sc.scenic.scenicoutdir + "/multi_runs_cistarget/multi_runs_features_mtf.pickle") + aggr_features_mtf = file(params.sc.scenic.scenicoutdir + "/multi_runs_cistarget/multi_runs_features_mtf.csv.gz") regulons_folder_mtf = file(params.sc.scenic.scenicoutdir + "/multi_runs_regulons_mtf") regulons_auc_mtf = file(params.sc.scenic.scenicoutdir + "/multi_runs_aucell/multi_runs_regulons_auc_mtf.tsv") /* Save multiple motif SCENIC runs to loom*/ - SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM_MOTIF( + SAVE_SCENIC_MULTI_RUNS_TO_LOOM_MOTIF( filteredloom, aggr_features_mtf, regulons_folder_mtf, regulons_auc_mtf, 'mtf' ) - if(params.sc.scenic.cistarget.trkDB) { - regulons_folder_trk = file(params.sc.scenic.scenicoutdir + "/multi_runs_regulons_trk") - aggr_features_trk = file(params.sc.scenic.scenicoutdir + "/multi_runs_cistarget/multi_runs_features_trk.pickle") - regulons_auc_trk = file(params.sc.scenic.scenicoutdir + "/multi_runs_aucell/multi_runs_regulons_auc_trk.tsv") - /* Save multiple track SCENIC runs to loom*/ - SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM_TRACK( - filteredloom, - aggr_features_trk, - regulons_folder_trk, - regulons_auc_trk, - 'trk' - ) - } + break; + case "SAVE_SCENIC_MULTI_RUNS_TO_LOOM_TRACK": + filteredloom = file(params.sc.scenic.filteredloom) + regulons_folder_trk = file(params.sc.scenic.scenicoutdir + "/multi_runs_regulons_trk") + aggr_features_trk = file(params.sc.scenic.scenicoutdir + "/multi_runs_cistarget/multi_runs_features_trk.csv.gz") + regulons_auc_trk = file(params.sc.scenic.scenicoutdir + "/multi_runs_aucell/multi_runs_regulons_auc_trk.tsv") + /* Save multiple track SCENIC runs to loom*/ + SAVE_SCENIC_MULTI_RUNS_TO_LOOM_TRACK( + filteredloom, + aggr_features_trk, + regulons_folder_trk, + regulons_auc_trk, + 'trk' + ) + break; + case "MERGE_MOTIF_TRACK_LOOMS": + scenic_loom_mtf = file( params.sc.scenic.scenicoutdir + "/multi_runs_looms/multi_runs_regulons_auc_mtf.loom" ) + scenic_loom_trk = file( params.sc.scenic.scenicoutdir + "/multi_runs_looms/multi_runs_regulons_auc_trk.loom" ) + MERGE_MOTIF_TRACK_LOOMS( + scenic_loom_mtf, + scenic_loom_trk + ) + break; + case "VISUALIZE_PUBLISH": + /* Aggregate motif regulons from multiple runs */ + scenic_loom = file( params.sc.scenic.scenicoutdir + "/" + params.sc.scenic.scenicOutputLoom ) + PUBLISH_LOOM( VISUALIZE( scenic_loom ) ) break; default: throw new Exception("The test parameters should be specified.") diff --git a/src/scenic/multi_runs.Dockerfile b/src/scenic/multi_runs.Dockerfile index 844eaa1d..f33bffbb 100644 --- a/src/scenic/multi_runs.Dockerfile +++ b/src/scenic/multi_runs.Dockerfile @@ -10,8 +10,10 @@ RUN python -m venv /opt/venv # Make sure we use the virtualenv: ENV PATH="/opt/venv/bin:$PATH" -RUN git clone -b aucell_multi_runs_and_fix_empty_motif_data https://github.com/dweemx/pySCENIC.git && \ +RUN git clone -b aucell_multi_runs https://github.com/dweemx/pySCENIC.git && \ cd pySCENIC && \ + pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r requirements_docker.txt && \ pip install . FROM python:3.6.8-slim-stretch AS build-image @@ -19,6 +21,8 @@ RUN apt-get -y update && \ # Need to run ps apt-get -y install procps && \ apt-get -y install libxml2 && \ + # Need to run MulticoreTSNE + apt-get -y install libgomp1 && \ rm -rf /var/cache/apt/* && \ rm -rf /var/lib/apt/lists/* diff --git a/src/scenic/processes/aggregateMultiRunsFeatures.nf b/src/scenic/processes/aggregateMultiRunsFeatures.nf deleted file mode 100644 index b6aeb419..00000000 --- a/src/scenic/processes/aggregateMultiRunsFeatures.nf +++ /dev/null @@ -1,34 +0,0 @@ -nextflow.preview.dsl=2 - -if(!params.containsKey("test")) { - binDir = "${workflow.projectDir}/src/scenic/bin/" -} else { - binDir = "" -} - -process SC__SCENIC__AGGR_MULTI_RUNS_FEATURES { - cache 'deep' - container params.sc.scenic.container - publishDir "${params.sc.scenic.scenicoutdir}/multi_runs_cistarget/", mode: 'link', overwrite: true - // This process requires a large amount of memory especially for big datasets - // This process is quite slow (could take more than 1h for big datasets, so keep 24h for now) - clusterOptions "-l nodes=1:ppn=${params.sc.scenic.numWorkers} -l pmem=6gb -l walltime=24:00:00 -A ${params.global.qsubaccount}" - - input: - file f - val type - - output: - file "multi_runs_features_${type}.pickle" - - """ - ${binDir}aggregate_SCENIC_multi_runs_features.py \ - ${f} \ - --output "multi_runs_features_${type}.pickle" \ - --output-format "pickle" - """ -} - -/* options to implement: -*/ - diff --git a/src/scenic/processes/aucell.nf b/src/scenic/processes/aucell.nf index 23ec74d3..3e0c2f11 100644 --- a/src/scenic/processes/aucell.nf +++ b/src/scenic/processes/aucell.nf @@ -1,21 +1,19 @@ nextflow.preview.dsl=2 -process SC__SCENIC__AUCELL { +process AUCELL { cache 'deep' container params.sc.scenic.container publishDir "${params.sc.scenic.scenicoutdir}/aucell/${params.sc.scenic.numRuns > 1 ? "run_" + runId : ""}", mode: 'link', overwrite: true - clusterOptions "-l nodes=1:ppn=${params.sc.scenic.numWorkers} -l pmem=2gb -l walltime=24:00:00 -A ${params.global.qsubaccount}" + clusterOptions "-l nodes=1:ppn=${params.sc.scenic.numWorkers} -l pmem=${params.sc.scenic.aucell.pmem} -l walltime=24:00:00 -A ${params.global.qsubaccount}" maxForks params.sc.scenic.maxForks input: - val runId + tuple val(runId), file(regulons) file exprMat - file regulons val type output: - file "${params.sc.scenic.numRuns > 1 ? "run_" + runId +"__auc_" + type + ".loom": "auc_" + type + ".loom"}" - // file params.output + file("${params.sc.scenic.numRuns > 1 ? "run_" + runId +"__auc_" + type + ".loom": "auc_" + type + ".loom"}") """ pyscenic aucell \ diff --git a/src/scenic/processes/cistarget.nf b/src/scenic/processes/cistarget.nf index bb6becfa..608e90c4 100644 --- a/src/scenic/processes/cistarget.nf +++ b/src/scenic/processes/cistarget.nf @@ -1,22 +1,21 @@ nextflow.preview.dsl=2 -process SC__SCENIC__CISTARGET { +process CISTARGET { cache 'deep' container params.sc.scenic.container publishDir "${params.sc.scenic.scenicoutdir}/cistarget/${params.sc.scenic.numRuns > 1 ? "run_" + runId : ""}", mode: 'link', overwrite: true - clusterOptions "-l nodes=1:ppn=${params.sc.scenic.numWorkers} -l pmem=2gb -l walltime=24:00:00 -A ${params.global.qsubaccount}" + clusterOptions "-l nodes=1:ppn=${params.sc.scenic.numWorkers} -l pmem=${params.sc.scenic.cistarget.pmem} -l walltime=24:00:00 -A ${params.global.qsubaccount}" maxForks params.sc.scenic.maxForks input: - val runId + tuple val(runId), file("${params.sc.scenic.numRuns > 1 ? "run_" + runId +"__adj.tsv" : "adj.tsv"}") file filteredloom - file "${params.sc.scenic.numRuns > 1 ? "run_" + runId +"__adj.tsv" : "adj.tsv"}" file featherDB file annotation val type output: - file "${params.sc.scenic.numRuns > 1 ? "run_" + runId +"__reg_" + type + ".csv" : "reg_" + type + ".csv"}" + tuple val(runId), file("${params.sc.scenic.numRuns > 1 ? "run_" + runId +"__reg_" + type + ".csv" : "reg_" + type + ".csv"}") """ pyscenic ctx \ diff --git a/src/scenic/processes/grnboost2withoutDask.nf b/src/scenic/processes/grnboost2withoutDask.nf index 0f2d8098..cded66e5 100644 --- a/src/scenic/processes/grnboost2withoutDask.nf +++ b/src/scenic/processes/grnboost2withoutDask.nf @@ -6,7 +6,7 @@ if(!params.containsKey("test")) { binDir = "" } -process SC__SCENIC__GRNBOOST2_WITHOUT_DASK { +process GRNBOOST2_WITHOUT_DASK { cache 'deep' container params.sc.scenic.container publishDir "${params.sc.scenic.scenicoutdir}/grnboost2withoutDask/${params.sc.scenic.numRuns > 1 ? "run_" + runId : ""}", mode: 'link', overwrite: true @@ -19,7 +19,7 @@ process SC__SCENIC__GRNBOOST2_WITHOUT_DASK { file tfs output: - file "${params.sc.scenic.numRuns > 1 ? "run_" + runId +"__adj.tsv" : "adj.tsv"}" + tuple val(runId), file("${params.sc.scenic.numRuns > 1 ? "run_" + runId +"__adj.tsv" : "adj.tsv"}") script: """ diff --git a/src/scenic/processes/scenicLoomHandler.nf b/src/scenic/processes/loomHandler.nf similarity index 83% rename from src/scenic/processes/scenicLoomHandler.nf rename to src/scenic/processes/loomHandler.nf index 22760b51..f711fcbd 100644 --- a/src/scenic/processes/scenicLoomHandler.nf +++ b/src/scenic/processes/loomHandler.nf @@ -6,9 +6,9 @@ if(!params.containsKey("test")) { binDir = "" } -process SC__SCENIC__PUBLISH_LOOM { +process PUBLISH_LOOM { - publishDir "${params.sc.scenic.scenicoutdir}", mode: 'link', overwrite: true, saveAs: { filename -> params.sc.scenic.scenicOutputLoom } + publishDir "${params.sc.scenic.scenicoutdir}", mode: 'link', overwrite: true, saveAs: { filename -> params.sc.scenic.scenicScopeOutputLoom } input: file f @@ -21,7 +21,7 @@ process SC__SCENIC__PUBLISH_LOOM { } -process SC__SCENIC__VISUALIZE { +process VISUALIZE { cache 'deep' container params.sc.scenic.container clusterOptions "-l nodes=1:ppn=${params.sc.scenic.numWorkers} -l pmem=2gb -l walltime=1:00:00 -A ${params.global.qsubaccount}" @@ -41,7 +41,7 @@ process SC__SCENIC__VISUALIZE { } -process SC__SCENIC__MERGE_MOTIF_TRACK_LOOMS { +process MERGE_MOTIF_TRACK_LOOMS { cache 'deep' container params.sc.scenic.container publishDir "${params.sc.scenic.scenicoutdir}", mode: 'link', overwrite: true @@ -55,7 +55,7 @@ process SC__SCENIC__MERGE_MOTIF_TRACK_LOOMS { file params.sc.scenic.scenicOutputLoom """ - ${binDir}merge_SCENIC_motif_track_loom.py \ + ${binDir}merge_motif_track_loom.py \ --loom_motif ${motifloom} \ --loom_track ${trackloom} \ --loom_output ${params.sc.scenic.scenicOutputLoom} \ @@ -65,7 +65,7 @@ process SC__SCENIC__MERGE_MOTIF_TRACK_LOOMS { /* options to implement: */ -process SC__SCENIC__APPEND_SCENIC_LOOM { +process APPEND_SCENIC_LOOM { cache 'deep' container params.sc.scenic.container publishDir "${params.sc.scenic.scenicoutdir}", mode: 'link', overwrite: true @@ -78,7 +78,7 @@ process SC__SCENIC__APPEND_SCENIC_LOOM { file params.sc.scenic.scenicScopeOutputLoom """ - ${binDir}append_SCENIC_results_to_existing_loom.py \ + ${binDir}append_results_to_existing_loom.py \ --loom_scope ${scopeloom} \ --loom_scenic ${scenicloom} \ --loom_output ${params.sc.scenic.scenicScopeOutputLoom} \ diff --git a/src/scenic/processes/multiruns/aggregateFeatures.nf b/src/scenic/processes/multiruns/aggregateFeatures.nf new file mode 100644 index 00000000..11a9a7be --- /dev/null +++ b/src/scenic/processes/multiruns/aggregateFeatures.nf @@ -0,0 +1,42 @@ +nextflow.preview.dsl=2 + +if(!params.containsKey("test")) { + binDir = "${workflow.projectDir}/src/scenic/bin/" +} else { + binDir = "" +} + +process AGGR_MULTI_RUNS_FEATURES { + cache 'deep' + container params.sc.scenic.container + publishDir "${params.sc.scenic.scenicoutdir}/multi_runs_cistarget/", mode: 'link', overwrite: true + // In the case the chunking method is not used, this process requires a large amount of memory especially for big datasets + // This process is quite slow (could take more than 1h for big datasets, so keep 24h for now) + clusterOptions "-l nodes=1:ppn=${params.sc.scenic.numWorkers} -l pmem=2gb -l walltime=24:00:00 -A ${params.global.qsubaccount}" + + input: + file f + val type + + output: + file "multi_runs_features_${type}.${output_format_ext}${compression_ext}" + + script: + output_format = params.sc.scenic.aggregate_features.output_format + output_format_ext = output_format + if(output_format == 'pickle') { + output_format_ext = 'pkl' + } + compression = params.sc.scenic.aggregate_features.compression + compression_ext = '' + if(compression == 'gzip') { + compression_ext = '.gz' + } + """ + ${binDir}aggregate_multi_runs_features.py \ + ${f} \ + --output "multi_runs_features_${type}.${output_format_ext}${compression_ext}" \ + --output-format ${output_format} \ + --use-chunking ${params.sc.scenic.aggregate_features.use_chunking} + """ +} diff --git a/src/scenic/processes/aggregateMultiRunsRegulons.nf b/src/scenic/processes/multiruns/aggregateRegulons.nf similarity index 85% rename from src/scenic/processes/aggregateMultiRunsRegulons.nf rename to src/scenic/processes/multiruns/aggregateRegulons.nf index 1a5dbc08..eea9c96a 100644 --- a/src/scenic/processes/aggregateMultiRunsRegulons.nf +++ b/src/scenic/processes/multiruns/aggregateRegulons.nf @@ -6,7 +6,7 @@ if(!params.containsKey("test")) { binDir = "" } -process SC__SCENIC__AGGR_MULTI_RUNS_REGULONS { +process AGGR_MULTI_RUNS_REGULONS { cache 'deep' container params.sc.scenic.container publishDir "${params.sc.scenic.scenicoutdir}", mode: 'link', overwrite: true @@ -20,7 +20,7 @@ process SC__SCENIC__AGGR_MULTI_RUNS_REGULONS { file "multi_runs_regulons_${type}" """ - ${binDir}aggregate_SCENIC_multi_runs_regulons.py \ + ${binDir}aggregate_multi_runs_regulons.py \ ${f} \ --output "multi_runs_regulons_${type}" \ """ diff --git a/src/scenic/processes/aucellGeneSigsFromFolder.nf b/src/scenic/processes/multiruns/aucellFromFolder.nf similarity index 86% rename from src/scenic/processes/aucellGeneSigsFromFolder.nf rename to src/scenic/processes/multiruns/aucellFromFolder.nf index 21432968..4e16181d 100644 --- a/src/scenic/processes/aucellGeneSigsFromFolder.nf +++ b/src/scenic/processes/multiruns/aucellFromFolder.nf @@ -6,7 +6,7 @@ if(!params.containsKey("test")) { binDir = "" } -process SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER { +process AUCELL_FROM_FOLDER { cache 'deep' container params.sc.scenic.container publishDir "${params.sc.scenic.scenicoutdir}/multi_runs_aucell/", mode: 'link', overwrite: true @@ -22,14 +22,14 @@ process SC__SCENIC__AUCELL_GENESIGS_FROM_FOLDER { // file params.output """ - ${binDir}aucell_genesigs_from_folder.py \ + ${binDir}aucell_from_folder.py \ $exprMat \ $multiRunsAggrRegulonsFolder \ -o "multi_runs_regulons_auc_${type}.tsv" \ --min-genes ${params.sc.scenic.aucell.min_genes_regulon} \ --auc-threshold ${params.sc.scenic.aucell.auc_threshold} \ ${params.sc.scenic.aucell.containsKey('percentile_threshold') ? "--percentile-threshold " + params.sc.scenic.aucell.percentile_threshold : ""} \ - --gene-occurence-threshold ${params.sc.scenic.aucell.gene_occurence_threshold} \ + --min-regulon-gene-occurrence ${params.sc.scenic.aucell.min_regulon_gene_occurrence} \ --num-workers ${params.sc.scenic.numWorkers} \ --cell-id-attribute ${params.sc.scenic.cell_id_attribute} \ --gene-attribute ${params.sc.scenic.gene_attribute} diff --git a/src/scenic/processes/multiruns/convertMotifsToRegulons.nf b/src/scenic/processes/multiruns/convertMotifsToRegulons.nf new file mode 100644 index 00000000..5e58779c --- /dev/null +++ b/src/scenic/processes/multiruns/convertMotifsToRegulons.nf @@ -0,0 +1,36 @@ +nextflow.preview.dsl=2 + +// include getBaseName from '../../utils/files.nf' + +if(!params.containsKey("test")) { + binDir = "${workflow.projectDir}/src/scenic/bin/" +} else { + binDir = "" +} + +process CONVERT_MULTI_RUNS_FEATURES_TO_REGULONS { + cache 'deep' + container params.sc.scenic.container + publishDir "${params.sc.scenic.scenicoutdir}/multi_runs_cistarget/", mode: 'link', overwrite: true + // This process requires a large amount of memory especially for big datasets (force to use bigmem node) + // This process is quite slow (could take more than 1h for big datasets, so keep 24h for now) + clusterOptions "-l nodes=1:ppn=${params.sc.scenic.numWorkers} -l pmem=${params.sc.scenic.motifs_to_regulons.pmem} -l walltime=24:00:00 -A ${params.global.qsubaccount}" + + input: + file multiRunsAggrMotifEnrichmentTable + file multiRunsAggrRegulonsFolder + val type + + output: + file "multi_runs_regulons_${type}.pkl.gz" + + """ + ${binDir}convert_multi_runs_features_to_regulons.py \ + $multiRunsAggrMotifEnrichmentTable \ + $multiRunsAggrRegulonsFolder \ + -o "multi_runs_regulons_${type}.pkl.gz" + # --min-genes-regulon ${params.sc.scenic.aucell.min_genes_regulon} \ + # --min-regulon-gene-occurrence ${params.sc.scenic.aucell.min_regulon_gene_occurrence} + """ +} + diff --git a/src/scenic/processes/saveScenicMultiRunsToLoom.nf b/src/scenic/processes/multiruns/saveToLoom.nf similarity index 64% rename from src/scenic/processes/saveScenicMultiRunsToLoom.nf rename to src/scenic/processes/multiruns/saveToLoom.nf index 185c2d19..9cdeee61 100644 --- a/src/scenic/processes/saveScenicMultiRunsToLoom.nf +++ b/src/scenic/processes/multiruns/saveToLoom.nf @@ -6,18 +6,15 @@ if(!params.containsKey("test")) { binDir = "" } -process SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM { +process SAVE_MULTI_RUNS_TO_LOOM { cache 'deep' container params.sc.scenic.container publishDir "${params.sc.scenic.scenicoutdir}/multi_runs_looms/", mode: 'link', overwrite: true - // This process requires a large amount of memory especially for big datasets (force to use bigmem node) - // This process is quite slow (could take more than 1h for big datasets, so keep 24h for now) - clusterOptions "-l nodes=1:ppn=${params.sc.scenic.numWorkers} -l pmem=6gb -l walltime=24:00:00 -A ${params.global.qsubaccount}" + clusterOptions "-l nodes=1:ppn=${params.sc.scenic.numWorkers} -l pmem=${params.sc.scenic.save_to_loom.pmem} -l walltime=24:00:00 -A ${params.global.qsubaccount}" input: file exprMat - file multiRunsAggrMotifEnrichmentTable - file multiRunsAggrRegulonsFolder + file multiRunsAggrRegulons file multiRunsAggrRegulonsAUC val type @@ -26,14 +23,13 @@ process SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM { // file params.output """ - ${binDir}save_SCENIC_multi_runs_to_loom.py \ + ${binDir}save_multi_runs_to_loom.py \ $exprMat \ - $multiRunsAggrMotifEnrichmentTable \ - $multiRunsAggrRegulonsFolder \ + $multiRunsAggrRegulons \ $multiRunsAggrRegulonsAUC \ -o "multi_runs_regulons_auc_${type}.loom" \ --min-genes-regulon ${params.sc.scenic.aucell.min_genes_regulon} \ - --gene-occurence-threshold ${params.sc.scenic.aucell.gene_occurence_threshold} \ + --min-regulon-gene-occurrence ${params.sc.scenic.aucell.min_regulon_gene_occurrence} \ --cell-id-attribute ${params.sc.scenic.cell_id_attribute} \ --gene-attribute ${params.sc.scenic.gene_attribute} \ --title "${params.global.project_name} - pySCENIC (${type})" \ @@ -43,4 +39,3 @@ process SC__SCENIC__SAVE_SCENIC_MULTI_RUNS_TO_LOOM { --scope-tree-level-3 "${params.sc.scope.tree.level_3}" """ } - diff --git a/src/scenic/processes/reports.nf b/src/scenic/processes/reports.nf index 5d24ccef..4b37578f 100644 --- a/src/scenic/processes/reports.nf +++ b/src/scenic/processes/reports.nf @@ -4,7 +4,7 @@ nextflow.preview.dsl=2 takes a template ipynb and adata as input, outputs ipynb named by the value in ${report_title} */ -process SC__SCENIC__GENERATE_REPORT { +process GENERATE_REPORT { container params.sc.scenic.container publishDir "${params.outdir}/notebooks", mode: 'link', overwrite: true @@ -24,7 +24,7 @@ process SC__SCENIC__GENERATE_REPORT { """ } -process SC__SCENIC__REPORT_TO_HTML { +process REPORT_TO_HTML { container params.sc.scenic.container publishDir "${params.outdir}/notebooks", mode: 'link', overwrite: true diff --git a/src/scenic/scenic.config b/src/scenic/scenic.config index badf9171..4e7d2968 100644 --- a/src/scenic/scenic.config +++ b/src/scenic/scenic.config @@ -14,8 +14,8 @@ params { scenicOutputLoom = "SCENIC_output.loom" scenicScopeOutputLoom = "SCENIC_SCope_output.loom" // computation arguments: - numWorkers = "20" - maxForks = 10 + numWorkers = "2" + maxForks = 1 mode = "dask_multiprocessing" client_or_address = "" // loom file arguments: @@ -62,6 +62,8 @@ params { top_n_regulators = "5,10,50" min_genes = "20" // expression_mtx_fname = "" // uses params.scenic.filteredloom + // Resources settings + pmem = '2gb' } aucell { @@ -70,6 +72,8 @@ params { rank_threshold = "5000" auc_threshold = "0.05" nes_threshold = "3.0" + // Resources settings + pmem = '2gb' } } diff --git a/src/scenic/workflows/aggregateMultiRuns.nf b/src/scenic/workflows/aggregateMultiRuns.nf new file mode 100644 index 00000000..b7d63b49 --- /dev/null +++ b/src/scenic/workflows/aggregateMultiRuns.nf @@ -0,0 +1,70 @@ +// +// Version: +// Test: +// Command: +// +/* + * QC workflow + * Source: + * + * Steps considered: + * - Aggregating data from multiple motifs (or tracks) SCENIC runs to loom + */ + +nextflow.preview.dsl=2 + +////////////////////////////////////////////////////// +// process imports: + +include AGGR_MULTI_RUNS_FEATURES as AGGR_FEATURES from './../processes/multiruns/aggregateFeatures' params(params) +include AGGR_MULTI_RUNS_REGULONS as AGGR_REGULONS from './../processes/multiruns/aggregateRegulons' params(params) +include AUCELL_FROM_FOLDER as AUCELL from './../processes/multiruns/aucellFromFolder' params(params) +include CONVERT_MULTI_RUNS_FEATURES_TO_REGULONS as FEATURES_TO_REGULONS from './../processes/multiruns/convertMotifsToRegulons' params(params) +include SAVE_MULTI_RUNS_TO_LOOM as SAVE_TO_LOOM from './../processes/multiruns/saveToLoom' params(params) + +////////////////////////////////////////////////////// +// Define the workflow + +workflow AGGREGATE_MULTI_RUNS_TO_LOOM { + get: + filteredloom + ctx + auc + type + main: + /* Aggregate features (motifs or tracks) from multiple runs */ + ctx_aggr_features = AGGR_FEATURES( + ctx.map{ it -> it[1] }.collect(), + type + ) + + /* Aggregate regulons (motifs or tracks) from multiple runs */ + regulons_folder = AGGR_REGULONS( + auc.collect(), + type + ) + + /* Run AUCell on aggregated regulons */ + regulons_auc = AUCELL( + filteredloom, + regulons_folder, + type + ) + + /* Convert aggregated motif enrichment table to regulons */ + aggr_regulons = FEATURES_TO_REGULONS( + ctx_aggr_features, + regulons_folder, + type + ) + + /* Save multiple motifs (or tracks) SCENIC runs to loom */ + scenic_loom = SAVE_TO_LOOM( + filteredloom, + aggr_regulons, + regulons_auc, + type + ) + emit: + scenic_loom +}